001/* 002 * Copyright 2019 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.fcrepo.migration.handlers.ocfl; 018 019import com.google.common.base.Preconditions; 020import com.google.common.base.Strings; 021import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; 022import com.hp.hpl.jena.rdf.model.Model; 023import com.hp.hpl.jena.rdf.model.ModelFactory; 024import org.apache.commons.codec.digest.DigestUtils; 025import org.apache.commons.io.IOUtils; 026import org.apache.tika.config.TikaConfig; 027import org.apache.tika.detect.Detector; 028import org.apache.tika.io.TikaInputStream; 029import org.apache.tika.metadata.Metadata; 030import org.apache.tika.mime.MimeType; 031import org.apache.tika.mime.MimeTypeException; 032import org.apache.tika.mime.MimeTypes; 033import org.fcrepo.migration.DatastreamVersion; 034import org.fcrepo.migration.FedoraObjectVersionHandler; 035import org.fcrepo.migration.MigrationType; 036import org.fcrepo.migration.ObjectVersionReference; 037import org.fcrepo.storage.ocfl.InteractionModel; 038import org.fcrepo.storage.ocfl.OcflObjectSession; 039import org.fcrepo.storage.ocfl.OcflObjectSessionFactory; 040import org.fcrepo.storage.ocfl.ResourceHeaders; 041import org.slf4j.Logger; 042 043import java.io.ByteArrayInputStream; 044import java.io.ByteArrayOutputStream; 045import java.io.IOException; 046import java.io.InputStream; 047import java.io.UncheckedIOException; 048import java.net.URI; 049import java.time.Instant; 050import java.time.OffsetDateTime; 051import java.util.ArrayList; 052import java.util.HashMap; 053import java.util.Map; 054import java.util.concurrent.atomic.AtomicBoolean; 055 056import static org.slf4j.LoggerFactory.getLogger; 057 058/** 059 * Writes a Fedora object as a single ArchiveGroup. 060 * <p> 061 * All datastreams and object metadata from a fcrepo3 object are persisted to a 062 * single OCFL object (ArchiveGroup in fcrepo6 parlance). 063 * </p> 064 * <p> 065 * The contents of each datastream are written verbatim. No attempt is made to 066 * re-write the RELS-EXT to replace subjects and objects with their LDP 067 * counterparts. 068 * </p> 069 * <p> 070 * Note: fedora-specific OCFL serialization features (such as redirects, 071 * container metadata, etc) is not fully defined yet, so are not included here 072 * 073 * @author apb@jhu.edu 074 */ 075public class ArchiveGroupHandler implements FedoraObjectVersionHandler { 076 077 private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class); 078 079 private static final String FCREPO_ROOT = "info:fedora/"; 080 081 private static final Map<String, String> externalHandlingMap = Map.of( 082 "E", "proxy", 083 "R", "redirect" 084 ); 085 086 private static final String INLINE_XML = "X"; 087 088 private static final String DS_INACTIVE = "I"; 089 private static final String DS_DELETED = "D"; 090 091 private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state"; 092 private static final String OBJ_INACTIVE = "Inactive"; 093 private static final String OBJ_DELETED = "Deleted"; 094 095 private final OcflObjectSessionFactory sessionFactory; 096 private final boolean addDatastreamExtensions; 097 private final boolean deleteInactive; 098 private final MigrationType migrationType; 099 private final String user; 100 private final Detector mimeDetector; 101 102 /** 103 * Create an ArchiveGroupHandler, 104 * 105 * @param sessionFactory 106 * OCFL session factory 107 * @param migrationType 108 * the type of migration to do 109 * @param addDatastreamExtensions 110 * true if datastreams should be written with file extensions 111 * @param deleteInactive 112 * true if inactive objects and datastreams should be migrated as deleted 113 * @param user 114 * the username to associated with the migrated resources 115 */ 116 public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory, 117 final MigrationType migrationType, 118 final boolean addDatastreamExtensions, 119 final boolean deleteInactive, 120 final String user) { 121 this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null"); 122 this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null"); 123 this.addDatastreamExtensions = addDatastreamExtensions; 124 this.deleteInactive = deleteInactive; 125 this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank"); 126 try { 127 this.mimeDetector = new TikaConfig().getDetector(); 128 } catch (Exception e) { 129 throw new RuntimeException(e); 130 } 131 } 132 133 @Override 134 public void processObjectVersions(final Iterable<ObjectVersionReference> versions) { 135 // We use the PID to identify the OCFL object 136 String objectId = null; 137 String f6ObjectId = null; 138 139 // We need to manually keep track of the datastream creation dates 140 final Map<String, String> dsCreateDates = new HashMap<>(); 141 142 String objectState = null; 143 final Map<String, String> datastreamStates = new HashMap<>(); 144 145 for (var ov : versions) { 146 if (ov.isFirstVersion()) { 147 objectId = ov.getObjectInfo().getPid(); 148 f6ObjectId = FCREPO_ROOT + objectId; 149 objectState = getObjectState(ov); 150 } 151 152 final OcflObjectSession session = sessionFactory.newSession(f6ObjectId); 153 154 // Object properties are written only once (as fcrepo3 object properties were unversioned). 155 if (ov.isFirstVersion()) { 156 writeObjectFiles(f6ObjectId, ov, session); 157 } 158 159 // Write datastreams and their metadata 160 for (var dv : ov.listChangedDatastreams()) { 161 final var mimeType = resolveMimeType(dv); 162 final String dsId = dv.getDatastreamInfo().getDatastreamId(); 163 final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId, mimeType); 164 final var datastreamFilename = lastPartFromId(f6DsId); 165 166 if (dv.isFirstVersionIn(ov.getObject())) { 167 dsCreateDates.put(dsId, dv.getCreated()); 168 datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState()); 169 } 170 final var createDate = dsCreateDates.get(dsId); 171 172 final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId, 173 datastreamFilename, mimeType, createDate); 174 175 if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) { 176 InputStream content = null; 177 // Write a file for external content only for plain OCFL migration 178 if (migrationType == MigrationType.PLAIN_OCFL) { 179 content = IOUtils.toInputStream(dv.getExternalOrRedirectURL()); 180 } 181 session.writeResource(datastreamHeaders, content); 182 } else { 183 try (var content = dv.getContent()) { 184 session.writeResource(datastreamHeaders, content); 185 } catch (final IOException e) { 186 throw new UncheckedIOException(e); 187 } 188 } 189 190 writeDescriptionFiles(f6DsId, datastreamFilename, createDate, datastreamHeaders, dv, session); 191 } 192 193 LOGGER.debug("Committing object <{}>", f6ObjectId); 194 195 session.versionCreationTimestamp(OffsetDateTime.parse(ov.getVersionDate())); 196 session.commit(); 197 } 198 199 handleDeletedResources(f6ObjectId, objectState, datastreamStates); 200 } 201 202 private void handleDeletedResources(final String f6ObjectId, 203 final String objectState, 204 final Map<String, String> datastreamStates) { 205 final OcflObjectSession session = sessionFactory.newSession(f6ObjectId); 206 207 try { 208 final var now = OffsetDateTime.now(); 209 final var hasDeletes = new AtomicBoolean(false); 210 211 if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) { 212 hasDeletes.set(true); 213 214 datastreamStates.keySet().forEach(f6DsId -> { 215 deleteDatastream(f6DsId, now.toInstant(), session); 216 }); 217 218 if (migrationType == MigrationType.PLAIN_OCFL) { 219 deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session); 220 } else { 221 deleteF6MigratedResource(f6ObjectId, now.toInstant(), session); 222 } 223 } else { 224 datastreamStates.forEach((f6DsId, state) -> { 225 if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) { 226 hasDeletes.set(true); 227 deleteDatastream(f6DsId, now.toInstant(), session); 228 } 229 }); 230 } 231 232 if (hasDeletes.get()) { 233 session.versionCreationTimestamp(now); 234 session.commit(); 235 } else { 236 session.abort(); 237 } 238 } catch (RuntimeException e) { 239 session.abort(); 240 throw e; 241 } 242 } 243 244 private void writeObjectFiles(final String f6ObjectId, 245 final ObjectVersionReference ov, 246 final OcflObjectSession session) { 247 final var objectHeaders = createObjectHeaders(f6ObjectId, ov); 248 final var content = getObjTriples(ov); 249 session.writeResource(objectHeaders, content); 250 } 251 252 private void writeDescriptionFiles(final String f6Dsid, 253 final String datastreamFilename, 254 final String createDate, 255 final ResourceHeaders datastreamHeaders, 256 final DatastreamVersion dv, 257 final OcflObjectSession session) { 258 final var descriptionHeaders = createDescriptionHeaders(f6Dsid, 259 datastreamFilename, 260 datastreamHeaders); 261 session.writeResource(descriptionHeaders, getDsTriples(dv, f6Dsid, createDate)); 262 } 263 264 private String f6DescriptionId(final String f6ResourceId) { 265 return f6ResourceId + "/fcr:metadata"; 266 } 267 268 private String lastPartFromId(final String id) { 269 return id.substring(id.lastIndexOf('/') + 1); 270 } 271 272 private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId, final String mimeType) { 273 var id = f6ObjectId + "/" + datastreamId; 274 275 if (addDatastreamExtensions && !Strings.isNullOrEmpty(mimeType)) { 276 id += getExtension(mimeType); 277 } 278 279 return id; 280 } 281 282 private ResourceHeaders.Builder createHeaders(final String id, 283 final String parentId, 284 final InteractionModel model) { 285 final var headers = ResourceHeaders.builder(); 286 headers.withId(id); 287 headers.withParent(parentId); 288 headers.withInteractionModel(model.getUri()); 289 return headers; 290 } 291 292 private ResourceHeaders createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) { 293 final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER); 294 headers.withArchivalGroup(true); 295 headers.withObjectRoot(true); 296 headers.withLastModifiedBy(user); 297 headers.withCreatedBy(user); 298 299 ov.getObjectProperties().listProperties().forEach(p -> { 300 if (p.getName().contains("lastModifiedDate")) { 301 final var lastModified = Instant.parse(p.getValue()); 302 headers.withLastModifiedDate(lastModified); 303 headers.withStateToken(DigestUtils.md5Hex( 304 String.valueOf(lastModified.toEpochMilli())).toUpperCase()); 305 } else if (p.getName().contains("createdDate")) { 306 headers.withCreatedDate(Instant.parse(p.getValue())); 307 } 308 }); 309 310 return headers.build(); 311 } 312 313 private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv, 314 final String f6DsId, 315 final String f6ObjectId, 316 final String filename, 317 final String mime, 318 final String createDate) { 319 final var lastModified = Instant.parse(dv.getCreated()); 320 final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF); 321 headers.withFilename(filename); 322 headers.withCreatedDate(Instant.parse(createDate)); 323 headers.withLastModifiedDate(lastModified); 324 headers.withLastModifiedBy(user); 325 headers.withCreatedBy(user); 326 327 if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) { 328 headers.withExternalHandling( 329 externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup())); 330 headers.withExternalUrl(dv.getExternalOrRedirectURL()); 331 } 332 333 headers.withArchivalGroup(false); 334 headers.withObjectRoot(false); 335 if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) { 336 headers.withContentSize(dv.getSize()); 337 } 338 339 if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) { 340 final var digest = dv.getContentDigest(); 341 final var digests = new ArrayList<URI>(); 342 digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" + digest.getDigest().toLowerCase())); 343 headers.withDigests(digests); 344 } 345 346 headers.withMimeType(mime); 347 headers.withStateToken(DigestUtils.md5Hex( 348 String.valueOf(lastModified.toEpochMilli())).toUpperCase()); 349 350 return headers.build(); 351 } 352 353 private ResourceHeaders createDescriptionHeaders(final String f6DsId, 354 final String filename, 355 final ResourceHeaders datastreamHeaders) { 356 final var id = f6DescriptionId(f6DsId); 357 final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION); 358 359 headers.withFilename(filename); 360 headers.withCreatedDate(datastreamHeaders.getCreatedDate()); 361 headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate()); 362 headers.withCreatedBy(datastreamHeaders.getCreatedBy()); 363 headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy()); 364 365 headers.withArchivalGroup(false); 366 headers.withObjectRoot(false); 367 headers.withStateToken(datastreamHeaders.getStateToken()); 368 369 return headers.build(); 370 } 371 372 private String resolveMimeType(final DatastreamVersion dv) { 373 String mime = dv.getMimeType(); 374 375 if (Strings.isNullOrEmpty(mime)) { 376 final var meta = new Metadata(); 377 meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId()); 378 try (var content = TikaInputStream.get(dv.getContent())) { 379 mime = mimeDetector.detect(content, meta).toString(); 380 } catch (IOException e) { 381 throw new UncheckedIOException(e); 382 } 383 } 384 385 return mime; 386 } 387 388 private void deleteDatastream(final String id, 389 final Instant lastModified, 390 final OcflObjectSession session) { 391 if (migrationType == MigrationType.PLAIN_OCFL) { 392 deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session); 393 deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session); 394 } else { 395 deleteF6MigratedResource(id, lastModified, session); 396 deleteF6MigratedResource(f6DescriptionId(id), lastModified, session); 397 } 398 } 399 400 private void deleteF6MigratedResource(final String id, 401 final Instant lastModified, 402 final OcflObjectSession session) { 403 LOGGER.debug("Deleting resource {}", id); 404 final var headers = session.readHeaders(id); 405 session.deleteContentFile(ResourceHeaders.builder(headers) 406 .withDeleted(true) 407 .withLastModifiedDate(lastModified) 408 .build()); 409 } 410 411 private void deleteOcflMigratedResource(final String id, 412 final InteractionModel interactionModel, 413 final OcflObjectSession session) { 414 LOGGER.debug("Deleting resource {}", id); 415 session.deleteContentFile(ResourceHeaders.builder() 416 .withId(id) 417 .withInteractionModel(interactionModel.getUri()) 418 .build()); 419 } 420 421 private String getObjectState(final ObjectVersionReference ov) { 422 return ov.getObjectProperties().listProperties().stream() 423 .filter(prop -> OBJ_STATE_PROP.equals(prop.getName())) 424 .findFirst() 425 .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information", 426 ov.getObjectInfo().getPid()))) 427 .getValue(); 428 } 429 430 // Get object-level triples 431 private static InputStream getObjTriples(final ObjectVersionReference o) { 432 final ByteArrayOutputStream out = new ByteArrayOutputStream(); 433 final Model triples = ModelFactory.createDefaultModel(); 434 final String uri = "info:fedora/" + o.getObjectInfo().getPid(); 435 436 o.getObjectProperties().listProperties().forEach(p -> { 437 if (p.getName().contains("Date")) { 438 addDateLiteral(triples, uri, p.getName(), p.getValue()); 439 } else { 440 addStringLiteral(triples, uri, p.getName(), p.getValue()); 441 } 442 }); 443 444 triples.write(out, "N-TRIPLES"); 445 return new ByteArrayInputStream(out.toByteArray()); 446 } 447 448 // Get datastream-level triples 449 private InputStream getDsTriples(final DatastreamVersion dv, 450 final String f6DsId, 451 final String createDate) { 452 final ByteArrayOutputStream out = new ByteArrayOutputStream(); 453 final Model triples = ModelFactory.createDefaultModel(); 454 455 if (migrationType == MigrationType.PLAIN_OCFL) { 456 // These triples are server managed in F6 457 addDateLiteral(triples, 458 f6DsId, 459 "http://fedora.info/definitions/v4/repository#created", 460 createDate); 461 addDateLiteral(triples, 462 f6DsId, 463 "http://fedora.info/definitions/v4/repository#lastModified", 464 dv.getCreated()); 465 addStringLiteral(triples, 466 f6DsId, 467 "http://purl.org/dc/terms/identifier", 468 dv.getDatastreamInfo().getDatastreamId()); 469 addStringLiteral(triples, 470 f6DsId, 471 "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType", 472 dv.getMimeType()); 473 addLongLiteral(triples, 474 f6DsId, 475 "http://www.loc.gov/premis/rdf/v1#size", 476 dv.getSize()); 477 478 if (dv.getContentDigest() != null) { 479 addStringLiteral(triples, 480 f6DsId, 481 "http://www.loc.gov/premis/rdf/v1#hasMessageDigest", 482 "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" + 483 dv.getContentDigest().getDigest().toLowerCase()); 484 } 485 } 486 487 addStringLiteral(triples, 488 f6DsId, 489 "http://purl.org/dc/terms/title", 490 dv.getLabel()); 491 addStringLiteral(triples, 492 f6DsId, 493 "http://fedora.info/definitions/1/0/access/objState", 494 dv.getDatastreamInfo().getState()); 495 addStringLiteral(triples, 496 f6DsId, 497 "http://www.loc.gov/premis/rdf/v1#formatDesignation", 498 dv.getFormatUri()); 499 500 triples.write(out, "N-TRIPLES"); 501 return new ByteArrayInputStream(out.toByteArray()); 502 } 503 504 private static void addStringLiteral(final Model m, 505 final String s, 506 final String p, 507 final String o) { 508 if (o != null) { 509 m.add(m.createResource(s), m.createProperty(p), o); 510 } 511 } 512 513 private static void addDateLiteral(final Model m, 514 final String s, 515 final String p, 516 final String date) { 517 if (date != null) { 518 m.addLiteral(m.createResource(s), 519 m.createProperty(p), 520 m.createTypedLiteral(date, XSDDatatype.XSDdateTime)); 521 } 522 } 523 524 private static void addLongLiteral(final Model m, 525 final String s, 526 final String p, 527 final long number) { 528 if (number != -1) { 529 m.addLiteral(m.createResource(s), 530 m.createProperty(p), 531 m.createTypedLiteral(number, XSDDatatype.XSDlong)); 532 } 533 } 534 535 /** 536 * @param mime any mimetype as String 537 * @return extension associated with arg mime, return includes '.' in extension (.txt). 538 * ..Empty String if unrecognized mime 539 */ 540 private static String getExtension(final String mime) { 541 final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes(); 542 MimeType type; 543 try { 544 type = allTypes.forName(mime); 545 } catch (final MimeTypeException e) { 546 type = null; 547 } 548 549 if (type != null) { 550 return type.getExtension(); 551 } 552 553 LOGGER.warn("No mimetype found for '{}'", mime); 554 return ""; 555 } 556 557}