001/* 002 * Copyright 2019 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.fcrepo.migration.handlers.ocfl; 018 019import at.favre.lib.bytes.Bytes; 020import com.google.common.base.Preconditions; 021import com.google.common.base.Strings; 022import com.google.common.collect.Sets; 023import org.apache.commons.codec.digest.DigestUtils; 024import org.apache.commons.io.IOUtils; 025import org.apache.commons.lang3.StringUtils; 026import org.apache.jena.datatypes.xsd.XSDDatatype; 027import org.apache.jena.rdf.model.Model; 028import org.apache.jena.rdf.model.ModelFactory; 029import org.apache.jena.riot.Lang; 030import org.apache.jena.riot.RDFDataMgr; 031import org.apache.tika.config.TikaConfig; 032import org.apache.tika.detect.Detector; 033import org.apache.tika.io.TikaInputStream; 034import org.apache.tika.metadata.Metadata; 035import org.apache.tika.mime.MimeType; 036import org.apache.tika.mime.MimeTypeException; 037import org.apache.tika.mime.MimeTypes; 038import org.fcrepo.migration.ContentDigest; 039import org.fcrepo.migration.DatastreamVersion; 040import org.fcrepo.migration.FedoraObjectVersionHandler; 041import org.fcrepo.migration.MigrationType; 042import org.fcrepo.migration.ObjectInfo; 043import org.fcrepo.migration.ObjectVersionReference; 044import org.fcrepo.migration.ResourceMigrationType; 045import org.fcrepo.storage.ocfl.InteractionModel; 046import 
org.fcrepo.storage.ocfl.OcflObjectSession; 047import org.fcrepo.storage.ocfl.OcflObjectSessionFactory; 048import org.fcrepo.storage.ocfl.ResourceHeaders; 049import org.fcrepo.storage.ocfl.ResourceHeadersVersion; 050import org.fcrepo.storage.ocfl.exception.NotFoundException; 051import org.slf4j.Logger; 052 053import java.io.BufferedInputStream; 054import java.io.ByteArrayInputStream; 055import java.io.ByteArrayOutputStream; 056import java.io.IOException; 057import java.io.InputStream; 058import java.io.UncheckedIOException; 059import java.net.URI; 060import java.nio.charset.StandardCharsets; 061import java.nio.file.Files; 062import java.security.DigestInputStream; 063import java.security.MessageDigest; 064import java.security.NoSuchAlgorithmException; 065import java.time.Instant; 066import java.time.OffsetDateTime; 067import java.time.ZoneOffset; 068import java.util.ArrayList; 069import java.util.HashMap; 070import java.util.HashSet; 071import java.util.Map; 072import java.util.Set; 073import java.util.concurrent.atomic.AtomicBoolean; 074 075import static org.slf4j.LoggerFactory.getLogger; 076 077/** 078 * Writes a Fedora object as a single ArchiveGroup. 079 * <p> 080 * All datastreams and object metadata from a fcrepo3 object are persisted to a 081 * single OCFL object (ArchiveGroup in fcrepo6 parlance). 082 * </p> 083 * <p> 084 * The contents of each datastream are written verbatim. No attempt is made to 085 * re-write the RELS-EXT to replace subjects and objects with their LDP 086 * counterparts. 
087 * </p> 088 * <p> 089 * Note: fedora-specific OCFL serialization features (such as redirects, 090 * container metadata, etc) is not fully defined yet, so are not included here 091 * 092 * @author apb@jhu.edu 093 */ 094public class ArchiveGroupHandler implements FedoraObjectVersionHandler { 095 096 private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class); 097 098 private static final String FCREPO_ROOT = "info:fedora/"; 099 private static final String FCRMETA_SUFFIX = "/fcr:metadata"; 100 101 private static final Map<String, String> externalHandlingMap = Map.of( 102 "E", "proxy", 103 "R", "redirect" 104 ); 105 106 private static final String INLINE_XML = "X"; 107 108 private static final String DS_INACTIVE = "I"; 109 private static final String DS_DELETED = "D"; 110 111 private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state"; 112 private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename"; 113 private static final String OBJ_INACTIVE = "Inactive"; 114 private static final String OBJ_DELETED = "Deleted"; 115 116 private static final String RELS_EXT = "RELS-EXT"; 117 private static final String RELS_INT = "RELS-INT"; 118 119 private final OcflObjectSessionFactory sessionFactory; 120 private final boolean addDatastreamExtensions; 121 private final boolean deleteInactive; 122 private final boolean foxmlFile; 123 private final MigrationType migrationType; 124 private final ResourceMigrationType resourceMigrationType; 125 private final String user; 126 private final String idPrefix; 127 private final Detector mimeDetector; 128 private final boolean disableChecksumValidation; 129 130 /** 131 * Create an ArchiveGroupHandler, 132 * 133 * @param sessionFactory 134 * OCFL session factory 135 * @param migrationType 136 * the type of migration to do 137 * @param resourceMigrationType 138 * how resources should be migrated 139 * @param addDatastreamExtensions 140 * true if datastreams 
should be written with file extensions 141 * @param deleteInactive 142 * true if inactive objects and datastreams should be migrated as deleted 143 * @param foxmlFile 144 * true if foxml file should be migrated as a whole file, instead of creating property files 145 * @param user 146 * the username to associated with the migrated resources 147 * @param idPrefix 148 * the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3) 149 * @param disableChecksumValidation 150 * if true, migrator should not try to verify that the datastream content matches Fedora 3 checksums 151 */ 152 public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory, 153 final MigrationType migrationType, 154 final ResourceMigrationType resourceMigrationType, 155 final boolean addDatastreamExtensions, 156 final boolean deleteInactive, 157 final boolean foxmlFile, 158 final String user, 159 final String idPrefix, 160 final boolean disableChecksumValidation) { 161 this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null"); 162 this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null"); 163 this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType, 164 "resourceMigrationType cannot be null"); 165 this.addDatastreamExtensions = addDatastreamExtensions; 166 this.deleteInactive = deleteInactive; 167 this.foxmlFile = foxmlFile; 168 this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank"); 169 this.idPrefix = idPrefix; 170 this.disableChecksumValidation = disableChecksumValidation; 171 try { 172 this.mimeDetector = new TikaConfig().getDetector(); 173 } catch (Exception e) { 174 throw new RuntimeException(e); 175 } 176 } 177 178 @Override 179 public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) { 180 // We use the PID to identify the OCFL object 181 final String objectId = 
objectInfo.getPid(); 182 final String f6ObjectId = idPrefix + objectId; 183 184 // We need to manually keep track of the datastream creation dates 185 final Map<String, String> dsCreateDates = new HashMap<>(); 186 187 String objectState = null; 188 final Map<String, String> datastreamStates = new HashMap<>(); 189 // tracks the triples used to create containers and binary descriptions 190 final Map<String, MetaHolder> metaMap = new HashMap<>(); 191 // tracks info about binary resources needed to construct filenames 192 final Map<String, BinaryMeta> binaryMeta = new HashMap<>(); 193 // tracks filenames pulled from RELS-INT 194 final Map<String, String> filenameMap = new HashMap<>(); 195 196 for (var ov : versions) { 197 // tracks the binary descriptions that need to be written 198 final Set<String> toWrite = new HashSet<>(); 199 // tracks the binaries that need their filename updated base on RELS-INT 200 final Set<String> relsFilenameUpdates = new HashSet<>(); 201 // tracks the binaries that need their filename updated based on a RELS-INT removal 202 final Map<String, String> relsDeletedFilenames = new HashMap<>(); 203 204 final var objectSession = newSession(f6ObjectId); 205 206 if (ov.isFirstVersion()) { 207 if (objectSession.containsResource(f6ObjectId)) { 208 throw new RuntimeException(f6ObjectId + " already exists!"); 209 } 210 objectState = getObjectState(ov, objectId); 211 // Object properties are written only once (as fcrepo3 object properties were unversioned). 
212 if (foxmlFile) { 213 try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) { 214 final var foxmlDsId = f6ObjectId + "/FOXML"; 215 final var headers = createHeaders(foxmlDsId, f6ObjectId, 216 InteractionModel.NON_RDF).build(); 217 objectSession.writeResource(headers, is); 218 //mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources() 219 datastreamStates.put(foxmlDsId, DS_DELETED); 220 } catch (IOException io) { 221 LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io); 222 throw new UncheckedIOException(io); 223 } 224 } else { 225 final var objectHeaders = createObjectHeaders(f6ObjectId, ov); 226 final var content = getObjTriples(ov, objectId); 227 final var meta = MetaHolder.fromContent(content, objectHeaders); 228 metaMap.put(f6ObjectId, meta); 229 objectSession.writeResource(meta.headers.build(), meta.constructTriples()); 230 } 231 } 232 233 final var datastreamSessions = new HashMap<String, OcflObjectSession>(); 234 235 // Write datastreams and their metadata 236 for (var dv : ov.listChangedDatastreams()) { 237 final var mimeType = resolveMimeType(dv); 238 final String dsId = dv.getDatastreamInfo().getDatastreamId(); 239 final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId); 240 final var datastreamFilename = lastPartFromId(f6DsId); 241 242 final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId, 243 k -> datastreamSession(f6DsId, objectSession)); 244 245 if (dv.isFirstVersionIn(ov.getObject())) { 246 dsCreateDates.put(dsId, dv.getCreated()); 247 datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState()); 248 } 249 final var createDate = dsCreateDates.get(dsId); 250 251 final var filename = resolveFilename(datastreamFilename, 252 dv.getLabel(), filenameMap.get(f6DsId), mimeType); 253 254 relsDeletedFilenames.remove(f6DsId); 255 256 final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId, 257 filename, 
mimeType, createDate); 258 259 binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel())); 260 261 if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) { 262 InputStream content = null; 263 // for plain OCFL migrations, write a file containing the external/redirect URL 264 if (migrationType == MigrationType.PLAIN_OCFL) { 265 content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8); 266 } 267 datastreamSession.writeResource(datastreamHeaders, content); 268 } else { 269 try (var contentStream = dv.getContent()) { 270 writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession); 271 } catch (final IOException e) { 272 throw new UncheckedIOException(e); 273 } 274 } 275 276 if (!foxmlFile) { 277 final var f6DescId = f6DescriptionId(f6DsId); 278 final var descriptionHeaders = createDescriptionHeaders(f6DsId, 279 datastreamHeaders); 280 final var descriptionTriples = getDsTriples(dv, f6DsId, createDate); 281 metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder()) 282 .setHeaders(descriptionHeaders) 283 .setContentTriples(descriptionTriples); 284 toWrite.add(f6DescId); 285 286 if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) { 287 final var triples = parseRdfXml(dv); 288 if (RELS_EXT.equals(dsId)) { 289 metaMap.get(f6ObjectId).setRelsTriples(triples); 290 toWrite.add(f6ObjectId); 291 } else { 292 final Map<String, Model> splitModels = splitRelsInt(triples); 293 final var oldIds = new HashSet<>(filenameMap.keySet()); 294 filenameMap.clear(); 295 296 splitModels.forEach((id, model) -> { 297 final var descId = f6DescriptionId(id); 298 metaMap.computeIfAbsent(descId, k -> new MetaHolder()) 299 .setRelsTriples(model); 300 toWrite.add(descId); 301 302 // Check to see if there are any file names that need updated 303 for (final var it = model.listStatements(); it.hasNext(); ) { 304 final var statement = it.next(); 305 if 
(DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) { 306 filenameMap.put(id, statement.getObject().toString()); 307 relsFilenameUpdates.add(id); 308 break; 309 } 310 } 311 }); 312 313 // The filename was set once but is no longer 314 final var deleted = Sets.difference(oldIds, filenameMap.keySet()); 315 deleted.forEach(id -> { 316 final var meta = binaryMeta.get(id); 317 if (meta != null) { 318 relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label, 319 null, meta.mimeType)); 320 } 321 }); 322 } 323 } 324 } 325 } 326 327 writeMeta(toWrite, metaMap, objectSession, datastreamSessions); 328 updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions); 329 330 LOGGER.debug("Committing object <{}>", f6ObjectId); 331 332 final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate()); 333 334 objectSession.versionCreationTimestamp(creationTimestamp); 335 objectSession.commit(); 336 337 if (resourceMigrationType == ResourceMigrationType.ATOMIC) { 338 datastreamSessions.forEach((id, session) -> { 339 LOGGER.debug("Committing object <{}>", id); 340 session.versionCreationTimestamp(creationTimestamp); 341 session.commit(); 342 }); 343 } 344 } 345 346 handleDeletedResources(f6ObjectId, objectState, datastreamStates); 347 } 348 349 /** 350 * Resolves the filename of the datastream based on the following precedence: 351 * 352 * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT 353 * 2. LABEL from datastream meta 354 * 3. Name of the datastream 355 * 356 * If extensions should be added, then an extension is picked based on the mime type. If the filename already 357 * includes a `.` then no extension is added. 
     *
     * @param dsName the name of the datastream
     * @param labelName the datastream's label
     * @param downloadName the download name from RELS-INT
     * @param mimeType the datastream's mime type
     * @return the resolved filename
     */
    private String resolveFilename(final String dsName,
                                   final String labelName,
                                   final String downloadName,
                                   final String mimeType) {
        String filename;
        if (StringUtils.isNotBlank(downloadName)) {
            filename = downloadName;
        } else if (StringUtils.isNotBlank(labelName)) {
            filename = labelName;
        } else {
            filename = dsName;
        }

        // Only append an extension when configured to and the name doesn't already contain a '.'
        if (addDatastreamExtensions
                && StringUtils.isNotBlank(mimeType)
                && !filename.contains(".")) {
            filename += getExtension(mimeType);
        }

        return filename;
    }

    /**
     * RDF resources are written after writing all other binaries in the version because they can be affected by
     * RELS-INT or RELS-EXT updates.
     *
     * @param toWrite the set of resources that should be written to this version
     * @param metaMap the map of all known rdf resources
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void writeMeta(final Set<String> toWrite,
                           final Map<String, MetaHolder> metaMap,
                           final OcflObjectSession objectSession,
                           final Map<String, OcflObjectSession> datastreamSessions) {
        for (final var id : toWrite) {
            final var meta = metaMap.get(id);

            if (meta.headers == null) {
                // This only happens if there's a RELS-INT that references a datastream before it exists.
                // Skip for now. The triples will be added once the datastream exists.
                continue;
            }

            // Strip the /fcr:metadata suffix to get the session keyed by the binary's id
            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                    k -> datastreamSession(k, objectSession));

            // Need to copy over the memento created date from the existing headers because it may have been updated
            // when a description's binary was updated
            if (migrationType == MigrationType.FEDORA_OCFL) {
                try {
                    final var existingHeaders = session.readHeaders(id);
                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
                } catch (NotFoundException e) {
                    // this just means the resource hasn't been written yet
                }
            }
            session.writeResource(meta.headers.build(), meta.constructTriples());
        }
    }

    /**
     * Rewrites the filename header of binaries whose RELS-INT downloadFilename was added, changed,
     * or removed in this version. Only applies to FEDORA_OCFL migrations, where filenames live in
     * resource headers rather than in the serialized triples.
     *
     * @param toUpdate ids of binaries whose filename came from RELS-INT in this version
     * @param filenameMap current id -> RELS-INT filename mapping
     * @param relsDeletedFilenames id -> fallback filename for binaries whose RELS-INT filename was removed
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void updateFilenames(final Set<String> toUpdate,
                                 final Map<String, String> filenameMap,
                                 final Map<String, String> relsDeletedFilenames,
                                 final OcflObjectSession objectSession,
                                 final Map<String, OcflObjectSession> datastreamSessions) {
        if (migrationType == MigrationType.FEDORA_OCFL) {
            toUpdate.forEach(id -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var filename = filenameMap.get(id);
                if (StringUtils.isNotBlank(filename)) {
                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                    session.writeHeaders(newHeaders);
                }
            });
            relsDeletedFilenames.forEach((id, filename) -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                session.writeHeaders(newHeaders);
            });
        }
    }

    /**
     * @param f3Digest the Fedora 3 content digest, may be null
     * @return true if the digest has both a non-blank type and a non-blank value
     */
    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
        return f3Digest != null && StringUtils.isNotBlank(f3Digest.getType()) &&
                StringUtils.isNotBlank(f3Digest.getDigest());
    }

    /**
     * Writes a datastream's content, optionally validating it against the Fedora 3 checksum.
     * Validation is skipped when disabled, for PLAIN_OCFL migrations, or when the Fedora 3
     * digest is missing/invalid (logged only for managed "M" datastreams). A digest mismatch
     * aborts the migration of this datastream with a RuntimeException.
     *
     * @param dv the datastream version being written
     * @param datastreamHeaders the headers to write with the content
     * @param contentStream the datastream content
     * @param session the session to write into
     * @throws IOException on stream failure
     */
    private void writeDatastreamContent(final DatastreamVersion dv,
                                        final ResourceHeaders datastreamHeaders,
                                        final InputStream contentStream,
                                        final OcflObjectSession session) throws IOException {
        if (disableChecksumValidation) {
            session.writeResource(datastreamHeaders, contentStream);
            return;
        }
        final var f3Digest = dv.getContentDigest();
        final var ocflObjectId = session.ocflObjectId();
        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
        if (fedora3DigestValid(f3Digest)) {
            try {
                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
                if (migrationType == MigrationType.PLAIN_OCFL) {
                    session.writeResource(datastreamHeaders, contentStream);
                } else {
                    // Digest is computed while streaming so the content is only read once
                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
                        session.writeResource(datastreamHeaders, digestStream);
                        final var expectedDigest = f3Digest.getDigest();
                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
                            throw new RuntimeException(msg);
                        }
                    }
                }
            } catch (final NoSuchAlgorithmException e) {
                // Unknown digest algorithm: fall back to writing without validation
                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
                        ocflObjectId, datastreamId, f3Digest.getType());
                LOGGER.warn(msg);
                session.writeResource(datastreamHeaders, contentStream);
            }
        } else {
            if (datastreamControlGroup.equalsIgnoreCase("M")) {
                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
                        ocflObjectId, datastreamId);
                LOGGER.warn(msg);
            }
            session.writeResource(datastreamHeaders, contentStream);
        }
    }

    /**
     * Applies deletions after all versions have been migrated: if the object itself is Deleted
     * (or Inactive with deleteInactive on) every datastream and the object are deleted; otherwise
     * only Deleted/Inactive datastreams are. Commits a final version only when something was
     * actually deleted, otherwise aborts the sessions.
     *
     * @param f6ObjectId the Fedora 6 object id
     * @param objectState the final Fedora 3 object state
     * @param datastreamStates map of f6 datastream id to final Fedora 3 state
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates) {
        final OcflObjectSession session = newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (hasDeletes.get()) {
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else {
                session.abort();
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.abort();
                    });
                }
            }
        } catch (RuntimeException e) {
            // NOTE(review): only the object session is aborted here; datastream sessions opened in
            // an ATOMIC migration are left un-aborted on failure — confirm whether that is intended.
            session.abort();
            throw e;
        }
    }

    /** @return the id of the binary description (fcr:metadata) resource for the given resource id */
    private String f6DescriptionId(final String f6ResourceId) {
        return f6ResourceId + FCRMETA_SUFFIX;
    }

    /** @return the final path segment of an id, i.e. the text after the last '/' */
    private String lastPartFromId(final String id) {
        return id.substring(id.lastIndexOf('/') + 1);
    }

    /** @return the Fedora 6 datastream id: the object id with the datastream id appended */
    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) {
        return f6ObjectId + "/" + datastreamId;
    }

    /**
     * Builds the baseline resource headers common to every migrated resource.
     *
     * @param id the resource id
     * @param parentId the parent resource id
     * @param model the LDP interaction model
     * @return a partially populated headers builder
     */
    private ResourceHeaders.Builder createHeaders(final String id,
                                                  final String parentId,
                                                  final InteractionModel model) {
        final var headers = ResourceHeaders.builder();
        headers.withHeadersVersion(ResourceHeadersVersion.V1_0);
        headers.withId(id);
        headers.withParent(parentId);
        headers.withInteractionModel(model.getUri());
        return headers;
    }

    /**
     * Builds the headers for the object container, pulling created/lastModified dates out of the
     * Fedora 3 object properties.
     *
     * @param f6ObjectId the Fedora 6 object id
     * @param ov the object version providing the properties
     * @return the populated headers builder
     */
    private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
        headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL);
        headers.withObjectRoot(true);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);

        ov.getObjectProperties().listProperties().forEach(p -> {
            if (p.getName().contains("lastModifiedDate")) {
                final var lastModified = Instant.parse(p.getValue());
                headers.withLastModifiedDate(lastModified);
                headers.withMementoCreatedDate(lastModified);
                // State token is the uppercase MD5 of the last-modified epoch millis
                headers.withStateToken(DigestUtils.md5Hex(
                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
            } else if (p.getName().contains("createdDate")) {
                headers.withCreatedDate(Instant.parse(p.getValue()));
            }
        });

        return headers;
    }

    /**
     * Builds the complete headers for a binary datastream version, including external handling,
     * content size, digests, mime type and state token.
     *
     * @param dv the datastream version
     * @param f6DsId the Fedora 6 datastream id
     * @param f6ObjectId the Fedora 6 object id
     * @param filename the resolved download filename
     * @param mime the resolved mime type
     * @param createDate the datastream's creation date (from its first version)
     * @return the built, immutable headers
     */
    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
                                                    final String f6DsId,
                                                    final String f6ObjectId,
                                                    final String filename,
                                                    final String mime,
                                                    final String createDate) {
        final var lastModified = Instant.parse(dv.getCreated());
        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(f6ObjectId);
        }
        headers.withFilename(filename);
        headers.withCreatedDate(Instant.parse(createDate));
        headers.withLastModifiedDate(lastModified);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);
        headers.withMementoCreatedDate(lastModified);

        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
            headers.withExternalHandling(
                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
            headers.withExternalUrl(dv.getExternalOrRedirectURL());
        }

        headers.withArchivalGroup(false);
        headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC);
        // Inline XML sizes from FOXML are not reliable, so skip the content size for "X" datastreams
        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
            headers.withContentSize(dv.getSize());
        }

        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
            final var digest = dv.getContentDigest();
            final var digests = new ArrayList<URI>();
            digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" +
                    digest.getDigest().toLowerCase()));
            headers.withDigests(digests);
        }

        headers.withMimeType(mime);
        headers.withStateToken(DigestUtils.md5Hex(
                String.valueOf(lastModified.toEpochMilli())).toUpperCase());

        return headers.build();
    }

    /**
     * Builds the headers for a binary description (fcr:metadata) resource, mirroring the
     * timestamps, users and state token of its binary's headers.
     *
     * @param f6DsId the binary's Fedora 6 id
     * @param datastreamHeaders the binary's headers to mirror
     * @return the populated headers builder
     */
    private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId,
                                                             final ResourceHeaders datastreamHeaders) {
        final var id = f6DescriptionId(f6DsId);
        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);

        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId());
        }
        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
        headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate());

        headers.withArchivalGroup(false);
        headers.withObjectRoot(false);
        headers.withStateToken(datastreamHeaders.getStateToken());

        return headers;
    }

    /**
     * Returns the datastream's declared mime type, or detects one from its content via Tika
     * when the declared type is missing/blank.
     *
     * @param dv the datastream version
     * @return a non-null mime type string
     */
    private String resolveMimeType(final DatastreamVersion dv) {
        String mime = dv.getMimeType();

        if (Strings.isNullOrEmpty(mime)) {
            final var meta = new Metadata();
            // Hint the detector with the datastream id as a resource name
            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
            try (var content = TikaInputStream.get(dv.getContent())) {
                mime = mimeDetector.detect(content, meta).toString();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        return mime;
    }

    /**
     * Deletes a datastream and its description, using the deletion style appropriate to the
     * migration type.
     *
     * @param id the datastream id
     * @param lastModified the deletion timestamp (FEDORA_OCFL only)
     * @param session the session to delete within
     */
    private void deleteDatastream(final String id,
                                  final Instant lastModified,
                                  final OcflObjectSession session) {
        if (migrationType == MigrationType.PLAIN_OCFL) {
            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
        } else {
            deleteF6MigratedResource(id, lastModified, session);
            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
        }
    }

    /**
     * FEDORA_OCFL delete: removes the content file but keeps headers, marked deleted with
     * updated timestamps (a Fedora 6 tombstone-style delete).
     *
     * @param id the resource id
     * @param lastModified the deletion timestamp
     * @param session the session to delete within
     */
    private void deleteF6MigratedResource(final String id,
                                          final Instant lastModified,
                                          final OcflObjectSession session) {
        LOGGER.debug("Deleting resource {}", id);
        final var headers = session.readHeaders(id);
        session.deleteContentFile(ResourceHeaders.builder(headers)
                .withDeleted(true)
                .withLastModifiedDate(lastModified)
                .withMementoCreatedDate(lastModified)
                .build());
    }

    /**
     * PLAIN_OCFL delete: removes the content file identified by minimal headers (id and
     * interaction model only — plain OCFL keeps no Fedora header files to update).
     *
     * @param id the resource id
     * @param interactionModel the resource's interaction model
     * @param session the session to delete within
     */
    private void deleteOcflMigratedResource(final String id,
                                            final InteractionModel interactionModel,
                                            final OcflObjectSession session) {
        LOGGER.debug("Deleting resource {}", id);
        session.deleteContentFile(ResourceHeaders.builder()
                .withId(id)
                .withInteractionModel(interactionModel.getUri())
                .build());
    }

    /**
     * @param ov the object version to inspect
     * @param pid the object pid, used in the error message
     * @return the object's state property value
     * @throws IllegalStateException if the object has no state property
     */
    private String getObjectState(final ObjectVersionReference ov, final String pid) {
        return ov.getObjectProperties().listProperties().stream()
                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
                .findFirst()
                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
                        pid)))
                .getValue();
    }

    // Get object-level triples
    private static Model getObjTriples(final ObjectVersionReference o, final String pid) {
        final Model triples = ModelFactory.createDefaultModel();
        final String uri = "info:fedora/" + pid;

        o.getObjectProperties().listProperties().forEach(p -> {
            // Properties with "Date" in the name are typed as xsd:dateTime, all others as strings
            if (p.getName().contains("Date")) {
                addDateLiteral(triples, uri, p.getName(), p.getValue());
            } else {
                addStringLiteral(triples, uri, p.getName(), p.getValue());
            }
        });

        return triples;
    }

    // Get datastream-level triples
    private Model getDsTriples(final DatastreamVersion dv,
                               final String f6DsId,
                               final String createDate) {
        final Model triples = ModelFactory.createDefaultModel();

        if (migrationType == MigrationType.PLAIN_OCFL) {
            // These triples are server managed in F6
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#created",
                    createDate);
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#lastModified",
                    dv.getCreated());
            addStringLiteral(triples,
                    f6DsId,
                    "http://purl.org/dc/terms/identifier",
                    dv.getDatastreamInfo().getDatastreamId());
            addStringLiteral(triples,
                    f6DsId,
                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
                    dv.getMimeType());
            addLongLiteral(triples,
                    f6DsId,
                    "http://www.loc.gov/premis/rdf/v1#size",
                    dv.getSize());

            if (dv.getContentDigest() != null) {
                addStringLiteral(triples,
                        f6DsId,
                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
                                dv.getContentDigest().getDigest().toLowerCase());
            }
        }

        addStringLiteral(triples,
                f6DsId,
                "http://purl.org/dc/terms/title",
                dv.getLabel());
        addStringLiteral(triples,
                f6DsId,
                "http://fedora.info/definitions/1/0/access/objState",
                dv.getDatastreamInfo().getState());
        addStringLiteral(triples,
                f6DsId,
                "http://www.loc.gov/premis/rdf/v1#formatDesignation",
                dv.getFormatUri());

        return triples;
    }

    /** Adds a plain string literal triple; skipped when the object value is null. */
    private static void addStringLiteral(final Model m,
                                         final String s,
                                         final String p,
                                         final String o) {
        if (o != null) {
            m.add(m.createResource(s), m.createProperty(p), o);
        }
    }

    /** Adds an xsd:dateTime typed literal triple; skipped when the date is null. */
    private static void addDateLiteral(final Model m,
                                       final String s,
                                       final String p,
                                       final String date) {
        if (date != null) {
            m.addLiteral(m.createResource(s),
                    m.createProperty(p),
                    m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
        }
    }

    /** Adds an xsd:long typed literal triple; -1 is the "unknown" sentinel and is skipped. */
    private static void addLongLiteral(final Model m,
                                       final String s,
                                       final String p,
                                       final long number) {
        if (number != -1) {
            m.addLiteral(m.createResource(s),
                    m.createProperty(p),
                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
        }
    }

    /**
     * @param mime any mimetype as String
     * @return extension associated with arg mime, return includes '.' in extension (.txt).
834 * ..Empty String if unrecognized mime 835 */ 836 private static String getExtension(final String mime) { 837 final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes(); 838 MimeType type; 839 try { 840 type = allTypes.forName(mime); 841 } catch (final MimeTypeException e) { 842 type = null; 843 } 844 845 if (type != null) { 846 return type.getExtension(); 847 } 848 849 LOGGER.warn("No mimetype found for '{}'", mime); 850 return ""; 851 } 852 853 private Model parseRdfXml(final DatastreamVersion datastreamVersion) { 854 final var model = ModelFactory.createDefaultModel(); 855 try (final var is = datastreamVersion.getContent()) { 856 RDFDataMgr.read(model, is, Lang.RDFXML); 857 return model; 858 } catch (Exception e) { 859 throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s", 860 datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(), 861 datastreamVersion.getDatastreamInfo().getDatastreamId()), e); 862 } 863 } 864 865 private Map<String, Model> splitRelsInt(final Model relsIntModel) { 866 final Map<String, Model> splitModels = new HashMap<>(); 867 for (final var it = relsIntModel.listStatements(); it.hasNext();) { 868 final var statement = it.next(); 869 final var id = statement.getSubject().getURI(); 870 final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel()); 871 model.add(statement); 872 } 873 return splitModels; 874 } 875 876 /** 877 * Creates a new session for the datastream when migrating as atomic resources, or returns the object session, 878 * when migrating as archival groups. 
879 * 880 * @param id the datastream's id in fedora 6 881 * @param objectSession the datastream's object session 882 * @return either a new datastream session or the object session 883 */ 884 private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) { 885 if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) { 886 return objectSession; 887 } else { 888 return newSession(id); 889 } 890 } 891 892 private OcflObjectSession newSession(final String id) { 893 return new OcflObjectSessionWrapper(sessionFactory.newSession(id)); 894 } 895 896 /** 897 * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content 898 * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from 899 * one of the RELS-* files. They are maintained separately because it's possible for them to be updated 900 * independently and we need to be able to construct the correct set of triples when one changes. 901 */ 902 private static class MetaHolder { 903 Model contentTriples; 904 Model relsTriples; 905 ResourceHeaders.Builder headers; 906 907 public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) { 908 return new MetaHolder(contentTriples, null, headers); 909 } 910 911 private MetaHolder() { 912 } 913 914 private MetaHolder(final Model contentTriples, 915 final Model relsTriples, 916 final ResourceHeaders.Builder headers) { 917 this.contentTriples = contentTriples; 918 this.relsTriples = relsTriples; 919 this.headers = headers; 920 } 921 922 /** 923 * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples. 
924 * 925 * @return n-triples input stream 926 */ 927 public InputStream constructTriples() { 928 final var output = new ByteArrayOutputStream(); 929 final var triples = ModelFactory.createDefaultModel(); 930 931 if (contentTriples != null) { 932 triples.add(contentTriples.listStatements()); 933 } 934 935 if (relsTriples != null) { 936 triples.add(relsTriples.listStatements()); 937 } 938 939 triples.write(output, Lang.NTRIPLES.getName()); 940 return new ByteArrayInputStream(output.toByteArray()); 941 } 942 943 public MetaHolder setHeaders(final ResourceHeaders.Builder headers) { 944 this.headers = headers; 945 return this; 946 } 947 948 public MetaHolder setContentTriples(final Model contentTriples) { 949 this.contentTriples = contentTriples; 950 return this; 951 } 952 953 public MetaHolder setRelsTriples(final Model relsTriples) { 954 this.relsTriples = relsTriples; 955 return this; 956 } 957 } 958 959 private static class BinaryMeta { 960 final String name; 961 final String mimeType; 962 final String label; 963 964 public BinaryMeta(final String name, final String mimeType, final String label) { 965 this.name = name; 966 this.mimeType = mimeType; 967 this.label = label; 968 } 969 } 970 971}