001/* 002 * Copyright 2019 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.fcrepo.migration.handlers.ocfl; 018 019import at.favre.lib.bytes.Bytes; 020import com.google.common.base.Preconditions; 021import com.google.common.base.Strings; 022import com.google.common.collect.Sets; 023import org.apache.commons.codec.digest.DigestUtils; 024import org.apache.commons.io.IOUtils; 025import org.apache.commons.lang3.StringUtils; 026import org.apache.jena.datatypes.xsd.XSDDatatype; 027import org.apache.jena.rdf.model.Model; 028import org.apache.jena.rdf.model.ModelFactory; 029import org.apache.jena.riot.Lang; 030import org.apache.jena.riot.RDFDataMgr; 031import org.apache.tika.config.TikaConfig; 032import org.apache.tika.detect.Detector; 033import org.apache.tika.io.TikaInputStream; 034import org.apache.tika.metadata.Metadata; 035import org.apache.tika.mime.MimeType; 036import org.apache.tika.mime.MimeTypeException; 037import org.apache.tika.mime.MimeTypes; 038import org.fcrepo.migration.ContentDigest; 039import org.fcrepo.migration.DatastreamVersion; 040import org.fcrepo.migration.FedoraObjectVersionHandler; 041import org.fcrepo.migration.MigrationType; 042import org.fcrepo.migration.ObjectInfo; 043import org.fcrepo.migration.ObjectVersionReference; 044import org.fcrepo.migration.ResourceMigrationType; 045import org.fcrepo.storage.ocfl.InteractionModel; 046import 
org.fcrepo.storage.ocfl.OcflObjectSession; 047import org.fcrepo.storage.ocfl.OcflObjectSessionFactory; 048import org.fcrepo.storage.ocfl.ResourceHeaders; 049import org.fcrepo.storage.ocfl.ResourceHeadersVersion; 050import org.fcrepo.storage.ocfl.exception.NotFoundException; 051import org.slf4j.Logger; 052 053import java.io.BufferedInputStream; 054import java.io.ByteArrayInputStream; 055import java.io.ByteArrayOutputStream; 056import java.io.IOException; 057import java.io.InputStream; 058import java.io.UncheckedIOException; 059import java.net.URI; 060import java.nio.charset.StandardCharsets; 061import java.nio.file.Files; 062import java.security.DigestInputStream; 063import java.security.MessageDigest; 064import java.security.NoSuchAlgorithmException; 065import java.time.Instant; 066import java.time.OffsetDateTime; 067import java.time.ZoneOffset; 068import java.util.ArrayList; 069import java.util.HashMap; 070import java.util.HashSet; 071import java.util.Map; 072import java.util.Set; 073import java.util.concurrent.atomic.AtomicBoolean; 074 075import static org.slf4j.LoggerFactory.getLogger; 076 077/** 078 * Writes a Fedora object as a single ArchiveGroup. 079 * <p> 080 * All datastreams and object metadata from a fcrepo3 object are persisted to a 081 * single OCFL object (ArchiveGroup in fcrepo6 parlance). 082 * </p> 083 * <p> 084 * The contents of each datastream are written verbatim. No attempt is made to 085 * re-write the RELS-EXT to replace subjects and objects with their LDP 086 * counterparts. 
 * </p>
 * <p>
 * Note: fedora-specific OCFL serialization features (such as redirects,
 * container metadata, etc) are not fully defined yet, so are not included here
 *
 * @author apb@jhu.edu
 */
public class ArchiveGroupHandler implements FedoraObjectVersionHandler {

    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);

    // Root "parent" id used for object containers, and the fcrepo3-style id prefix default.
    private static final String FCREPO_ROOT = "info:fedora/";
    // Suffix appended to a binary's id to form its fcrepo6 description resource id.
    private static final String FCRMETA_SUFFIX = "/fcr:metadata";

    // fcrepo3 control group -> fcrepo6 external-content handling mode.
    private static final Map<String, String> externalHandlingMap = Map.of(
            "E", "proxy",
            "R", "redirect"
    );

    // fcrepo3 control group for inline XML datastreams (size header is unreliable for these).
    private static final String INLINE_XML = "X";

    // fcrepo3 datastream state codes.
    private static final String DS_INACTIVE = "I";
    private static final String DS_DELETED = "D";

    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
    private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename";
    // fcrepo3 object state values.
    private static final String OBJ_INACTIVE = "Inactive";
    private static final String OBJ_DELETED = "Deleted";

    // Reserved fcrepo3 relationship datastreams that feed container/description triples.
    private static final String RELS_EXT = "RELS-EXT";
    private static final String RELS_INT = "RELS-INT";

    private final OcflObjectSessionFactory sessionFactory;
    private final boolean addDatastreamExtensions;
    private final boolean deleteInactive;
    private final boolean foxmlFile;
    private final MigrationType migrationType;
    private final ResourceMigrationType resourceMigrationType;
    private final String user;
    private final String idPrefix;
    private final Detector mimeDetector;
    private final boolean headOnly;
    private final boolean disableChecksumValidation;

    /**
     * Create an ArchiveGroupHandler.
     *
     * @param sessionFactory
     *            OCFL session factory
     * @param migrationType
     *            the type of migration to do
     * @param resourceMigrationType
     *            how resources should be migrated
     * @param addDatastreamExtensions
     *            true if datastreams should be written with file extensions
     * @param deleteInactive
     *            true if inactive objects and datastreams should be migrated as deleted
     * @param foxmlFile
     *            true if foxml file should be migrated as a whole file, instead of creating property files
     * @param user
     *            the username to associate with the migrated resources
     * @param idPrefix
     *            the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3)
     * @param headOnly
     *            flag to enable head only migrations
     * @param disableChecksumValidation
     *            true to skip validating datastream content against the fcrepo3 content digest
     */
    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
                               final MigrationType migrationType,
                               final ResourceMigrationType resourceMigrationType,
                               final boolean addDatastreamExtensions,
                               final boolean deleteInactive,
                               final boolean foxmlFile,
                               final String user,
                               final String idPrefix,
                               final boolean headOnly,
                               final boolean disableChecksumValidation) {
        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
        this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType,
                "resourceMigrationType cannot be null");
        this.addDatastreamExtensions = addDatastreamExtensions;
        this.deleteInactive = deleteInactive;
        this.foxmlFile = foxmlFile;
        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
        this.idPrefix = idPrefix;
        this.headOnly = headOnly;
        this.disableChecksumValidation = disableChecksumValidation;
        try {
            // TikaConfig construction can fail on classpath/config problems; fail fast.
            this.mimeDetector = new TikaConfig().getDetector();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Migrates every version of a fcrepo3 object into OCFL, writing the object container,
     * each changed datastream, its description triples, and finally processing deletions.
     */
    @Override
    public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) {
        // We use the PID to identify the OCFL object
        final String objectId = objectInfo.getPid();
        final String f6ObjectId = idPrefix + objectId;

        // We need to manually keep track of the datastream creation dates
        final Map<String, String> dsCreateDates = new HashMap<>();

        String objectState = null;
        OffsetDateTime objectCreation = null;
        OcflObjectSession objectSession = null;

        // fcrepo3 state code per datastream id, used later to decide deletions
        final Map<String, String> datastreamStates = new HashMap<>();
        // tracks the triples used to create containers and binary descriptions
        final Map<String, MetaHolder> metaMap = new HashMap<>();
        // tracks info about binary resources needed to construct filenames
        final Map<String, BinaryMeta> binaryMeta = new HashMap<>();
        // tracks filenames pulled from RELS-INT
        final Map<String, String> filenameMap = new HashMap<>();

        for (var ov : versions) {
            // tracks the binary descriptions that need to be written
            final Set<String> toWrite = new HashSet<>();
            // tracks the binaries that need their filename updated based on RELS-INT
            final Set<String> relsFilenameUpdates = new HashSet<>();
            // tracks the binaries that need their filename updated based on a RELS-INT removal
            final Map<String, String> relsDeletedFilenames = new HashMap<>();

            // reuse the objectSession when headOnly is set
            objectSession = (objectSession == null || !headOnly) ? newSession(f6ObjectId) : objectSession;

            if (ov.isFirstVersion()) {
                if (objectSession.containsResource(f6ObjectId)) {
                    throw new RuntimeException(f6ObjectId + " already exists!");
                }
                objectCreation = OffsetDateTime.parse(ov.getVersionDate());
                objectState = getObjectState(ov, objectId);
                // Object properties are written only once (as fcrepo3 object properties were unversioned).
                if (foxmlFile) {
                    try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) {
                        final var foxmlDsId = f6ObjectId + "/FOXML";
                        final var headers = createHeaders(foxmlDsId, f6ObjectId,
                                InteractionModel.NON_RDF).build();
                        objectSession.writeResource(headers, is);
                        // mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources()
                        datastreamStates.put(foxmlDsId, DS_DELETED);
                    } catch (IOException io) {
                        // NOTE(review): exception is string-concatenated rather than passed as the logger's
                        // throwable argument; the stack trace only survives via the rethrow below.
                        LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io);
                        throw new UncheckedIOException(io);
                    }
                } else {
                    final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
                    final var content = getObjTriples(ov, objectId);
                    final var meta = MetaHolder.fromContent(content, objectHeaders);
                    metaMap.put(f6ObjectId, meta);
                    objectSession.writeResource(meta.headers.build(), meta.constructTriples());
                }
            }

            final var datastreamSessions = new HashMap<String, OcflObjectSession>();

            // Write datastreams and their metadata
            for (var dv : ov.listChangedDatastreams()) {
                final var mimeType = resolveMimeType(dv);
                final String dsId = dv.getDatastreamInfo().getDatastreamId();
                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId);
                final var datastreamFilename = lastPartFromId(f6DsId);

                final var datastreamSession = datastreamSession(f6DsId, objectSession);
                datastreamSessions.putIfAbsent(f6DsId, datastreamSession);

                if (dv.isFirstVersionIn(ov.getObject())) {
                    dsCreateDates.put(dsId, dv.getCreated());
                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
                }

                final var createDate = dsCreateDates.get(dsId);

                // Precedence: RELS-INT download name, then label, then datastream name (see resolveFilename).
                final var filename = resolveFilename(datastreamFilename,
                        dv.getLabel(), filenameMap.get(f6DsId), mimeType);

                // A rewritten datastream supersedes any pending RELS-INT filename removal.
                relsDeletedFilenames.remove(f6DsId);

                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
                        filename, mimeType, createDate);

                binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel()));

                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
                    InputStream content = null;
                    // for plain OCFL migrations, write a file containing the external/redirect URL
                    if (migrationType == MigrationType.PLAIN_OCFL) {
                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8);
                    }
                    datastreamSession.writeResource(datastreamHeaders, content);
                } else {
                    try (var contentStream = dv.getContent()) {
                        writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }

                if (!foxmlFile) {
                    final var f6DescId = f6DescriptionId(f6DsId);
                    final var descriptionHeaders = createDescriptionHeaders(f6DsId,
                            datastreamHeaders);
                    final var descriptionTriples = getDsTriples(dv, f6DsId, createDate);
                    metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder())
                            .setHeaders(descriptionHeaders)
                            .setContentTriples(descriptionTriples);
                    toWrite.add(f6DescId);

                    if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) {
                        final var triples = parseRdfXml(dv);
                        if (RELS_EXT.equals(dsId)) {
                            // RELS-EXT triples belong to the object container itself.
                            metaMap.get(f6ObjectId).setRelsTriples(triples);
                            toWrite.add(f6ObjectId);
                        } else {
                            // RELS-INT triples are split by subject and attached to each binary's description.
                            final Map<String, Model> splitModels = splitRelsInt(triples);
                            final var oldIds = new HashSet<>(filenameMap.keySet());
                            filenameMap.clear();

                            splitModels.forEach((id, model) -> {
                                final var descId = f6DescriptionId(id);
                                metaMap.computeIfAbsent(descId, k -> new MetaHolder())
                                        .setRelsTriples(model);
                                toWrite.add(descId);

                                // Check to see if there are any file names that need updating
                                for (final var it = model.listStatements(); it.hasNext(); ) {
                                    final var statement = it.next();
                                    if (DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) {
                                        filenameMap.put(id, statement.getObject().toString());
                                        relsFilenameUpdates.add(id);
                                        break;
                                    }
                                }
                            });

                            // The filename was set once but is no longer
                            final var deleted = Sets.difference(oldIds, filenameMap.keySet());
                            deleted.forEach(id -> {
                                final var meta = binaryMeta.get(id);
                                if (meta != null) {
                                    // Fall back to label/name resolution now that the RELS-INT name is gone.
                                    relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label,
                                            null, meta.mimeType));
                                }
                            });
                        }
                    }
                }
            }

            // RDF resources are written last so RELS-* updates in this version are reflected.
            writeMeta(toWrite, metaMap, objectSession, datastreamSessions);
            updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions);

            if (!headOnly) {
                LOGGER.debug("Committing object <{}>", f6ObjectId);

                final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate());

                objectSession.versionCreationTimestamp(creationTimestamp);
                objectSession.commit();

                // Atomic migrations commit each datastream's own OCFL object as well.
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, session) -> {
                        LOGGER.debug("Committing object <{}>", id);
                        session.versionCreationTimestamp(creationTimestamp);
                        session.commit();
                    });
                }
            }
        }

        handleDeletedResources(f6ObjectId, objectState, datastreamStates, objectSession);

        // final commit when headOnly is set
        if (headOnly && objectSession != null) {
            LOGGER.debug("Committing object <{}>", f6ObjectId);
            objectSession.versionCreationTimestamp(objectCreation);
            objectSession.commit();
        }
    }

    /**
     * Resolves the filename of the datastream based on the following precedence:
     *
     * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT
     * 2. LABEL from datastream meta
     * 3. Name of the datastream
     *
     * If extensions should be added, then an extension is picked based on the mime type. If the filename already
     * includes a `.` then no extension is added.
     *
     * @param dsName the name of the datastream
     * @param labelName the datastream's label
     * @param downloadName the download name from RELS-INT
     * @param mimeType the datastream's mime type
     * @return the resolved filename
     */
    private String resolveFilename(final String dsName,
                                   final String labelName,
                                   final String downloadName,
                                   final String mimeType) {
        String filename;
        if (StringUtils.isNotBlank(downloadName)) {
            filename = downloadName;
        } else if (StringUtils.isNotBlank(labelName)) {
            filename = labelName;
        } else {
            filename = dsName;
        }

        if (addDatastreamExtensions
                && StringUtils.isNotBlank(mimeType)
                && !filename.contains(".")) {
            filename += getExtension(mimeType);
        }

        return filename;
    }

    /**
     * RDF resources are written after writing all other binaries in the version because they can be affected by
     * RELS-INT or RELS-EXT updates.
     *
     * @param toWrite the set of resources that should be written to this version
     * @param metaMap the map of all known rdf resources
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void writeMeta(final Set<String> toWrite,
                           final Map<String, MetaHolder> metaMap,
                           final OcflObjectSession objectSession,
                           final Map<String, OcflObjectSession> datastreamSessions) {
        for (final var id : toWrite) {
            final var meta = metaMap.get(id);

            if (meta.headers == null) {
                // This only happens if there's a RELS-INT that references a datastream before it exists.
                // Skip for now. The triples will be added once the datastream exists.
                continue;
            }

            // Sessions are keyed by the binary's id, so strip the /fcr:metadata suffix.
            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                    k -> datastreamSession(k, objectSession));

            // Need to copy over the memento created date from the existing headers because it may have been updated
            // when a description's binary was updated
            if (migrationType == MigrationType.FEDORA_OCFL) {
                try {
                    final var existingHeaders = session.readHeaders(id);
                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
                } catch (NotFoundException e) {
                    // this just means the resource hasn't been written yet
                }
            }
            session.writeResource(meta.headers.build(), meta.constructTriples());
        }
    }

    /**
     * Rewrites binary filename headers that were added or removed via RELS-INT downloadFilename
     * triples. Only applies to FEDORA_OCFL migrations (plain OCFL has no fcrepo headers).
     *
     * @param toUpdate ids whose filename should be set from the RELS-INT value
     * @param filenameMap current RELS-INT filename per id
     * @param relsDeletedFilenames ids whose RELS-INT filename was removed, mapped to the fallback filename
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void updateFilenames(final Set<String> toUpdate,
                                 final Map<String, String> filenameMap,
                                 final Map<String, String> relsDeletedFilenames,
                                 final OcflObjectSession objectSession,
                                 final Map<String, OcflObjectSession> datastreamSessions) {
        if (migrationType == MigrationType.FEDORA_OCFL) {
            toUpdate.forEach(id -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var filename = filenameMap.get(id);
                if (StringUtils.isNotBlank(filename)) {
                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                    session.writeHeaders(newHeaders);
                }
            });
            relsDeletedFilenames.forEach((id, filename) -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                session.writeHeaders(newHeaders);
            });
        }
    }

    /**
     * @param f3Digest the fcrepo3 content digest, may be null
     * @return true when the digest has both a non-blank type and non-blank value
     */
    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
        return f3Digest != null && StringUtils.isNotBlank(f3Digest.getType()) &&
                StringUtils.isNotBlank(f3Digest.getDigest());
    }

    /**
     * Writes a datastream's content, validating it against the fcrepo3 digest when possible.
     * Validation is skipped when disabled, for PLAIN_OCFL migrations, for unknown digest
     * algorithms, and for missing/invalid digests (the write still happens in all cases).
     *
     * @param dv the datastream version being written
     * @param datastreamHeaders the headers to write with the content
     * @param contentStream the datastream content
     * @param session the session to write into
     * @throws IOException on stream failure
     */
    private void writeDatastreamContent(final DatastreamVersion dv,
                                        final ResourceHeaders datastreamHeaders,
                                        final InputStream contentStream,
                                        final OcflObjectSession session) throws IOException {
        if (disableChecksumValidation) {
            session.writeResource(datastreamHeaders, contentStream);
            return;
        }
        final var f3Digest = dv.getContentDigest();
        final var ocflObjectId = session.ocflObjectId();
        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
        if (fedora3DigestValid(f3Digest)) {
            try {
                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
                if (migrationType == MigrationType.PLAIN_OCFL) {
                    session.writeResource(datastreamHeaders, contentStream);
                } else {
                    // Digest is computed while streaming so the content is only read once.
                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
                        session.writeResource(datastreamHeaders, digestStream);
                        final var expectedDigest = f3Digest.getDigest();
                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
                            throw new RuntimeException(msg);
                        }
                    }
                }
            } catch (final NoSuchAlgorithmException e) {
                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
                        ocflObjectId, datastreamId, f3Digest.getType());
                LOGGER.warn(msg);
                session.writeResource(datastreamHeaders, contentStream);
            }
        } else {
            // Only warn for managed ("M") datastreams; other control groups routinely lack digests.
            if (datastreamControlGroup.equalsIgnoreCase("M")) {
                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
                        ocflObjectId, datastreamId);
                LOGGER.warn(msg);
            }
            session.writeResource(datastreamHeaders, contentStream);
        }
    }

    /**
     * Applies deletions after all versions have been written: if the object itself is deleted
     * (or inactive with deleteInactive set), every datastream and the object are deleted;
     * otherwise only the individually deleted/inactive datastreams are. Commits (or aborts,
     * when nothing was deleted) its own session unless running headOnly.
     *
     * @param f6ObjectId the fcrepo6 object id
     * @param objectState the fcrepo3 object state ("Deleted", "Inactive", ...)
     * @param datastreamStates fcrepo3 state code per fcrepo6 datastream id
     * @param objectSession the session to reuse when headOnly is set
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates,
                                        final OcflObjectSession objectSession) {
        final OcflObjectSession session = headOnly ? objectSession : newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (!headOnly && hasDeletes.get()) {
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else if (!headOnly) {
                session.abort();
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.abort();
                    });
                }
            }
        } catch (RuntimeException e) {
            // NOTE(review): only the object session is aborted here; any datastream sessions
            // opened above are left un-aborted on failure — confirm intended.
            session.abort();
            throw e;
        }
    }

    /**
     * @param f6ResourceId a binary's fcrepo6 id
     * @return the id of the binary's description resource (id + "/fcr:metadata")
     */
    private String f6DescriptionId(final String f6ResourceId) {
        return f6ResourceId + FCRMETA_SUFFIX;
    }

    /**
     * @param id a slash-delimited id
     * @return the segment after the final '/' (the whole id when no '/' is present)
     */
    private String lastPartFromId(final String id) {
        return id.substring(id.lastIndexOf('/') + 1);
    }

    /**
     * @param datastreamId the fcrepo3 datastream id
     * @param f6ObjectId the fcrepo6 object id
     * @return the fcrepo6 id of the datastream, nested under its object
     */
    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) {
        return f6ObjectId + "/" + datastreamId;
    }

    /**
     * Starts a header builder with the fields common to every migrated resource.
     *
     * @param id the resource id
     * @param parentId the parent resource id
     * @param model the LDP interaction model
     * @return a builder pre-populated with version, id, parent, and interaction model
     */
    private ResourceHeaders.Builder createHeaders(final String id,
                                                  final String parentId,
                                                  final InteractionModel model) {
        final var headers = ResourceHeaders.builder();
        headers.withHeadersVersion(ResourceHeadersVersion.V1_0);
        headers.withId(id);
        headers.withParent(parentId);
        headers.withInteractionModel(model.getUri());
        return headers;
    }

    /**
     * Builds the headers for the object's container resource, pulling created/lastModified
     * dates from the fcrepo3 object properties.
     *
     * @param f6ObjectId the fcrepo6 object id
     * @param ov the version reference supplying the object properties
     * @return the populated header builder
     */
    private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
        headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL);
        headers.withObjectRoot(true);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);

        ov.getObjectProperties().listProperties().forEach(p -> {
            if (p.getName().contains("lastModifiedDate")) {
                final var lastModified = Instant.parse(p.getValue());
                headers.withLastModifiedDate(lastModified);
                headers.withMementoCreatedDate(lastModified);
                // State token derived from the last-modified epoch millis, as fcrepo6 does.
                headers.withStateToken(DigestUtils.md5Hex(
                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
            } else if (p.getName().contains("createdDate")) {
                headers.withCreatedDate(Instant.parse(p.getValue()));
            }
        });

        return headers;
    }

    /**
     * Builds the complete, immutable headers for one datastream version: filename, dates,
     * external handling (E/R control groups), content size (skipped for inline XML, whose
     * reported size is unreliable), content digest, mime type, and state token.
     *
     * @param dv the datastream version
     * @param f6DsId the fcrepo6 id of the datastream
     * @param f6ObjectId the fcrepo6 id of the owning object
     * @param filename the resolved download filename
     * @param mime the resolved mime type
     * @param createDate the datastream's original creation date (first version's created date)
     * @return the built headers
     */
    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
                                                    final String f6DsId,
                                                    final String f6ObjectId,
                                                    final String filename,
                                                    final String mime,
                                                    final String createDate) {
        // In fcrepo3 a datastream version's created date is its modification timestamp.
        final var lastModified = Instant.parse(dv.getCreated());
        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(f6ObjectId);
        }
        headers.withFilename(filename);
        headers.withCreatedDate(Instant.parse(createDate));
        headers.withLastModifiedDate(lastModified);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);
        headers.withMementoCreatedDate(lastModified);

        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
            headers.withExternalHandling(
                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
            headers.withExternalUrl(dv.getExternalOrRedirectURL());
        }

        headers.withArchivalGroup(false);
        headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC);
        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
            headers.withContentSize(dv.getSize());
        }

        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
            // fcrepo3 can report the literal digest value "none"; skip it rather than record it.
            if (!dv.getContentDigest().getDigest().equals("none")) {
                final var digest = dv.getContentDigest();
                final var digests = new ArrayList<URI>();
                digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" +
                        digest.getDigest().toLowerCase()));
                headers.withDigests(digests);
            } else {
                LOGGER.warn("Digest content 'none' found. Not adding to header");
            }
        }

        headers.withMimeType(mime);
        headers.withStateToken(DigestUtils.md5Hex(
                String.valueOf(lastModified.toEpochMilli())).toUpperCase());

        return headers.build();
    }

    /**
     * Builds the header builder for a binary's description resource, mirroring the dates,
     * users, and state token of the binary's own headers.
     *
     * @param f6DsId the fcrepo6 id of the binary
     * @param datastreamHeaders the binary's already-built headers
     * @return the description's header builder
     */
    private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId,
                                                             final ResourceHeaders datastreamHeaders) {
        final var id = f6DescriptionId(f6DsId);
        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);

        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId());
        }
        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
        headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate());

        headers.withArchivalGroup(false);
        headers.withObjectRoot(false);
        headers.withStateToken(datastreamHeaders.getStateToken());

        return headers;
    }

    /**
     * Returns the datastream's declared mime type, falling back to Tika content detection
     * (seeded with the datastream id as the resource name) when none is declared.
     *
     * @param dv the datastream version
     * @return the resolved mime type, never null
     */
    private String resolveMimeType(final DatastreamVersion dv) {
        String mime = dv.getMimeType();

        if (Strings.isNullOrEmpty(mime)) {
            final var meta = new Metadata();
            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
            try (var content = TikaInputStream.get(dv.getContent())) {
                mime = mimeDetector.detect(content, meta).toString();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        return mime;
    }

    /**
     * Deletes a binary and its description resource, using the deletion style appropriate
     * to the migration type.
     *
     * @param id the binary's fcrepo6 id
     * @param lastModified the deletion timestamp (used for FEDORA_OCFL headers)
     * @param session the session to delete in
     */
    private void deleteDatastream(final String id,
                                  final Instant lastModified,
                                  final OcflObjectSession session) {
        if (migrationType == MigrationType.PLAIN_OCFL) {
            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
        } else {
            deleteF6MigratedResource(id, lastModified, session);
            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
        }
    }

    /**
     * FEDORA_OCFL-style delete: removes the content file but rewrites the existing headers
     * with deleted=true and updated modification dates (a fcrepo6 tombstone).
     *
     * @param id the resource id
     * @param lastModified the deletion timestamp
     * @param session the session to delete in
     */
    private void deleteF6MigratedResource(final String id,
                                          final Instant lastModified,
                                          final OcflObjectSession session) {
        LOGGER.debug("Deleting resource {}", id);
        final var headers = session.readHeaders(id);
        session.deleteContentFile(ResourceHeaders.builder(headers)
                .withDeleted(true)
                .withLastModifiedDate(lastModified)
                .withMementoCreatedDate(lastModified)
                .build());
    }

    /**
     * PLAIN_OCFL-style delete: removes the content file using minimal, freshly-built headers
     * (plain OCFL objects have no stored fcrepo headers to read back).
     *
     * @param id the resource id
     * @param interactionModel the resource's interaction model
     * @param session the session to delete in
     */
    private void deleteOcflMigratedResource(final String id,
                                            final InteractionModel interactionModel,
                                            final OcflObjectSession session) {
        LOGGER.debug("Deleting resource {}", id);
        session.deleteContentFile(ResourceHeaders.builder()
                .withId(id)
                .withInteractionModel(interactionModel.getUri())
                .build());
    }

    /**
     * @param ov the version reference supplying the object properties
     * @param pid the object pid, used only for the error message
     * @return the value of the fcrepo3 object state property
     * @throws IllegalStateException if the state property is absent
     */
    private String getObjectState(final ObjectVersionReference ov, final String pid) {
        return ov.getObjectProperties().listProperties().stream()
                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
                .findFirst()
                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
                        pid)))
                .getValue();
    }

    // Get object-level triples
    private static Model getObjTriples(final ObjectVersionReference o, final String pid) {
        final Model triples = ModelFactory.createDefaultModel();
        final String uri = "info:fedora/" + pid;

        o.getObjectProperties().listProperties().forEach(p -> {
            // Properties whose name contains "Date" are typed as xsd:dateTime literals.
            if (p.getName().contains("Date")) {
                addDateLiteral(triples, uri, p.getName(), p.getValue());
            } else {
                addStringLiteral(triples, uri, p.getName(), p.getValue());
            }
        });

        return triples;
    }

    // Get datastream-level triples
    private Model getDsTriples(final DatastreamVersion dv,
                               final String f6DsId,
                               final String createDate) {
        final Model triples = ModelFactory.createDefaultModel();

        if (migrationType == MigrationType.PLAIN_OCFL) {
            // These triples are server managed in F6
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#created",
                    createDate);
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#lastModified",
                    dv.getCreated());
            addStringLiteral(triples,
                    f6DsId,
                    "http://purl.org/dc/terms/identifier",
                    dv.getDatastreamInfo().getDatastreamId());
            addStringLiteral(triples,
                    f6DsId,
                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
                    dv.getMimeType());
            addLongLiteral(triples,
                    f6DsId,
                    "http://www.loc.gov/premis/rdf/v1#size",
                    dv.getSize());

            if (dv.getContentDigest() != null) {
                addStringLiteral(triples,
                        f6DsId,
                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
                                dv.getContentDigest().getDigest().toLowerCase());
            }
        }

        addStringLiteral(triples,
                f6DsId,
                "http://purl.org/dc/terms/title",
                dv.getLabel());
        addStringLiteral(triples,
                f6DsId,
                "http://fedora.info/definitions/1/0/access/objState",
                dv.getDatastreamInfo().getState());
        addStringLiteral(triples,
                f6DsId,
                "http://www.loc.gov/premis/rdf/v1#formatDesignation",
                dv.getFormatUri());

        return triples;
    }

    // Adds (s, p, o) as a plain string literal; no-op when the object value is null.
    private static void addStringLiteral(final Model m,
                                         final String s,
                                         final String p,
                                         final String o) {
        if (o != null) {
            m.add(m.createResource(s), m.createProperty(p), o);
        }
    }

    // Adds (s, p, date) typed as xsd:dateTime; no-op when the date is null.
    private static void addDateLiteral(final Model m,
                                       final String s,
                                       final String p,
                                       final String date) {
        if (date != null) {
            m.addLiteral(m.createResource(s),
                    m.createProperty(p),
                    m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
        }
    }

    // Adds (s, p, number) typed as xsd:long; -1 is the "unknown" sentinel and is skipped.
    private static void addLongLiteral(final Model m,
                                       final String s,
                                       final String p,
                                       final long number) {
        if (number != -1) {
            m.addLiteral(m.createResource(s),
                    m.createProperty(p),
                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
        }
    }

    /**
     * @param mime any mimetype as String
     * @return extension associated with arg mime, return includes '.' in extension (.txt).
     *         Empty String if unrecognized mime
     */
    private static String getExtension(final String mime) {
        final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
        MimeType type;
        try {
            type = allTypes.forName(mime);
        } catch (final MimeTypeException e) {
            // Unparseable mime type name; fall through to the warning below.
            type = null;
        }

        if (type != null) {
            return type.getExtension();
        }

        LOGGER.warn("No mimetype found for '{}'", mime);
        return "";
    }

    /**
     * Parses a RELS-EXT/RELS-INT datastream version as RDF/XML.
     *
     * @param datastreamVersion the version whose content is RDF/XML
     * @return the parsed model
     * @throws RuntimeException wrapping any parse/IO failure, with pid/dsid context
     */
    private Model parseRdfXml(final DatastreamVersion datastreamVersion) {
        final var model = ModelFactory.createDefaultModel();
        try (final var is = datastreamVersion.getContent()) {
            RDFDataMgr.read(model, is, Lang.RDFXML);
            return model;
        } catch (Exception e) {
            throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s",
                    datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(),
                    datastreamVersion.getDatastreamInfo().getDatastreamId()), e);
        }
    }

    /**
     * Splits a RELS-INT model into one model per subject URI, since each subject's triples
     * are written to that resource's own description.
     *
     * @param relsIntModel the combined RELS-INT model
     * @return map of subject URI to the model holding only that subject's statements
     */
    private Map<String, Model> splitRelsInt(final Model relsIntModel) {
        final Map<String, Model> splitModels = new HashMap<>();
        for (final var it = relsIntModel.listStatements(); it.hasNext();) {
            final var statement = it.next();
            final var id = statement.getSubject().getURI();
            final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel());
            model.add(statement);
        }
        return splitModels;
    }

    /**
     * Creates a new session for the datastream when migrating as atomic resources, or returns the object session,
     * when migrating as archival groups.
     *
     * @param id the datastream's id in fedora 6
     * @param objectSession the datastream's object session
     * @return either a new datastream session or the object session
     */
    private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) {
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            // Archival groups: all datastreams share the object's session.
            return objectSession;
        } else {
            // Atomic resources: each datastream gets its own OCFL object session.
            return newSession(id);
        }
    }

    /**
     * Opens a new session for the given id via the session factory, wrapped in
     * an {@code OcflObjectSessionWrapper} (defined elsewhere in this project).
     *
     * @param id fedora 6 resource id
     * @return wrapped session for that id
     */
    private OcflObjectSession newSession(final String id) {
        return new OcflObjectSessionWrapper(sessionFactory.newSession(id));
    }

    /**
     * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content
     * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from
     * one of the RELS-* files. They are maintained separately because it's possible for them to be updated
     * independently and we need to be able to construct the correct set of triples when one changes.
     */
    private static class MetaHolder {
        // Triples derived from general Fedora metadata; may be null
        Model contentTriples;
        // Triples extracted from a RELS-EXT/RELS-INT file; may be null
        Model relsTriples;
        // Builder for the resource's fcrepo6 headers
        ResourceHeaders.Builder headers;

        /**
         * Creates a holder seeded with content triples and headers, but no RELS
         * triples yet.
         */
        public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) {
            return new MetaHolder(contentTriples, null, headers);
        }

        // Empty holder; presumably populated later through the fluent setters
        // by callers outside this excerpt.
        private MetaHolder() {
        }

        private MetaHolder(final Model contentTriples,
                           final Model relsTriples,
                           final ResourceHeaders.Builder headers) {
            this.contentTriples = contentTriples;
            this.relsTriples = relsTriples;
            this.headers = headers;
        }

        /**
         * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples.
         *
         * @return n-triples input stream
         */
        public InputStream constructTriples() {
            final var output = new ByteArrayOutputStream();
            final var triples = ModelFactory.createDefaultModel();

            // Merge both triple sets into one model; either may be absent.
            if (contentTriples != null) {
                triples.add(contentTriples.listStatements());
            }

            if (relsTriples != null) {
                triples.add(relsTriples.listStatements());
            }

            // Serialize the merged model using Jena's N-Triples writer.
            triples.write(output, Lang.NTRIPLES.getName());
            return new ByteArrayInputStream(output.toByteArray());
        }

        // Fluent setters: each replaces one piece of state and returns this
        // holder so updates can be chained.

        public MetaHolder setHeaders(final ResourceHeaders.Builder headers) {
            this.headers = headers;
            return this;
        }

        public MetaHolder setContentTriples(final Model contentTriples) {
            this.contentTriples = contentTriples;
            return this;
        }

        public MetaHolder setRelsTriples(final Model relsTriples) {
            this.relsTriples = relsTriples;
            return this;
        }
    }

    /**
     * Simple immutable grouping of a binary's name, mime type, and label.
     */
    private static class BinaryMeta {
        final String name;
        final String mimeType;
        final String label;

        public BinaryMeta(final String name, final String mimeType, final String label) {
            this.name = name;
            this.mimeType = mimeType;
            this.label = label;
        }
    }

}