001/* 002 * Copyright 2019 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.fcrepo.migration.handlers.ocfl; 018 019import at.favre.lib.bytes.Bytes; 020import com.google.common.base.Preconditions; 021import com.google.common.base.Strings; 022import com.google.common.collect.Sets; 023import org.apache.commons.codec.digest.DigestUtils; 024import org.apache.commons.io.IOUtils; 025import org.apache.commons.lang3.StringUtils; 026import org.apache.jena.datatypes.xsd.XSDDatatype; 027import org.apache.jena.rdf.model.Model; 028import org.apache.jena.rdf.model.ModelFactory; 029import org.apache.jena.riot.Lang; 030import org.apache.jena.riot.RDFDataMgr; 031import org.apache.tika.config.TikaConfig; 032import org.apache.tika.detect.Detector; 033import org.apache.tika.io.TikaInputStream; 034import org.apache.tika.metadata.Metadata; 035import org.apache.tika.mime.MimeType; 036import org.apache.tika.mime.MimeTypeException; 037import org.apache.tika.mime.MimeTypes; 038import org.fcrepo.migration.ContentDigest; 039import org.fcrepo.migration.DatastreamVersion; 040import org.fcrepo.migration.FedoraObjectVersionHandler; 041import org.fcrepo.migration.MigrationType; 042import org.fcrepo.migration.ObjectInfo; 043import org.fcrepo.migration.ObjectVersionReference; 044import org.fcrepo.migration.ResourceMigrationType; 045import org.fcrepo.storage.ocfl.InteractionModel; 046import 
org.fcrepo.storage.ocfl.OcflObjectSession; 047import org.fcrepo.storage.ocfl.OcflObjectSessionFactory; 048import org.fcrepo.storage.ocfl.ResourceHeaders; 049import org.fcrepo.storage.ocfl.ResourceHeadersVersion; 050import org.fcrepo.storage.ocfl.exception.NotFoundException; 051import org.slf4j.Logger; 052 053import java.io.BufferedInputStream; 054import java.io.ByteArrayInputStream; 055import java.io.ByteArrayOutputStream; 056import java.io.IOException; 057import java.io.InputStream; 058import java.io.UncheckedIOException; 059import java.net.URI; 060import java.nio.charset.StandardCharsets; 061import java.nio.file.Files; 062import java.security.DigestInputStream; 063import java.security.MessageDigest; 064import java.security.NoSuchAlgorithmException; 065import java.time.Instant; 066import java.time.OffsetDateTime; 067import java.time.ZoneOffset; 068import java.util.ArrayList; 069import java.util.HashMap; 070import java.util.HashSet; 071import java.util.Map; 072import java.util.Set; 073import java.util.concurrent.atomic.AtomicBoolean; 074 075import static org.slf4j.LoggerFactory.getLogger; 076 077/** 078 * Writes a Fedora object as a single ArchiveGroup. 079 * <p> 080 * All datastreams and object metadata from a fcrepo3 object are persisted to a 081 * single OCFL object (ArchiveGroup in fcrepo6 parlance). 082 * </p> 083 * <p> 084 * The contents of each datastream are written verbatim. No attempt is made to 085 * re-write the RELS-EXT to replace subjects and objects with their LDP 086 * counterparts. 
 * </p>
 * <p>
 * Note: fedora-specific OCFL serialization features (such as redirects,
 * container metadata, etc) is not fully defined yet, so are not included here
 *
 * @author apb@jhu.edu
 */
public class ArchiveGroupHandler implements FedoraObjectVersionHandler {

    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);

    // Fedora PID namespace prefix; also used as the parent id of migrated object roots
    private static final String FCREPO_ROOT = "info:fedora/";
    // Suffix appended to a binary's id to address its RDF description resource
    private static final String FCRMETA_SUFFIX = "/fcr:metadata";

    // Maps Fedora 3 external datastream control groups (E/R) to Fedora 6 external handling values
    private static final Map<String, String> externalHandlingMap = Map.of(
            "E", "proxy",
            "R", "redirect"
    );

    // Fedora 3 control group for inline XML datastreams
    private static final String INLINE_XML = "X";

    // Fedora 3 datastream state codes
    private static final String DS_INACTIVE = "I";
    private static final String DS_DELETED = "D";

    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
    private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename";
    // Fedora 3 object state values (full words, unlike the single-letter datastream states)
    private static final String OBJ_INACTIVE = "Inactive";
    private static final String OBJ_DELETED = "Deleted";

    // Datastream ids carrying relationship RDF in Fedora 3
    private static final String RELS_EXT = "RELS-EXT";
    private static final String RELS_INT = "RELS-INT";

    private final OcflObjectSessionFactory sessionFactory;
    private final boolean addDatastreamExtensions;
    private final boolean deleteInactive;
    private final boolean foxmlFile;
    private final MigrationType migrationType;
    private final ResourceMigrationType resourceMigrationType;
    private final String user;
    private final String idPrefix;
    private final Detector mimeDetector;
    private final boolean headOnly;
    private final boolean disableChecksumValidation;

    /**
     * Create an ArchiveGroupHandler.
     *
     * @param sessionFactory
     *            OCFL session factory
     * @param migrationType
     *            the type of migration to do
     * @param resourceMigrationType
     *            how resources should be migrated
     * @param addDatastreamExtensions
     *            true if datastreams should be written with file extensions
     * @param deleteInactive
     *            true if inactive objects and datastreams should be migrated as deleted
     * @param foxmlFile
     *            true if foxml file should be migrated as a whole file, instead of creating property files
     * @param user
     *            the username to associate with the migrated resources
     * @param idPrefix
     *            the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3)
     * @param headOnly
     *            flag to enable head only migrations
     * @param disableChecksumValidation
     *            true to skip validating datastream content against the Fedora 3 content digest
     */
    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
                               final MigrationType migrationType,
                               final ResourceMigrationType resourceMigrationType,
                               final boolean addDatastreamExtensions,
                               final boolean deleteInactive,
                               final boolean foxmlFile,
                               final String user,
                               final String idPrefix,
                               final boolean headOnly,
                               final boolean disableChecksumValidation) {
        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
        this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType,
                "resourceMigrationType cannot be null");
        this.addDatastreamExtensions = addDatastreamExtensions;
        this.deleteInactive = deleteInactive;
        this.foxmlFile = foxmlFile;
        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
        this.idPrefix = idPrefix;
        this.headOnly = headOnly;
        this.disableChecksumValidation = disableChecksumValidation;
        try {
            // Tika's default detector is used to sniff mime types for datastreams that lack one
            this.mimeDetector = new TikaConfig().getDetector();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) {
        // We use the PID to identify the OCFL object
        final String objectId = objectInfo.getPid();
        final String f6ObjectId = idPrefix + objectId;

        // We need to manually keep track of the datastream creation dates
        final Map<String, String> dsCreateDates = new HashMap<>();

        String objectState = null;
        OffsetDateTime objectCreation = null;
        OcflObjectSession objectSession = null;

        // maps datastream id -> Fedora 3 state code, consumed later by handleDeletedResources()
        final Map<String, String> datastreamStates = new HashMap<>();
        // tracks the triples used to create containers and binary descriptions
        final Map<String, MetaHolder> metaMap = new HashMap<>();
        // tracks info about binary resources needed to construct filenames
        final Map<String, BinaryMeta> binaryMeta = new HashMap<>();
        // tracks filenames pulled from RELS-INT
        final Map<String, String> filenameMap = new HashMap<>();

        for (var ov : versions) {
            // tracks the binary descriptions that need to be written
            final Set<String> toWrite = new HashSet<>();
            // tracks the binaries that need their filename updated based on RELS-INT
            final Set<String> relsFilenameUpdates = new HashSet<>();
            // tracks the binaries that need their filename updated based on a RELS-INT removal
            final Map<String, String> relsDeletedFilenames = new HashMap<>();

            // reuse the objectSession when headOnly is set
            objectSession = (objectSession == null || !headOnly) ? newSession(f6ObjectId) : objectSession;

            if (ov.isFirstVersion()) {
                // refuse to silently overwrite a previously migrated object
                if (objectSession.containsResource(f6ObjectId)) {
                    throw new RuntimeException(f6ObjectId + " already exists!");
                }
                objectCreation = OffsetDateTime.parse(ov.getVersionDate());
                objectState = getObjectState(ov, objectId);
                // Object properties are written only once (as fcrepo3 object properties were unversioned).
                if (foxmlFile) {
                    // Migrate the FOXML verbatim as a single binary datastream named FOXML
                    try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) {
                        final var foxmlDsId = f6ObjectId + "/FOXML";
                        final var headers = createHeaders(foxmlDsId, f6ObjectId,
                                InteractionModel.NON_RDF).build();
                        objectSession.writeResource(headers, is);
                        // mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources()
                        datastreamStates.put(foxmlDsId, DS_DELETED);
                    } catch (IOException io) {
                        LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io);
                        throw new UncheckedIOException(io);
                    }
                } else {
                    // Write the object container with its property triples
                    final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
                    final var content = getObjTriples(ov, objectId);
                    final var meta = MetaHolder.fromContent(content, objectHeaders);
                    metaMap.put(f6ObjectId, meta);
                    objectSession.writeResource(meta.headers.build(), meta.constructTriples());
                }
            }

            final var datastreamSessions = new HashMap<String, OcflObjectSession>();

            // Write datastreams and their metadata
            for (var dv : ov.listChangedDatastreams()) {
                final var mimeType = resolveMimeType(dv);
                final String dsId = dv.getDatastreamInfo().getDatastreamId();
                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId);
                final var datastreamFilename = lastPartFromId(f6DsId);

                final var datastreamSession = datastreamSession(f6DsId, objectSession);
                datastreamSessions.putIfAbsent(f6DsId, datastreamSession);

                if (dv.isFirstVersionIn(ov.getObject())) {
                    // remember creation date and initial state from the first version of this datastream
                    dsCreateDates.put(dsId, dv.getCreated());
                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
                }

                final var createDate = dsCreateDates.get(dsId);

                final var filename = resolveFilename(datastreamFilename,
                        dv.getLabel(), filenameMap.get(f6DsId), mimeType);

                // a rewritten datastream no longer needs its RELS-INT filename removal applied
                relsDeletedFilenames.remove(f6DsId);

                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
                        filename, mimeType, createDate);

                binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel()));

                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
                    InputStream content = null;
                    // for plain OCFL migrations, write a file containing the external/redirect URL
                    if (migrationType == MigrationType.PLAIN_OCFL) {
                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8);
                    }
                    datastreamSession.writeResource(datastreamHeaders, content);
                } else {
                    try (var contentStream = dv.getContent()) {
                        writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }

                if (!foxmlFile) {
                    // Stage the binary's RDF description; it is written later in writeMeta()
                    final var f6DescId = f6DescriptionId(f6DsId);
                    final var descriptionHeaders = createDescriptionHeaders(f6DsId,
                            datastreamHeaders);
                    final var descriptionTriples = getDsTriples(dv, f6DsId, createDate);
                    metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder())
                            .setHeaders(descriptionHeaders)
                            .setContentTriples(descriptionTriples);
                    toWrite.add(f6DescId);

                    if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) {
                        final var triples = parseRdfXml(dv);
                        if (RELS_EXT.equals(dsId)) {
                            // RELS-EXT triples are attached to the object container itself
                            metaMap.get(f6ObjectId).setRelsTriples(triples);
                            toWrite.add(f6ObjectId);
                        } else {
                            // RELS-INT triples are split per subject and attached to each description
                            final Map<String, Model> splitModels = splitRelsInt(triples);
                            final var oldIds = new HashSet<>(filenameMap.keySet());
                            filenameMap.clear();

                            splitModels.forEach((id, model) -> {
                                final var descId = f6DescriptionId(id);
                                metaMap.computeIfAbsent(descId, k -> new MetaHolder())
                                        .setRelsTriples(model);
                                toWrite.add(descId);

                                // Check to see if there are any file names that need updated
                                for (final var it = model.listStatements(); it.hasNext(); ) {
                                    final var statement = it.next();
                                    if (DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) {
                                        filenameMap.put(id, statement.getObject().toString());
                                        relsFilenameUpdates.add(id);
                                        break;
                                    }
                                }
                            });

                            // The filename was set once but is no longer
                            final var deleted = Sets.difference(oldIds, filenameMap.keySet());
                            deleted.forEach(id -> {
                                final var meta = binaryMeta.get(id);
                                if (meta != null) {
                                    // fall back to the label/datastream-name derived filename
                                    relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label,
                                            null, meta.mimeType));
                                }
                            });
                        }
                    }
                }
            }

            writeMeta(toWrite, metaMap, objectSession, datastreamSessions);
            updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions);

            if (!headOnly) {
                // commit one OCFL version per Fedora 3 version, stamped with the original version date
                LOGGER.debug("Committing object <{}>", f6ObjectId);

                final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate());

                objectSession.versionCreationTimestamp(creationTimestamp);
                objectSession.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, session) -> {
                        LOGGER.debug("Committing object <{}>", id);
                        session.versionCreationTimestamp(creationTimestamp);
                        session.commit();
                    });
                }
            }
        }

        handleDeletedResources(f6ObjectId, objectState, datastreamStates, objectSession);

        // final commit when headOnly is set
        if (headOnly && objectSession != null) {
            LOGGER.debug("Committing object <{}>", f6ObjectId);
            objectSession.versionCreationTimestamp(objectCreation);
            objectSession.commit();
        }
    }

    /**
     * Resolves the filename of the datastream based on the following precedence:
     *
     * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT
     * 2. LABEL from datastream meta
     * 3. Name of the datastream
     *
     * If extensions should be added, then an extension is picked based on the mime type.
     * If the filename already
     * includes a `.` then no extension is added.
     *
     * @param dsName the name of the datastream
     * @param labelName the datastream's label
     * @param downloadName the download name from RELS-INT
     * @param mimeType the datastream's mime type
     * @return the resolved filename
     */
    private String resolveFilename(final String dsName,
                                   final String labelName,
                                   final String downloadName,
                                   final String mimeType) {
        String filename;
        if (StringUtils.isNotBlank(downloadName)) {
            filename = downloadName;
        } else if (StringUtils.isNotBlank(labelName)) {
            filename = labelName;
        } else {
            filename = dsName;
        }

        // only append an extension when configured, a mime type is known, and none is present already
        if (addDatastreamExtensions
                && StringUtils.isNotBlank(mimeType)
                && !filename.contains(".")) {
            filename += getExtension(mimeType);
        }

        return filename;
    }

    /**
     * RDF resources are written after writing all other binaries in the version because they can be affected by
     * RELS-INT or RELS-EXT updates.
     *
     * @param toWrite the set of resources that should be written to this version
     * @param metaMap the map of all known rdf resources
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void writeMeta(final Set<String> toWrite,
                           final Map<String, MetaHolder> metaMap,
                           final OcflObjectSession objectSession,
                           final Map<String, OcflObjectSession> datastreamSessions) {
        for (final var id : toWrite) {
            final var meta = metaMap.get(id);

            if (meta.headers == null) {
                // This only happens if there's a RELS-INT that references a datastream before it exists.
                // Skip for now. The triples will be added once the datastream exists.
                continue;
            }

            // description ids end in /fcr:metadata; strip it to get the owning datastream's session key
            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                    k -> datastreamSession(k, objectSession));

            // Need to copy over the memento created date from the existing headers because it may have been updated
            // when a description's binary was updated
            if (migrationType == MigrationType.FEDORA_OCFL) {
                try {
                    final var existingHeaders = session.readHeaders(id);
                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
                } catch (NotFoundException e) {
                    // this just means the resource hasn't been written yet
                }
            }
            session.writeResource(meta.headers.build(), meta.constructTriples());
        }
    }

    /**
     * Applies filename changes driven by RELS-INT: new/changed downloadFilename values in {@code filenameMap},
     * and fallback filenames in {@code relsDeletedFilenames} for binaries whose downloadFilename was removed.
     * Header rewrites only apply to FEDORA_OCFL migrations (plain OCFL has no resource headers to update).
     *
     * @param toUpdate ids of binaries whose RELS-INT filename was added or changed
     * @param filenameMap id -> downloadFilename from the latest RELS-INT
     * @param relsDeletedFilenames id -> fallback filename for removed downloadFilename entries
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void updateFilenames(final Set<String> toUpdate,
                                 final Map<String, String> filenameMap,
                                 final Map<String, String> relsDeletedFilenames,
                                 final OcflObjectSession objectSession,
                                 final Map<String, OcflObjectSession> datastreamSessions) {
        if (migrationType == MigrationType.FEDORA_OCFL) {
            toUpdate.forEach(id -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var filename = filenameMap.get(id);
                if (StringUtils.isNotBlank(filename)) {
                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                    session.writeHeaders(newHeaders);
                }
            });
            relsDeletedFilenames.forEach((id, filename) -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                session.writeHeaders(newHeaders);
            });
        }
    }

    /**
     * @param f3Digest the Fedora 3 content digest, may be null
     * @return true when the digest has both a non-blank type and a non-blank value
     */
    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
        return f3Digest != null
                && StringUtils.isNotBlank(f3Digest.getType()) &&
                StringUtils.isNotBlank(f3Digest.getDigest());
    }

    /**
     * Writes a datastream's content, validating it against the Fedora 3 content digest when possible.
     * Validation is skipped when disableChecksumValidation is set, for PLAIN_OCFL migrations, and when
     * the digest algorithm is unknown (logged and written anyway). A digest mismatch aborts the migration.
     *
     * @param dv the datastream version being written
     * @param datastreamHeaders the headers to write with the content
     * @param contentStream the datastream content
     * @param session the session to write into
     * @throws IOException if the content cannot be read or written
     */
    private void writeDatastreamContent(final DatastreamVersion dv,
                                        final ResourceHeaders datastreamHeaders,
                                        final InputStream contentStream,
                                        final OcflObjectSession session) throws IOException {
        if (disableChecksumValidation) {
            session.writeResource(datastreamHeaders, contentStream);
            return;
        }
        final var f3Digest = dv.getContentDigest();
        final var ocflObjectId = session.ocflObjectId();
        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
        if (fedora3DigestValid(f3Digest)) {
            try {
                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
                if (migrationType == MigrationType.PLAIN_OCFL) {
                    session.writeResource(datastreamHeaders, contentStream);
                } else {
                    // compute the digest while streaming the content, then compare to the recorded value
                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
                        session.writeResource(datastreamHeaders, digestStream);
                        final var expectedDigest = f3Digest.getDigest();
                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
                            throw new RuntimeException(msg);
                        }
                    }
                }
            } catch (final NoSuchAlgorithmException e) {
                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
                        ocflObjectId, datastreamId, f3Digest.getType());
                LOGGER.warn(msg);
                session.writeResource(datastreamHeaders, contentStream);
            }
        } else {
            // only warn for managed ("M") content; other control groups routinely lack digests
            if (datastreamControlGroup.equalsIgnoreCase("M")) {
                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
                        ocflObjectId, datastreamId);
                LOGGER.warn(msg);
            }
            session.writeResource(datastreamHeaders, contentStream);
        }
    }

    /**
     * After all versions are migrated, marks resources whose final Fedora 3 state was Deleted (or Inactive,
     * when deleteInactive is set) as deleted in the migrated repository. A deleted object deletes all of its
     * datastreams as well; otherwise only individually deleted/inactive datastreams are removed.
     *
     * @param f6ObjectId the migrated object's id
     * @param objectState the object's final Fedora 3 state
     * @param datastreamStates map of datastream id -> final Fedora 3 state code
     * @param objectSession the object session, reused when headOnly is set
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates,
                                        final OcflObjectSession objectSession) {
        final OcflObjectSession session = headOnly ? objectSession : newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (!headOnly && hasDeletes.get()) {
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else if
(!headOnly) { 568 session.abort(); 569 if (resourceMigrationType == ResourceMigrationType.ATOMIC) { 570 datastreamSessions.forEach((id, dsSession) -> { 571 dsSession.abort(); 572 }); 573 } 574 } 575 } catch (RuntimeException e) { 576 session.abort(); 577 throw e; 578 } 579 } 580 581 private String f6DescriptionId(final String f6ResourceId) { 582 return f6ResourceId + FCRMETA_SUFFIX; 583 } 584 585 private String lastPartFromId(final String id) { 586 return id.substring(id.lastIndexOf('/') + 1); 587 } 588 589 private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) { 590 return f6ObjectId + "/" + datastreamId; 591 } 592 593 private ResourceHeaders.Builder createHeaders(final String id, 594 final String parentId, 595 final InteractionModel model) { 596 final var headers = ResourceHeaders.builder(); 597 headers.withHeadersVersion(ResourceHeadersVersion.V1_0); 598 headers.withId(id); 599 headers.withParent(parentId); 600 headers.withInteractionModel(model.getUri()); 601 return headers; 602 } 603 604 private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) { 605 final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER); 606 headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL); 607 headers.withObjectRoot(true); 608 headers.withLastModifiedBy(user); 609 headers.withCreatedBy(user); 610 611 ov.getObjectProperties().listProperties().forEach(p -> { 612 if (p.getName().contains("lastModifiedDate")) { 613 final var lastModified = Instant.parse(p.getValue()); 614 headers.withLastModifiedDate(lastModified); 615 headers.withMementoCreatedDate(lastModified); 616 headers.withStateToken(DigestUtils.md5Hex( 617 String.valueOf(lastModified.toEpochMilli())).toUpperCase()); 618 } else if (p.getName().contains("createdDate")) { 619 headers.withCreatedDate(Instant.parse(p.getValue())); 620 } 621 }); 622 623 return headers; 624 } 625 626 
private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv, 627 final String f6DsId, 628 final String f6ObjectId, 629 final String filename, 630 final String mime, 631 final String createDate) { 632 final var lastModified = Instant.parse(dv.getCreated()); 633 final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF); 634 if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) { 635 headers.withArchivalGroupId(f6ObjectId); 636 } 637 headers.withFilename(filename); 638 headers.withCreatedDate(Instant.parse(createDate)); 639 headers.withLastModifiedDate(lastModified); 640 headers.withLastModifiedBy(user); 641 headers.withCreatedBy(user); 642 headers.withMementoCreatedDate(lastModified); 643 644 if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) { 645 headers.withExternalHandling( 646 externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup())); 647 headers.withExternalUrl(dv.getExternalOrRedirectURL()); 648 } 649 650 headers.withArchivalGroup(false); 651 headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC); 652 if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) { 653 headers.withContentSize(dv.getSize()); 654 } 655 656 if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) { 657 final var digest = dv.getContentDigest(); 658 final var digests = new ArrayList<URI>(); 659 digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" + digest.getDigest().toLowerCase())); 660 headers.withDigests(digests); 661 } 662 663 headers.withMimeType(mime); 664 headers.withStateToken(DigestUtils.md5Hex( 665 String.valueOf(lastModified.toEpochMilli())).toUpperCase()); 666 667 return headers.build(); 668 } 669 670 private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId, 671 final ResourceHeaders datastreamHeaders) { 672 final var id = f6DescriptionId(f6DsId); 673 final var headers = 
createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION); 674 675 if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) { 676 headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId()); 677 } 678 headers.withCreatedDate(datastreamHeaders.getCreatedDate()); 679 headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate()); 680 headers.withCreatedBy(datastreamHeaders.getCreatedBy()); 681 headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy()); 682 headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate()); 683 684 headers.withArchivalGroup(false); 685 headers.withObjectRoot(false); 686 headers.withStateToken(datastreamHeaders.getStateToken()); 687 688 return headers; 689 } 690 691 private String resolveMimeType(final DatastreamVersion dv) { 692 String mime = dv.getMimeType(); 693 694 if (Strings.isNullOrEmpty(mime)) { 695 final var meta = new Metadata(); 696 meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId()); 697 try (var content = TikaInputStream.get(dv.getContent())) { 698 mime = mimeDetector.detect(content, meta).toString(); 699 } catch (IOException e) { 700 throw new UncheckedIOException(e); 701 } 702 } 703 704 return mime; 705 } 706 707 private void deleteDatastream(final String id, 708 final Instant lastModified, 709 final OcflObjectSession session) { 710 if (migrationType == MigrationType.PLAIN_OCFL) { 711 deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session); 712 deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session); 713 } else { 714 deleteF6MigratedResource(id, lastModified, session); 715 deleteF6MigratedResource(f6DescriptionId(id), lastModified, session); 716 } 717 } 718 719 private void deleteF6MigratedResource(final String id, 720 final Instant lastModified, 721 final OcflObjectSession session) { 722 LOGGER.debug("Deleting resource {}", id); 723 final var headers = session.readHeaders(id); 724 
session.deleteContentFile(ResourceHeaders.builder(headers) 725 .withDeleted(true) 726 .withLastModifiedDate(lastModified) 727 .withMementoCreatedDate(lastModified) 728 .build()); 729 } 730 731 private void deleteOcflMigratedResource(final String id, 732 final InteractionModel interactionModel, 733 final OcflObjectSession session) { 734 LOGGER.debug("Deleting resource {}", id); 735 session.deleteContentFile(ResourceHeaders.builder() 736 .withId(id) 737 .withInteractionModel(interactionModel.getUri()) 738 .build()); 739 } 740 741 private String getObjectState(final ObjectVersionReference ov, final String pid) { 742 return ov.getObjectProperties().listProperties().stream() 743 .filter(prop -> OBJ_STATE_PROP.equals(prop.getName())) 744 .findFirst() 745 .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information", 746 pid))) 747 .getValue(); 748 } 749 750 // Get object-level triples 751 private static Model getObjTriples(final ObjectVersionReference o, final String pid) { 752 final Model triples = ModelFactory.createDefaultModel(); 753 final String uri = "info:fedora/" + pid; 754 755 o.getObjectProperties().listProperties().forEach(p -> { 756 if (p.getName().contains("Date")) { 757 addDateLiteral(triples, uri, p.getName(), p.getValue()); 758 } else { 759 addStringLiteral(triples, uri, p.getName(), p.getValue()); 760 } 761 }); 762 763 return triples; 764 } 765 766 // Get datastream-level triples 767 private Model getDsTriples(final DatastreamVersion dv, 768 final String f6DsId, 769 final String createDate) { 770 final Model triples = ModelFactory.createDefaultModel(); 771 772 if (migrationType == MigrationType.PLAIN_OCFL) { 773 // These triples are server managed in F6 774 addDateLiteral(triples, 775 f6DsId, 776 "http://fedora.info/definitions/v4/repository#created", 777 createDate); 778 addDateLiteral(triples, 779 f6DsId, 780 "http://fedora.info/definitions/v4/repository#lastModified", 781 dv.getCreated()); 782 
addStringLiteral(triples, 783 f6DsId, 784 "http://purl.org/dc/terms/identifier", 785 dv.getDatastreamInfo().getDatastreamId()); 786 addStringLiteral(triples, 787 f6DsId, 788 "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType", 789 dv.getMimeType()); 790 addLongLiteral(triples, 791 f6DsId, 792 "http://www.loc.gov/premis/rdf/v1#size", 793 dv.getSize()); 794 795 if (dv.getContentDigest() != null) { 796 addStringLiteral(triples, 797 f6DsId, 798 "http://www.loc.gov/premis/rdf/v1#hasMessageDigest", 799 "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" + 800 dv.getContentDigest().getDigest().toLowerCase()); 801 } 802 } 803 804 addStringLiteral(triples, 805 f6DsId, 806 "http://purl.org/dc/terms/title", 807 dv.getLabel()); 808 addStringLiteral(triples, 809 f6DsId, 810 "http://fedora.info/definitions/1/0/access/objState", 811 dv.getDatastreamInfo().getState()); 812 addStringLiteral(triples, 813 f6DsId, 814 "http://www.loc.gov/premis/rdf/v1#formatDesignation", 815 dv.getFormatUri()); 816 817 return triples; 818 } 819 820 private static void addStringLiteral(final Model m, 821 final String s, 822 final String p, 823 final String o) { 824 if (o != null) { 825 m.add(m.createResource(s), m.createProperty(p), o); 826 } 827 } 828 829 private static void addDateLiteral(final Model m, 830 final String s, 831 final String p, 832 final String date) { 833 if (date != null) { 834 m.addLiteral(m.createResource(s), 835 m.createProperty(p), 836 m.createTypedLiteral(date, XSDDatatype.XSDdateTime)); 837 } 838 } 839 840 private static void addLongLiteral(final Model m, 841 final String s, 842 final String p, 843 final long number) { 844 if (number != -1) { 845 m.addLiteral(m.createResource(s), 846 m.createProperty(p), 847 m.createTypedLiteral(number, XSDDatatype.XSDlong)); 848 } 849 } 850 851 /** 852 * @param mime any mimetype as String 853 * @return extension associated with arg mime, return includes '.' in extension (.txt). 
854 * ..Empty String if unrecognized mime 855 */ 856 private static String getExtension(final String mime) { 857 final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes(); 858 MimeType type; 859 try { 860 type = allTypes.forName(mime); 861 } catch (final MimeTypeException e) { 862 type = null; 863 } 864 865 if (type != null) { 866 return type.getExtension(); 867 } 868 869 LOGGER.warn("No mimetype found for '{}'", mime); 870 return ""; 871 } 872 873 private Model parseRdfXml(final DatastreamVersion datastreamVersion) { 874 final var model = ModelFactory.createDefaultModel(); 875 try (final var is = datastreamVersion.getContent()) { 876 RDFDataMgr.read(model, is, Lang.RDFXML); 877 return model; 878 } catch (Exception e) { 879 throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s", 880 datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(), 881 datastreamVersion.getDatastreamInfo().getDatastreamId()), e); 882 } 883 } 884 885 private Map<String, Model> splitRelsInt(final Model relsIntModel) { 886 final Map<String, Model> splitModels = new HashMap<>(); 887 for (final var it = relsIntModel.listStatements(); it.hasNext();) { 888 final var statement = it.next(); 889 final var id = statement.getSubject().getURI(); 890 final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel()); 891 model.add(statement); 892 } 893 return splitModels; 894 } 895 896 /** 897 * Creates a new session for the datastream when migrating as atomic resources, or returns the object session, 898 * when migrating as archival groups. 
899 * 900 * @param id the datastream's id in fedora 6 901 * @param objectSession the datastream's object session 902 * @return either a new datastream session or the object session 903 */ 904 private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) { 905 if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) { 906 return objectSession; 907 } else { 908 return newSession(id); 909 } 910 } 911 912 private OcflObjectSession newSession(final String id) { 913 return new OcflObjectSessionWrapper(sessionFactory.newSession(id)); 914 } 915 916 /** 917 * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content 918 * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from 919 * one of the RELS-* files. They are maintained separately because it's possible for them to be updated 920 * independently and we need to be able to construct the correct set of triples when one changes. 921 */ 922 private static class MetaHolder { 923 Model contentTriples; 924 Model relsTriples; 925 ResourceHeaders.Builder headers; 926 927 public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) { 928 return new MetaHolder(contentTriples, null, headers); 929 } 930 931 private MetaHolder() { 932 } 933 934 private MetaHolder(final Model contentTriples, 935 final Model relsTriples, 936 final ResourceHeaders.Builder headers) { 937 this.contentTriples = contentTriples; 938 this.relsTriples = relsTriples; 939 this.headers = headers; 940 } 941 942 /** 943 * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples. 
944 * 945 * @return n-triples input stream 946 */ 947 public InputStream constructTriples() { 948 final var output = new ByteArrayOutputStream(); 949 final var triples = ModelFactory.createDefaultModel(); 950 951 if (contentTriples != null) { 952 triples.add(contentTriples.listStatements()); 953 } 954 955 if (relsTriples != null) { 956 triples.add(relsTriples.listStatements()); 957 } 958 959 triples.write(output, Lang.NTRIPLES.getName()); 960 return new ByteArrayInputStream(output.toByteArray()); 961 } 962 963 public MetaHolder setHeaders(final ResourceHeaders.Builder headers) { 964 this.headers = headers; 965 return this; 966 } 967 968 public MetaHolder setContentTriples(final Model contentTriples) { 969 this.contentTriples = contentTriples; 970 return this; 971 } 972 973 public MetaHolder setRelsTriples(final Model relsTriples) { 974 this.relsTriples = relsTriples; 975 return this; 976 } 977 } 978 979 private static class BinaryMeta { 980 final String name; 981 final String mimeType; 982 final String label; 983 984 public BinaryMeta(final String name, final String mimeType, final String label) { 985 this.name = name; 986 this.mimeType = mimeType; 987 this.label = label; 988 } 989 } 990 991}