001/* 002 * Copyright 2019 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.fcrepo.migration.handlers.ocfl; 018 019import at.favre.lib.bytes.Bytes; 020import com.google.common.base.Preconditions; 021import com.google.common.base.Strings; 022import com.google.common.collect.Sets; 023import org.apache.commons.codec.digest.DigestUtils; 024import org.apache.commons.io.IOUtils; 025import org.apache.commons.lang3.StringUtils; 026import org.apache.jena.datatypes.xsd.XSDDatatype; 027import org.apache.jena.rdf.model.Model; 028import org.apache.jena.rdf.model.ModelFactory; 029import org.apache.jena.riot.Lang; 030import org.apache.jena.riot.RDFDataMgr; 031import org.apache.jena.graph.NodeFactory; 032import org.apache.jena.graph.Triple; 033import org.apache.jena.rdf.model.Statement; 034 035import org.apache.tika.config.TikaConfig; 036import org.apache.tika.detect.Detector; 037import org.apache.tika.io.TikaInputStream; 038import org.apache.tika.metadata.Metadata; 039import org.apache.tika.mime.MimeType; 040import org.apache.tika.mime.MimeTypeException; 041import org.apache.tika.mime.MimeTypes; 042import org.fcrepo.migration.ContentDigest; 043import org.fcrepo.migration.DatastreamVersion; 044import org.fcrepo.migration.FedoraObjectVersionHandler; 045import org.fcrepo.migration.MigrationType; 046import org.fcrepo.migration.ObjectInfo; 047import org.fcrepo.migration.ObjectVersionReference; 048import 
org.fcrepo.migration.ResourceMigrationType; 049import org.fcrepo.migration.foxml.DC; 050import org.fcrepo.storage.ocfl.InteractionModel; 051import org.fcrepo.storage.ocfl.OcflObjectSession; 052import org.fcrepo.storage.ocfl.OcflObjectSessionFactory; 053import org.fcrepo.storage.ocfl.ResourceHeaders; 054import org.fcrepo.storage.ocfl.ResourceHeadersVersion; 055import org.fcrepo.storage.ocfl.exception.NotFoundException; 056import org.slf4j.Logger; 057 058import java.io.BufferedInputStream; 059import java.io.ByteArrayInputStream; 060import java.io.ByteArrayOutputStream; 061import java.io.IOException; 062import java.io.InputStream; 063import java.io.UncheckedIOException; 064import java.net.URI; 065import java.nio.charset.StandardCharsets; 066import java.nio.file.Files; 067import java.security.DigestInputStream; 068import java.security.MessageDigest; 069import java.security.NoSuchAlgorithmException; 070import java.time.Instant; 071import java.time.OffsetDateTime; 072import java.time.ZoneOffset; 073import java.util.ArrayList; 074import java.util.HashMap; 075import java.util.HashSet; 076import java.util.Map; 077import java.util.Set; 078import java.util.concurrent.atomic.AtomicBoolean; 079import static org.slf4j.LoggerFactory.getLogger; 080 081/** 082 * Writes a Fedora object as a single ArchiveGroup. 083 * <p> 084 * All datastreams and object metadata from a fcrepo3 object are persisted to a 085 * single OCFL object (ArchiveGroup in fcrepo6 parlance). 086 * </p> 087 * <p> 088 * The contents of each datastream are written verbatim. No attempt is made to 089 * re-write the RELS-EXT to replace subjects and objects with their LDP 090 * counterparts. 
 * </p>
 * <p>
 * Note: fedora-specific OCFL serialization features (such as redirects,
 * container metadata, etc) are not fully defined yet, so are not included here
 * </p>
 *
 * @author apb@jhu.edu
 */
public class ArchiveGroupHandler implements FedoraObjectVersionHandler {

    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);

    private static final String FCREPO_ROOT = "info:fedora/";
    private static final String FCRMETA_SUFFIX = "/fcr:metadata";

    // Maps fcrepo3 external control groups ("E"/"R") to fcrepo6 external-content handling modes
    private static final Map<String, String> externalHandlingMap = Map.of(
            "E", "proxy",
            "R", "redirect"
    );

    // fcrepo3 control group for inline XML datastreams
    private static final String INLINE_XML = "X";

    // fcrepo3 datastream state codes
    private static final String DS_INACTIVE = "I";
    private static final String DS_DELETED = "D";

    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
    private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename";
    // fcrepo3 object state values
    private static final String OBJ_INACTIVE = "Inactive";
    private static final String OBJ_DELETED = "Deleted";

    // Reserved fcrepo3 datastream ids that get special handling during migration
    private static final String RELS_EXT = "RELS-EXT";
    private static final String RELS_INT = "RELS-INT";
    private static final String DC_DS = "DC";

    private final OcflObjectSessionFactory sessionFactory;
    private final boolean addDatastreamExtensions;
    private final boolean deleteInactive;
    private final boolean foxmlFile;
    private final MigrationType migrationType;
    private final ResourceMigrationType resourceMigrationType;
    private final String user;
    private final String idPrefix;
    private final Detector mimeDetector;
    private final boolean headOnly;
    private final boolean disableChecksumValidation;
    private final boolean disableDc;

    /**
     * Create an ArchiveGroupHandler,
     *
     * @param sessionFactory
     *        OCFL session factory
     * @param migrationType
     *        the type of migration to do
     * @param resourceMigrationType
     *        how resources should be migrated
     * @param addDatastreamExtensions
     *        true if datastreams should be written with file extensions
     * @param deleteInactive
     *        true if inactive objects and datastreams should be migrated as deleted
     * @param foxmlFile
     *        true if foxml file should be migrated as a whole file, instead of creating property files
     * @param user
     *        the username to associate with the migrated resources
     * @param idPrefix
     *        the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3)
     * @param headOnly
     *        flag to enable head only migrations
     * @param disableChecksumValidation
     *        disable Checksum validation
     * @param disableDc
     *        true if DC datastreams should not be migrated to RDF object properties
     */
    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
                               final MigrationType migrationType,
                               final ResourceMigrationType resourceMigrationType,
                               final boolean addDatastreamExtensions,
                               final boolean deleteInactive,
                               final boolean foxmlFile,
                               final String user,
                               final String idPrefix,
                               final boolean headOnly,
                               final boolean disableChecksumValidation,
                               final boolean disableDc) {
        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
        this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType,
                "resourceMigrationType cannot be null");
        this.addDatastreamExtensions = addDatastreamExtensions;
        this.deleteInactive = deleteInactive;
        this.foxmlFile = foxmlFile;
        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
        this.idPrefix = idPrefix;
        this.headOnly = headOnly;
        this.disableChecksumValidation = disableChecksumValidation;
        this.disableDc = disableDc;
        try {
            // Tika's default detector is used to sniff mime types for datastreams missing one
            this.mimeDetector = new TikaConfig().getDetector();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) {
        // We use the PID to identify the OCFL object
        final String objectId = objectInfo.getPid();
        final String f6ObjectId = idPrefix + objectId;

        // We need to manually keep track of the datastream creation dates
        final Map<String, String> dsCreateDates = new HashMap<>();

        String objectState = null;
        OffsetDateTime objectCreation = null;
        OcflObjectSession objectSession = null;

        final Map<String, String> datastreamStates = new HashMap<>();
        // tracks the triples used to create containers and binary descriptions
        final Map<String, MetaHolder> metaMap = new HashMap<>();
        // tracks info about binary resources needed to construct filenames
        final Map<String, BinaryMeta> binaryMeta = new HashMap<>();
        // tracks filenames pulled from RELS-INT
        final Map<String, String> filenameMap = new HashMap<>();

        for (var ov : versions) {
            // tracks the binary descriptions that need to be written
            final Set<String> toWrite = new HashSet<>();
            // tracks the binaries that need their filename updated based on RELS-INT
            final Set<String> relsFilenameUpdates = new HashSet<>();
            // tracks the binaries that need their filename updated based on a RELS-INT removal
            final Map<String, String> relsDeletedFilenames = new HashMap<>();

            // reuse the objectSession when headOnly is set
            objectSession = (objectSession == null || !headOnly) ? newSession(f6ObjectId) : objectSession;

            if (ov.isFirstVersion()) {
                if (objectSession.containsResource(f6ObjectId)) {
                    throw new RuntimeException(f6ObjectId + " already exists!");
                }
                objectCreation = OffsetDateTime.parse(ov.getVersionDate());
                objectState = getObjectState(ov, objectId);
                // Object properties are written only once (as fcrepo3 object properties were unversioned).
                if (foxmlFile) {
                    // Whole-FOXML migration: persist the foxml file as a single binary datastream
                    try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) {
                        final var foxmlDsId = f6ObjectId + "/FOXML";
                        final var headers = createHeaders(foxmlDsId, f6ObjectId,
                                InteractionModel.NON_RDF).build();
                        objectSession.writeResource(headers, is);
                        //mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources()
                        datastreamStates.put(foxmlDsId, DS_DELETED);
                    } catch (IOException io) {
                        LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io);
                        throw new UncheckedIOException(io);
                    }
                } else {
                    // Property-file migration: write the object container with its fcrepo3 properties as triples
                    final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
                    final var content = getObjTriples(ov, objectId);
                    final var meta = MetaHolder.fromContent(content, objectHeaders);
                    metaMap.put(f6ObjectId, meta);
                    objectSession.writeResource(meta.headers.build(), meta.constructTriples());
                }
            }

            final var datastreamSessions = new HashMap<String, OcflObjectSession>();

            // Write datastreams and their metadata
            for (var dv : ov.listChangedDatastreams()) {
                final var mimeType = resolveMimeType(dv);
                final String dsId = dv.getDatastreamInfo().getDatastreamId();
                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId);
                final var datastreamFilename = lastPartFromId(f6DsId);

                final var datastreamSession = datastreamSession(f6DsId, objectSession);
                datastreamSessions.putIfAbsent(f6DsId, datastreamSession);

                if (dv.isFirstVersionIn(ov.getObject())) {
                    dsCreateDates.put(dsId, dv.getCreated());
                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
                }

                final var createDate = dsCreateDates.get(dsId);

                // filename precedence: RELS-INT download name, then label, then datastream id (see resolveFilename)
                final var filename = resolveFilename(datastreamFilename,
                        dv.getLabel(), filenameMap.get(f6DsId), mimeType);

                // a rewrite of the datastream supersedes any pending RELS-INT filename removal
                relsDeletedFilenames.remove(f6DsId);

                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
                        filename, mimeType, createDate);

                binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel()));

                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
                    InputStream content = null;
                    // for plain OCFL migrations, write a file containing the external/redirect URL
                    if (migrationType == MigrationType.PLAIN_OCFL) {
                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8);
                    }
                    datastreamSession.writeResource(datastreamHeaders, content);
                } else {
                    try (var contentStream = dv.getContent()) {
                        writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }

                if (!foxmlFile) {
                    // Queue the binary description (fcr:metadata) triples; actual write happens in writeMeta()
                    final var f6DescId = f6DescriptionId(f6DsId);
                    final var descriptionHeaders = createDescriptionHeaders(f6DsId,
                            datastreamHeaders);
                    final var descriptionTriples = getDsTriples(dv, f6DsId, createDate);
                    metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder())
                            .setHeaders(descriptionHeaders)
                            .setContentTriples(descriptionTriples);
                    toWrite.add(f6DescId);

                    if (DC_DS.equals(dsId) && !disableDc) {
                        // Map the DC datastream's elements onto the object container as literal triples
                        DC dc = new DC();
                        try {
                            dc = DC.parseDC(dv.getContent());
                        } catch (Exception e) {
                            throw new RuntimeException(String.format("Failed to parse DC XML in %s/%s",
                                    objectId,f6DsId), e);
                        }

                        final var model = ModelFactory.createDefaultModel();
                        for (String uri : dc.getRepresentedElementURIs()) {
                            for (String value : dc.getValuesForURI(uri)) {
                                final Triple dcTriple = new Triple(
                                        NodeFactory.createURI(f6ObjectId),
                                        NodeFactory.createURI(uri),
                                        NodeFactory.createLiteral(value, XSDDatatype.XSDstring));
                                final Statement statement = model.asStatement(dcTriple);
                                model.add(statement);
                                LOGGER.debug(dcTriple.toString());
                            }
                        }

                        metaMap.get(f6ObjectId).setDcTriples(model);
                        toWrite.add(f6ObjectId);

                    }

                    if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) {
                        final var triples = parseRdfXml(dv);
                        if (RELS_EXT.equals(dsId)) {
                            // RELS-EXT triples all describe the object itself
                            metaMap.get(f6ObjectId).setRelsTriples(triples);
                            toWrite.add(f6ObjectId);
                        } else {
                            // RELS-INT triples describe individual datastreams: split by subject
                            final Map<String, Model> splitModels = splitRelsInt(triples);
                            final var oldIds = new HashSet<>(filenameMap.keySet());
                            filenameMap.clear();

                            splitModels.forEach((id, model) -> {
                                final var descId = f6DescriptionId(id);
                                metaMap.computeIfAbsent(descId, k -> new MetaHolder())
                                        .setRelsTriples(model);
                                toWrite.add(descId);

                                // Check to see if there are any file names that need to be updated
                                for (final var it = model.listStatements(); it.hasNext(); ) {
                                    final var statement = it.next();
                                    if (DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) {
                                        filenameMap.put(id, statement.getObject().toString());
                                        relsFilenameUpdates.add(id);
                                        break;
                                    }
                                }
                            });

                            // The filename was set once but is no longer
                            final var deleted = Sets.difference(oldIds, filenameMap.keySet());
                            deleted.forEach(id -> {
                                final var meta = binaryMeta.get(id);
                                if (meta != null) {
                                    relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label,
                                            null, meta.mimeType));
                                }
                            });
                        }
                    }
                }
            }

            // RDF metadata is written last so RELS-EXT/RELS-INT changes in this version are reflected
            writeMeta(toWrite, metaMap, objectSession, datastreamSessions);
            updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions);

            if (!headOnly) {
                LOGGER.debug("Committing object <{}>", f6ObjectId);

                final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate());

                objectSession.versionCreationTimestamp(creationTimestamp);
                objectSession.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, session) -> {
                        LOGGER.debug("Committing object <{}>", id);
                        session.versionCreationTimestamp(creationTimestamp);
                        session.commit();
                    });
                }
            }
        }

        handleDeletedResources(f6ObjectId, objectState, datastreamStates, objectSession);

        // final commit when headOnly is set
        if (headOnly && objectSession != null) {
            LOGGER.debug("Committing object <{}>", f6ObjectId);
            objectSession.versionCreationTimestamp(objectCreation);
            objectSession.commit();
        }
    }

    /**
     * Resolves the filename of the datastream based on the following precedence:
     *
     * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT
     * 2. LABEL from datastream meta
     * 3. Name of the datastream
     *
     * If extensions should be added, then an extension is picked based on the mime type. If the filename already
     * includes a `.` then no extension is added.
     *
     * @param dsName the name of the datastream
     * @param labelName the datastream's label
     * @param downloadName the download name from RELS-INT
     * @param mimeType the datastream's mime type
     * @return the resolved filename
     */
    private String resolveFilename(final String dsName,
                                   final String labelName,
                                   final String downloadName,
                                   final String mimeType) {
        String filename;
        if (StringUtils.isNotBlank(downloadName)) {
            filename = downloadName;
        } else if (StringUtils.isNotBlank(labelName)) {
            filename = labelName;
        } else {
            filename = dsName;
        }

        if (addDatastreamExtensions
                && StringUtils.isNotBlank(mimeType)
                && !filename.contains(".")) {
            filename += getExtension(mimeType);
        }

        return filename;
    }

    /**
     * RDF resources are written after writing all other binaries in the version because they can be affected by
     * RELS-INT or RELS-EXT updates.
     *
     * @param toWrite the set of resources that should be written to this version
     * @param metaMap the map of all known rdf resources
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void writeMeta(final Set<String> toWrite,
                           final Map<String, MetaHolder> metaMap,
                           final OcflObjectSession objectSession,
                           final Map<String, OcflObjectSession> datastreamSessions) {
        for (final var id : toWrite) {
            final var meta = metaMap.get(id);

            if (meta.headers == null) {
                // This only happens if there's a RELS-INT that references a datastream before it exists.
                // Skip for now. The triples will be added once the datastream exists.
                continue;
            }

            // description ids end in /fcr:metadata; strip the suffix to find the owning datastream's session
            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                    k -> datastreamSession(k, objectSession));

            // Need to copy over the memento created date from the existing headers because it may have been updated
            // when a description's binary was updated
            if (migrationType == MigrationType.FEDORA_OCFL) {
                try {
                    final var existingHeaders = session.readHeaders(id);
                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
                } catch (NotFoundException e) {
                    // this just means the resource hasn't been written yet
                }
            }
            session.writeResource(meta.headers.build(), meta.constructTriples());
        }
    }

    /**
     * Applies RELS-INT driven filename changes to already-written binary headers. Only applies to
     * FEDORA_OCFL migrations, where headers are rewritten in place; plain OCFL has no header files.
     *
     * @param toUpdate ids of binaries whose RELS-INT download filename was added/changed
     * @param filenameMap current id -> filename mappings pulled from RELS-INT
     * @param relsDeletedFilenames id -> fallback filename for binaries whose RELS-INT filename was removed
     * @param objectSession the ocfl session for the object
     * @param datastreamSessions the ocfl sessions for the datastreams
     */
    private void updateFilenames(final Set<String> toUpdate,
                                 final Map<String, String> filenameMap,
                                 final Map<String, String> relsDeletedFilenames,
                                 final OcflObjectSession objectSession,
                                 final Map<String, OcflObjectSession> datastreamSessions) {
        if (migrationType == MigrationType.FEDORA_OCFL) {
            toUpdate.forEach(id -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var filename = filenameMap.get(id);
                if (StringUtils.isNotBlank(filename)) {
                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                    session.writeHeaders(newHeaders);
                }
            });
            relsDeletedFilenames.forEach((id, filename) -> {
                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
                        k -> datastreamSession(k, objectSession));
                final var origHeaders = session.readHeaders(id);
                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
                session.writeHeaders(newHeaders);
            });
        }
    }

    /**
     * @param f3Digest the fcrepo3 content digest, may be null
     * @return true when the digest has both a type and a value, so it can be validated against
     */
    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
        return f3Digest != null && StringUtils.isNotBlank(f3Digest.getType()) &&
                StringUtils.isNotBlank(f3Digest.getDigest());
    }

    /**
     * Writes a datastream's content, validating it against the fcrepo3 digest when possible.
     * Validation is skipped when disableChecksumValidation is set, when the fcrepo3 digest is
     * missing/invalid, when the digest algorithm is unknown, or for PLAIN_OCFL migrations.
     *
     * @param dv the datastream version being written
     * @param datastreamHeaders headers for the binary resource
     * @param contentStream the content to write; caller is responsible for closing it
     * @param session the ocfl session to write into
     * @throws IOException on stream failure
     */
    private void writeDatastreamContent(final DatastreamVersion dv,
                                        final ResourceHeaders datastreamHeaders,
                                        final InputStream contentStream,
                                        final OcflObjectSession session) throws IOException {
        if (disableChecksumValidation) {
            session.writeResource(datastreamHeaders, contentStream);
            return;
        }
        final var f3Digest = dv.getContentDigest();
        final var ocflObjectId = session.ocflObjectId();
        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
        if (fedora3DigestValid(f3Digest)) {
            try {
                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
                if (migrationType == MigrationType.PLAIN_OCFL) {
                    session.writeResource(datastreamHeaders, contentStream);
                } else {
                    // compute the digest while streaming, then compare with the fcrepo3 recorded value
                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
                        session.writeResource(datastreamHeaders, digestStream);
                        final var expectedDigest = f3Digest.getDigest();
                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
                            throw new RuntimeException(msg);
                        }
                    }
                }
            } catch (final NoSuchAlgorithmException e) {
                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
                        ocflObjectId, datastreamId, f3Digest.getType());
                LOGGER.warn(msg);
                session.writeResource(datastreamHeaders, contentStream);
            }
        } else {
            // only warn for managed ("M") content, where a digest would normally be expected
            if (datastreamControlGroup.equalsIgnoreCase("M")) {
                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
                        ocflObjectId, datastreamId);
                LOGGER.warn(msg);
            }
            session.writeResource(datastreamHeaders, contentStream);
        }
    }

    /**
     * Marks deleted (and optionally inactive) objects/datastreams as deleted in a final version.
     * If the whole object is deleted, every datastream is deleted along with the container;
     * otherwise only the individually deleted datastreams are. Commits only when something
     * was actually deleted (in non-headOnly mode); aborts the session otherwise.
     *
     * @param f6ObjectId the fedora 6 object id
     * @param objectState the fcrepo3 object state (e.g. "Deleted"/"Inactive")
     * @param datastreamStates map of datastream id to its fcrepo3 state code
     * @param objectSession session reused when headOnly is set; otherwise a fresh session is opened
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates,
                                        final OcflObjectSession objectSession) {
        final OcflObjectSession session = headOnly ? objectSession : newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (!headOnly && hasDeletes.get()) {
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else if (!headOnly) {
                // nothing was deleted: discard the empty version
                session.abort();
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.abort();
                    });
                }
            }
        } catch (RuntimeException e) {
            session.abort();
            throw e;
        }
    }

    /** @return the fedora 6 id of the binary description (fcr:metadata) for the given resource id */
    private String f6DescriptionId(final String f6ResourceId) {
        return f6ResourceId + FCRMETA_SUFFIX;
    }

    /** @return the final path segment of a slash-delimited id */
    private String lastPartFromId(final String id) {
        return id.substring(id.lastIndexOf('/') + 1);
    }

    /** @return the fedora 6 datastream id: the object id with the datastream id appended */
    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) {
        return f6ObjectId + "/" + datastreamId;
    }

    /**
     * Creates a header builder pre-populated with the fields common to every migrated resource.
     *
     * @param id the resource id
     * @param parentId the id of the resource's parent
     * @param model the LDP interaction model
     * @return a partially populated header builder
     */
    private ResourceHeaders.Builder createHeaders(final String id,
                                                  final String parentId,
                                                  final InteractionModel model) {
        final var headers = ResourceHeaders.builder();
        headers.withHeadersVersion(ResourceHeadersVersion.V1_0);
        headers.withId(id);
        headers.withParent(parentId);
        headers.withInteractionModel(model.getUri());
        return headers;
    }

    /**
     * Creates headers for the object's container resource, pulling created/modified dates from
     * the fcrepo3 object properties.
     *
     * @param f6ObjectId the fedora 6 object id
     * @param ov the object version providing the fcrepo3 object properties
     * @return the populated header builder
     */
    private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
        headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL);
        headers.withObjectRoot(true);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);

        ov.getObjectProperties().listProperties().forEach(p -> {
            if (p.getName().contains("lastModifiedDate")) {
                final var lastModified = Instant.parse(p.getValue());
                headers.withLastModifiedDate(lastModified);
                headers.withMementoCreatedDate(lastModified);
                // state token derived from the modification time, mirroring fcrepo6's convention
                headers.withStateToken(DigestUtils.md5Hex(
                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
            } else if (p.getName().contains("createdDate")) {
                headers.withCreatedDate(Instant.parse(p.getValue()));
            }
        });

        return headers;
    }

    /**
     * Creates the resource headers for a binary datastream version.
     *
     * @param dv the datastream version
     * @param f6DsId the fedora 6 datastream id
     * @param f6ObjectId the fedora 6 object id (the datastream's parent)
     * @param filename the resolved download filename
     * @param mime the resolved mime type
     * @param createDate the datastream's original creation date (from its first version)
     * @return the built resource headers
     */
    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
                                                    final String f6DsId,
                                                    final String f6ObjectId,
                                                    final String filename,
                                                    final String mime,
                                                    final String createDate) {
        // the version's creation date in fcrepo3 is the binary's last-modified date in fcrepo6
        final var lastModified = Instant.parse(dv.getCreated());
        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(f6ObjectId);
        }
        headers.withFilename(filename);
        headers.withCreatedDate(Instant.parse(createDate));
        headers.withLastModifiedDate(lastModified);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);
        headers.withMementoCreatedDate(lastModified);

        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
            headers.withExternalHandling(
                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
            headers.withExternalUrl(dv.getExternalOrRedirectURL());
        }

        headers.withArchivalGroup(false);
        headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC);
        // inline XML sizes in fcrepo3 are unreliable, so only record sizes for other control groups
        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
            headers.withContentSize(dv.getSize());
        }

        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
            if (!dv.getContentDigest().getDigest().equals("none")) {
                final var digest = dv.getContentDigest();
                final var digests = new ArrayList<URI>();
                // record as a urn, e.g. urn:sha-1:abc123...
                digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" +
                        digest.getDigest().toLowerCase()));
                headers.withDigests(digests);
            } else {
                LOGGER.warn("Digest content 'none' found. Not adding to header");
            }
        }

        headers.withMimeType(mime);
        headers.withStateToken(DigestUtils.md5Hex(
                String.valueOf(lastModified.toEpochMilli())).toUpperCase());

        return headers.build();
    }

    /**
     * Creates the headers for a binary's description (fcr:metadata), copying the timestamps,
     * users, and state token from the binary's own headers so the two stay in sync.
     *
     * @param f6DsId the fedora 6 datastream id the description belongs to
     * @param datastreamHeaders the headers of the described binary
     * @return the populated header builder
     */
    private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId,
                                                             final ResourceHeaders datastreamHeaders) {
        final var id = f6DescriptionId(f6DsId);
        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);

        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId());
        }
        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
        headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate());

        headers.withArchivalGroup(false);
        headers.withObjectRoot(false);
        headers.withStateToken(datastreamHeaders.getStateToken());

        return headers;
    }

    /**
     * Returns the datastream's declared mime type, or sniffs one from its content with Tika
     * when no mime type was recorded in fcrepo3.
     *
     * @param dv the datastream version
     * @return the resolved mime type
     */
    private String resolveMimeType(final DatastreamVersion dv) {
        String mime = dv.getMimeType();

        if (Strings.isNullOrEmpty(mime)) {
            final var meta = new Metadata();
            // give Tika the datastream id as a filename hint for extension-based detection
            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
            try (var content = TikaInputStream.get(dv.getContent())) {
                mime = mimeDetector.detect(content, meta).toString();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        return mime;
    }

    /**
     * Deletes a binary datastream and its description, using the deletion style appropriate
     * to the migration type.
     *
     * @param id the fedora 6 datastream id
     * @param lastModified the deletion timestamp (ignored for plain OCFL)
     * @param session the session to apply the deletion in
     */
    private void deleteDatastream(final String id,
                                  final Instant lastModified,
                                  final OcflObjectSession session) {
        if (migrationType == MigrationType.PLAIN_OCFL) {
            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
        } else {
            deleteF6MigratedResource(id, lastModified, session);
            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
        }
    }

    /**
     * FEDORA_OCFL delete: removes the content file but keeps headers, marked deleted — a tombstone.
     *
     * @param id the resource id to delete
     * @param lastModified timestamp recorded on the tombstone headers
     * @param session the session to apply the deletion in
     */
    private void deleteF6MigratedResource(final String id,
                                          final Instant lastModified,
                                          final OcflObjectSession session) {
        LOGGER.debug("Deleting resource {}", id);
        final var headers = session.readHeaders(id);
        session.deleteContentFile(ResourceHeaders.builder(headers)
                .withDeleted(true)
                .withLastModifiedDate(lastModified)
                .withMementoCreatedDate(lastModified)
                .build());
    }

    /**
     * PLAIN_OCFL delete: removes the content file outright; minimal headers identify the target.
     *
     * @param id the resource id to delete
     * @param interactionModel the resource's LDP interaction model
     * @param session the session to apply the deletion in
     */
    private void deleteOcflMigratedResource(final String id,
                                            final InteractionModel interactionModel,
                                            final OcflObjectSession session) {
        LOGGER.debug("Deleting resource {}", id);
        session.deleteContentFile(ResourceHeaders.builder()
                .withId(id)
                .withInteractionModel(interactionModel.getUri())
                .build());
    }

    /**
     * Extracts the fcrepo3 object state property from the object's properties.
     *
     * @param ov the object version
     * @param pid the object's pid, used for the error message
     * @return the state value
     * @throws IllegalStateException if no state property is present
     */
    private String getObjectState(final ObjectVersionReference ov, final String pid) {
        return ov.getObjectProperties().listProperties().stream()
                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
                .findFirst()
                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
                        pid)))
                .getValue();
    }

    // Get object-level triples
    private static Model getObjTriples(final ObjectVersionReference o, final String pid) {
        final Model triples = ModelFactory.createDefaultModel();
        final String uri = "info:fedora/" + pid;

        o.getObjectProperties().listProperties().forEach(p -> {
            // properties whose names contain "Date" are typed as xsd:dateTime, all others as strings
            if (p.getName().contains("Date")) {
                addDateLiteral(triples, uri, p.getName(), p.getValue());
            } else {
                addStringLiteral(triples, uri, p.getName(), p.getValue());
            }
        });

        return triples;
    }

    // Get datastream-level triples
    private Model getDsTriples(final DatastreamVersion dv,
                               final String f6DsId,
                               final String createDate) {
        final Model triples = ModelFactory.createDefaultModel();

        if (migrationType == MigrationType.PLAIN_OCFL) {
            // These triples are server managed in F6
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#created",
                    createDate);
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#lastModified",
                    dv.getCreated());
            addStringLiteral(triples,
                    f6DsId,
                    "http://purl.org/dc/terms/identifier",
                    dv.getDatastreamInfo().getDatastreamId());
            addStringLiteral(triples,
                    f6DsId,
                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
                    dv.getMimeType());
            addLongLiteral(triples,
                    f6DsId,
                    "http://www.loc.gov/premis/rdf/v1#size",
                    dv.getSize());

            if (dv.getContentDigest() != null) {
                addStringLiteral(triples,
                        f6DsId,
                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
                                dv.getContentDigest().getDigest().toLowerCase());
            }
        }

        addStringLiteral(triples,
                f6DsId,
                "http://purl.org/dc/terms/title",
                dv.getLabel());
        addStringLiteral(triples,
                f6DsId,
                "http://fedora.info/definitions/1/0/access/objState",
                dv.getDatastreamInfo().getState());
        addStringLiteral(triples,
                f6DsId,
                "http://www.loc.gov/premis/rdf/v1#formatDesignation",
                dv.getFormatUri());

        return triples;
    }

    /** Adds a plain string literal triple; silently skips null values. */
    private static void addStringLiteral(final Model m,
                                         final String s,
                                         final String p,
                                         final String o) {
        if (o != null) {
            m.add(m.createResource(s), m.createProperty(p), o);
        }
    }

    /** Adds an xsd:dateTime typed literal triple; silently skips null dates. */
    private static void addDateLiteral(final Model m,
                                       final String s,
                                       final String p,
                                       final String date) {
        if (date != null) {
            m.addLiteral(m.createResource(s),
                    m.createProperty(p),
                    m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
        }
    }

    /** Adds an xsd:long typed literal triple; -1 is the "unknown" sentinel and is skipped. */
    private static void addLongLiteral(final Model m,
                                       final String s,
                                       final String p,
                                       final long number) {
        if (number != -1) {
            m.addLiteral(m.createResource(s),
                    m.createProperty(p),
                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
        }
    }

    /**
     * @param mime any mimetype as String
     * @return extension associated with arg mime, return includes '.' in extension (.txt).
     *         Empty String if unrecognized mime
     */
    private static String getExtension(final String mime) {
        final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
        MimeType type;
        try {
            type = allTypes.forName(mime);
        } catch (final MimeTypeException e) {
            type = null;
        }

        if (type != null) {
            return type.getExtension();
        }

        LOGGER.warn("No mimetype found for '{}'", mime);
        return "";
    }

    /**
     * Parses a datastream's content as RDF/XML into a Jena model.
     *
     * @param datastreamVersion the datastream version to parse (e.g. RELS-EXT or RELS-INT)
     * @return the parsed model
     * @throws RuntimeException wrapping any parse failure, with the pid/datastream in the message
     */
    private Model parseRdfXml(final DatastreamVersion datastreamVersion) {
        final var model = ModelFactory.createDefaultModel();
        try (final var is = datastreamVersion.getContent()) {
            RDFDataMgr.read(model, is, Lang.RDFXML);
            return model;
        } catch (Exception e) {
            throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s",
                    datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(),
                    datastreamVersion.getDatastreamInfo().getDatastreamId()), e);
        }
    }

    /**
     * Splits a RELS-INT model into one model per subject, since RELS-INT statements describe
     * individual datastreams rather than the object.
     *
     * @param relsIntModel the parsed RELS-INT model
     * @return map of subject URI to the model containing that subject's statements
     */
    private Map<String, Model> splitRelsInt(final Model relsIntModel) {
        final Map<String, Model> splitModels = new HashMap<>();
        for (final var it = relsIntModel.listStatements(); it.hasNext();) {
            final var statement = it.next();
            final var id = statement.getSubject().getURI();
            final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel());
            model.add(statement);
        }
        return splitModels;
    }

    /**
     * Creates a new session for the datastream when migrating as atomic resources, or returns the object session,
     * when migrating as archival groups.
     *
     * @param id the datastream's id in fedora 6
     * @param objectSession the datastream's object session
     * @return either a new datastream session or the object session
     */
    private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) {
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            return objectSession;
        } else {
            return newSession(id);
        }
    }

    /** Opens a new OCFL session for the given id, wrapped for this handler's use. */
    private OcflObjectSession newSession(final String id) {
        return new OcflObjectSessionWrapper(sessionFactory.newSession(id));
    }

    /**
     * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content
     * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from
     * one of the RELS-* files. They are maintained separately because it's possible for them to be updated
     * independently and we need to be able to construct the correct set of triples when one changes.
964 */ 965 private static class MetaHolder { 966 Model contentTriples; 967 Model relsTriples; 968 Model dcTriples; 969 ResourceHeaders.Builder headers; 970 971 public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) { 972 return new MetaHolder(contentTriples, null, headers); 973 } 974 975 private MetaHolder() { 976 } 977 978 private MetaHolder(final Model contentTriples, 979 final Model relsTriples, 980 final Model dcTriples, 981 final ResourceHeaders.Builder headers) { 982 this.contentTriples = contentTriples; 983 this.relsTriples = relsTriples; 984 this.dcTriples = dcTriples; 985 this.headers = headers; 986 } 987 988 private MetaHolder(final Model contentTriples, 989 final Model relsTriples, 990 final ResourceHeaders.Builder headers) { 991 this.contentTriples = contentTriples; 992 this.relsTriples = relsTriples; 993 this.headers = headers; 994 } 995 996 997 /** 998 * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples. 
999 * 1000 * @return n-triples input stream 1001 */ 1002 public InputStream constructTriples() { 1003 final var output = new ByteArrayOutputStream(); 1004 final var triples = ModelFactory.createDefaultModel(); 1005 1006 if (contentTriples != null) { 1007 triples.add(contentTriples.listStatements()); 1008 } 1009 1010 if (relsTriples != null) { 1011 triples.add(relsTriples.listStatements()); 1012 } 1013 1014 if (dcTriples != null) { 1015 triples.add(dcTriples.listStatements()); 1016 } 1017 1018 triples.write(output, Lang.NTRIPLES.getName()); 1019 return new ByteArrayInputStream(output.toByteArray()); 1020 } 1021 1022 public MetaHolder setHeaders(final ResourceHeaders.Builder headers) { 1023 this.headers = headers; 1024 return this; 1025 } 1026 1027 public MetaHolder setContentTriples(final Model contentTriples) { 1028 this.contentTriples = contentTriples; 1029 return this; 1030 } 1031 1032 public MetaHolder setRelsTriples(final Model relsTriples) { 1033 this.relsTriples = relsTriples; 1034 return this; 1035 } 1036 public MetaHolder setDcTriples(final Model dcTriples) { 1037 this.dcTriples = dcTriples; 1038 return this; 1039 } 1040 1041 } 1042 1043 private static class BinaryMeta { 1044 final String name; 1045 final String mimeType; 1046 final String label; 1047 1048 public BinaryMeta(final String name, final String mimeType, final String label) { 1049 this.name = name; 1050 this.mimeType = mimeType; 1051 this.label = label; 1052 } 1053 } 1054 1055}