001/* 002 * Copyright 2019 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.fcrepo.migration.handlers.ocfl; 018 019import com.google.common.base.Preconditions; 020import com.google.common.base.Strings; 021import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; 022import com.hp.hpl.jena.rdf.model.Model; 023import com.hp.hpl.jena.rdf.model.ModelFactory; 024import org.apache.commons.codec.digest.DigestUtils; 025import org.apache.commons.io.IOUtils; 026import org.apache.tika.config.TikaConfig; 027import org.apache.tika.detect.Detector; 028import org.apache.tika.io.TikaInputStream; 029import org.apache.tika.metadata.Metadata; 030import org.apache.tika.mime.MimeType; 031import org.apache.tika.mime.MimeTypeException; 032import org.apache.tika.mime.MimeTypes; 033import org.fcrepo.migration.DatastreamVersion; 034import org.fcrepo.migration.FedoraObjectVersionHandler; 035import org.fcrepo.migration.MigrationType; 036import org.fcrepo.migration.ObjectVersionReference; 037import org.fcrepo.storage.ocfl.InteractionModel; 038import org.fcrepo.storage.ocfl.OcflObjectSession; 039import org.fcrepo.storage.ocfl.OcflObjectSessionFactory; 040import org.fcrepo.storage.ocfl.ResourceHeaders; 041import org.slf4j.Logger; 042 043import java.io.ByteArrayInputStream; 044import java.io.ByteArrayOutputStream; 045import java.io.IOException; 046import java.io.InputStream; 047import java.io.UncheckedIOException; 048import java.net.URI; 049import java.time.Instant; 050import java.time.OffsetDateTime; 051import java.util.ArrayList; 052import java.util.HashMap; 053import java.util.Map; 054import java.util.concurrent.atomic.AtomicBoolean; 055 056import static org.slf4j.LoggerFactory.getLogger; 057 058/** 059 * Writes a Fedora object as a single ArchiveGroup. 060 * <p> 061 * All datastreams and object metadata from a fcrepo3 object are persisted to a 062 * single OCFL object (ArchiveGroup in fcrepo6 parlance). 063 * </p> 064 * <p> 065 * The contents of each datastream are written verbatim. No attempt is made to 066 * re-write the RELS-EXT to replace subjects and objects with their LDP 067 * counterparts. 068 * </p> 069 * <p> 070 * Note: fedora-specific OCFL serialization features (such as redirects, 071 * container metadata, etc) is not fully defined yet, so are not included here 072 * 073 * @author apb@jhu.edu 074 */ 075public class ArchiveGroupHandler implements FedoraObjectVersionHandler { 076 077 private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class); 078 079 private static final String FCREPO_ROOT = "info:fedora/"; 080 081 private static final Map<String, String> externalHandlingMap = Map.of( 082 "E", "proxy", 083 "R", "redirect" 084 ); 085 086 private static final String INLINE_XML = "X"; 087 088 private static final String DS_INACTIVE = "I"; 089 private static final String DS_DELETED = "D"; 090 091 private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state"; 092 private static final String OBJ_INACTIVE = "Inactive"; 093 private static final String OBJ_DELETED = "Deleted"; 094 095 private final OcflObjectSessionFactory sessionFactory; 096 private final boolean addDatastreamExtensions; 097 private final boolean deleteInactive; 098 private final MigrationType migrationType; 099 private final String user; 100 private final Detector mimeDetector; 101 102 /** 103 * Create an ArchiveGroupHandler, 104 * 105 * @param sessionFactory 106 * OCFL session factory 107 * @param migrationType 108 * the type of migration to do 109 * @param addDatastreamExtensions 110 * true if datastreams should be written with file extensions 111 * @param deleteInactive 112 * true if inactive objects and datastreams should be migrated as deleted 113 * @param user 114 * the username to associated with the migrated resources 115 */ 116 public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory, 117 final MigrationType migrationType, 118 final boolean addDatastreamExtensions, 119 final boolean deleteInactive, 120 final String user) { 121 this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null"); 122 this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null"); 123 this.addDatastreamExtensions = addDatastreamExtensions; 124 this.deleteInactive = deleteInactive; 125 this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank"); 126 try { 127 this.mimeDetector = new TikaConfig().getDetector(); 128 } catch (Exception e) { 129 throw new RuntimeException(e); 130 } 131 } 132 133 @Override 134 public void processObjectVersions(final Iterable<ObjectVersionReference> versions) { 135 // We use the PID to identify the OCFL object 136 String objectId = null; 137 String f6ObjectId = null; 138 139 // We need to manually keep track of the datastream creation dates 140 final Map<String, String> dsCreateDates = new HashMap<>(); 141 142 String objectState = null; 143 final Map<String, String> datastreamStates = new HashMap<>(); 144 145 for (var ov : versions) { 146 if (ov.isFirstVersion()) { 147 objectId = ov.getObjectInfo().getPid(); 148 f6ObjectId = FCREPO_ROOT + objectId; 149 objectState = getObjectState(ov); 150 } 151 152 final OcflObjectSession session = sessionFactory.newSession(f6ObjectId); 153 154 // Object properties are written only once (as fcrepo3 object properties were unversioned). 155 if (ov.isFirstVersion()) { 156 writeObjectFiles(f6ObjectId, ov, session); 157 } 158 159 // Write datastreams and their metadata 160 for (var dv : ov.listChangedDatastreams()) { 161 final var mimeType = resolveMimeType(dv); 162 final String dsId = dv.getDatastreamInfo().getDatastreamId(); 163 final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId, mimeType); 164 final var datastreamFilename = lastPartFromId(f6DsId); 165 166 if (dv.isFirstVersionIn(ov.getObject())) { 167 dsCreateDates.put(dsId, dv.getCreated()); 168 datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState()); 169 } 170 final var createDate = dsCreateDates.get(dsId); 171 172 final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId, 173 datastreamFilename, mimeType, createDate); 174 175 if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) { 176 InputStream content = null; 177 // Write a file for external content only for plain OCFL migration 178 if (migrationType == MigrationType.PLAIN_OCFL) { 179 content = IOUtils.toInputStream(dv.getExternalOrRedirectURL()); 180 } 181 session.writeResource(datastreamHeaders, content); 182 } else { 183 try (var content = dv.getContent()) { 184 session.writeResource(datastreamHeaders, content); 185 } catch (final IOException e) { 186 throw new UncheckedIOException(e); 187 } 188 } 189 190 writeDescriptionFiles(f6DsId, datastreamFilename, createDate, datastreamHeaders, dv, session); 191 } 192 193 LOGGER.debug("Committing object <{}>", f6ObjectId); 194 195 session.versionCreationTimestamp(OffsetDateTime.parse(ov.getVersionDate())); 196 session.commit(); 197 } 198 199 handleDeletedResources(f6ObjectId, objectState, datastreamStates); 200 } 201 202 private void handleDeletedResources(final String f6ObjectId, 203 final String objectState, 204 final Map<String, String> datastreamStates) { 205 final OcflObjectSession session = sessionFactory.newSession(f6ObjectId); 206 207 try { 208 final var now = OffsetDateTime.now(); 209 final var hasDeletes = new AtomicBoolean(false); 210 211 if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) { 212 hasDeletes.set(true); 213 214 datastreamStates.keySet().forEach(f6DsId -> { 215 deleteDatastream(f6DsId, now.toInstant(), session); 216 }); 217 218 if (migrationType == MigrationType.PLAIN_OCFL) { 219 deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session); 220 } else { 221 deleteF6MigratedResource(f6ObjectId, now.toInstant(), session); 222 } 223 } else { 224 datastreamStates.forEach((f6DsId, state) -> { 225 if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) { 226 hasDeletes.set(true); 227 deleteDatastream(f6DsId, now.toInstant(), session); 228 } 229 }); 230 } 231 232 if (hasDeletes.get()) { 233 session.versionCreationTimestamp(now); 234 session.commit(); 235 } else { 236 session.abort(); 237 } 238 } catch (RuntimeException e) { 239 session.abort(); 240 throw e; 241 } 242 } 243 244 private void writeObjectFiles(final String f6ObjectId, 245 final ObjectVersionReference ov, 246 final OcflObjectSession session) { 247 final var objectHeaders = createObjectHeaders(f6ObjectId, ov); 248 final var content = getObjTriples(ov); 249 session.writeResource(objectHeaders, content); 250 } 251 252 private void writeDescriptionFiles(final String f6Dsid, 253 final String datastreamFilename, 254 final String createDate, 255 final ResourceHeaders datastreamHeaders, 256 final DatastreamVersion dv, 257 final OcflObjectSession session) { 258 final var descriptionHeaders = createDescriptionHeaders(f6Dsid, 259 datastreamFilename, 260 datastreamHeaders); 261 session.writeResource(descriptionHeaders, getDsTriples(dv, f6Dsid, createDate)); 262 } 263 264 private String f6DescriptionId(final String f6ResourceId) { 265 return f6ResourceId + "/fcr:metadata"; 266 } 267 268 private String lastPartFromId(final String id) { 269 return id.substring(id.lastIndexOf('/') + 1); 270 } 271 272 private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId, final String mimeType) { 273 var id = f6ObjectId + "/" + datastreamId; 274 275 if (addDatastreamExtensions && !Strings.isNullOrEmpty(mimeType)) { 276 id += getExtension(mimeType); 277 } 278 279 return id; 280 } 281 282 private ResourceHeaders.Builder createHeaders(final String id, 283 final String parentId, 284 final InteractionModel model) { 285 final var headers = ResourceHeaders.builder(); 286 headers.withId(id); 287 headers.withParent(parentId); 288 headers.withInteractionModel(model.getUri()); 289 return headers; 290 } 291 292 private ResourceHeaders createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) { 293 final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER); 294 headers.withArchivalGroup(true); 295 headers.withObjectRoot(true); 296 headers.withLastModifiedBy(user); 297 headers.withCreatedBy(user); 298 299 ov.getObjectProperties().listProperties().forEach(p -> { 300 if (p.getName().contains("lastModifiedDate")) { 301 final var lastModified = Instant.parse(p.getValue()); 302 headers.withLastModifiedDate(lastModified); 303 headers.withStateToken(DigestUtils.md5Hex( 304 String.valueOf(lastModified.toEpochMilli())).toUpperCase()); 305 } else if (p.getName().contains("createdDate")) { 306 headers.withCreatedDate(Instant.parse(p.getValue())); 307 } 308 }); 309 310 return headers.build(); 311 } 312 313 private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv, 314 final String f6DsId, 315 final String f6ObjectId, 316 final String filename, 317 final String mime, 318 final String createDate) { 319 final var lastModified = Instant.parse(dv.getCreated()); 320 final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF); 321 headers.withArchivalGroupId(f6ObjectId); 322 headers.withFilename(filename); 323 headers.withCreatedDate(Instant.parse(createDate)); 324 headers.withLastModifiedDate(lastModified); 325 headers.withLastModifiedBy(user); 326 headers.withCreatedBy(user); 327 328 if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) { 329 headers.withExternalHandling( 330 externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup())); 331 headers.withExternalUrl(dv.getExternalOrRedirectURL()); 332 } 333 334 headers.withArchivalGroup(false); 335 headers.withObjectRoot(false); 336 if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) { 337 headers.withContentSize(dv.getSize()); 338 } 339 340 if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) { 341 final var digest = dv.getContentDigest(); 342 final var digests = new ArrayList<URI>(); 343 digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" + digest.getDigest().toLowerCase())); 344 headers.withDigests(digests); 345 } 346 347 headers.withMimeType(mime); 348 headers.withStateToken(DigestUtils.md5Hex( 349 String.valueOf(lastModified.toEpochMilli())).toUpperCase()); 350 351 return headers.build(); 352 } 353 354 private ResourceHeaders createDescriptionHeaders(final String f6DsId, 355 final String filename, 356 final ResourceHeaders datastreamHeaders) { 357 final var id = f6DescriptionId(f6DsId); 358 final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION); 359 360 headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId()); 361 headers.withFilename(filename); 362 headers.withCreatedDate(datastreamHeaders.getCreatedDate()); 363 headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate()); 364 headers.withCreatedBy(datastreamHeaders.getCreatedBy()); 365 headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy()); 366 367 headers.withArchivalGroup(false); 368 headers.withObjectRoot(false); 369 headers.withStateToken(datastreamHeaders.getStateToken()); 370 371 return headers.build(); 372 } 373 374 private String resolveMimeType(final DatastreamVersion dv) { 375 String mime = dv.getMimeType(); 376 377 if (Strings.isNullOrEmpty(mime)) { 378 final var meta = new Metadata(); 379 meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId()); 380 try (var content = TikaInputStream.get(dv.getContent())) { 381 mime = mimeDetector.detect(content, meta).toString(); 382 } catch (IOException e) { 383 throw new UncheckedIOException(e); 384 } 385 } 386 387 return mime; 388 } 389 390 private void deleteDatastream(final String id, 391 final Instant lastModified, 392 final OcflObjectSession session) { 393 if (migrationType == MigrationType.PLAIN_OCFL) { 394 deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session); 395 deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session); 396 } else { 397 deleteF6MigratedResource(id, lastModified, session); 398 deleteF6MigratedResource(f6DescriptionId(id), lastModified, session); 399 } 400 } 401 402 private void deleteF6MigratedResource(final String id, 403 final Instant lastModified, 404 final OcflObjectSession session) { 405 LOGGER.debug("Deleting resource {}", id); 406 final var headers = session.readHeaders(id); 407 session.deleteContentFile(ResourceHeaders.builder(headers) 408 .withDeleted(true) 409 .withLastModifiedDate(lastModified) 410 .build()); 411 } 412 413 private void deleteOcflMigratedResource(final String id, 414 final InteractionModel interactionModel, 415 final OcflObjectSession session) { 416 LOGGER.debug("Deleting resource {}", id); 417 session.deleteContentFile(ResourceHeaders.builder() 418 .withId(id) 419 .withInteractionModel(interactionModel.getUri()) 420 .build()); 421 } 422 423 private String getObjectState(final ObjectVersionReference ov) { 424 return ov.getObjectProperties().listProperties().stream() 425 .filter(prop -> OBJ_STATE_PROP.equals(prop.getName())) 426 .findFirst() 427 .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information", 428 ov.getObjectInfo().getPid()))) 429 .getValue(); 430 } 431 432 // Get object-level triples 433 private static InputStream getObjTriples(final ObjectVersionReference o) { 434 final ByteArrayOutputStream out = new ByteArrayOutputStream(); 435 final Model triples = ModelFactory.createDefaultModel(); 436 final String uri = "info:fedora/" + o.getObjectInfo().getPid(); 437 438 o.getObjectProperties().listProperties().forEach(p -> { 439 if (p.getName().contains("Date")) { 440 addDateLiteral(triples, uri, p.getName(), p.getValue()); 441 } else { 442 addStringLiteral(triples, uri, p.getName(), p.getValue()); 443 } 444 }); 445 446 triples.write(out, "N-TRIPLES"); 447 return new ByteArrayInputStream(out.toByteArray()); 448 } 449 450 // Get datastream-level triples 451 private InputStream getDsTriples(final DatastreamVersion dv, 452 final String f6DsId, 453 final String createDate) { 454 final ByteArrayOutputStream out = new ByteArrayOutputStream(); 455 final Model triples = ModelFactory.createDefaultModel(); 456 457 if (migrationType == MigrationType.PLAIN_OCFL) { 458 // These triples are server managed in F6 459 addDateLiteral(triples, 460 f6DsId, 461 "http://fedora.info/definitions/v4/repository#created", 462 createDate); 463 addDateLiteral(triples, 464 f6DsId, 465 "http://fedora.info/definitions/v4/repository#lastModified", 466 dv.getCreated()); 467 addStringLiteral(triples, 468 f6DsId, 469 "http://purl.org/dc/terms/identifier", 470 dv.getDatastreamInfo().getDatastreamId()); 471 addStringLiteral(triples, 472 f6DsId, 473 "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType", 474 dv.getMimeType()); 475 addLongLiteral(triples, 476 f6DsId, 477 "http://www.loc.gov/premis/rdf/v1#size", 478 dv.getSize()); 479 480 if (dv.getContentDigest() != null) { 481 addStringLiteral(triples, 482 f6DsId, 483 "http://www.loc.gov/premis/rdf/v1#hasMessageDigest", 484 "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" + 485 dv.getContentDigest().getDigest().toLowerCase()); 486 } 487 } 488 489 addStringLiteral(triples, 490 f6DsId, 491 "http://purl.org/dc/terms/title", 492 dv.getLabel()); 493 addStringLiteral(triples, 494 f6DsId, 495 "http://fedora.info/definitions/1/0/access/objState", 496 dv.getDatastreamInfo().getState()); 497 addStringLiteral(triples, 498 f6DsId, 499 "http://www.loc.gov/premis/rdf/v1#formatDesignation", 500 dv.getFormatUri()); 501 502 triples.write(out, "N-TRIPLES"); 503 return new ByteArrayInputStream(out.toByteArray()); 504 } 505 506 private static void addStringLiteral(final Model m, 507 final String s, 508 final String p, 509 final String o) { 510 if (o != null) { 511 m.add(m.createResource(s), m.createProperty(p), o); 512 } 513 } 514 515 private static void addDateLiteral(final Model m, 516 final String s, 517 final String p, 518 final String date) { 519 if (date != null) { 520 m.addLiteral(m.createResource(s), 521 m.createProperty(p), 522 m.createTypedLiteral(date, XSDDatatype.XSDdateTime)); 523 } 524 } 525 526 private static void addLongLiteral(final Model m, 527 final String s, 528 final String p, 529 final long number) { 530 if (number != -1) { 531 m.addLiteral(m.createResource(s), 532 m.createProperty(p), 533 m.createTypedLiteral(number, XSDDatatype.XSDlong)); 534 } 535 } 536 537 /** 538 * @param mime any mimetype as String 539 * @return extension associated with arg mime, return includes '.' in extension (.txt). 540 * ..Empty String if unrecognized mime 541 */ 542 private static String getExtension(final String mime) { 543 final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes(); 544 MimeType type; 545 try { 546 type = allTypes.forName(mime); 547 } catch (final MimeTypeException e) { 548 type = null; 549 } 550 551 if (type != null) { 552 return type.getExtension(); 553 } 554 555 LOGGER.warn("No mimetype found for '{}'", mime); 556 return ""; 557 } 558 559}