001/*
002 * Copyright 2019 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.fcrepo.migration.handlers.ocfl;
018
019import at.favre.lib.bytes.Bytes;
020import com.google.common.base.Preconditions;
021import com.google.common.base.Strings;
022import com.google.common.collect.Sets;
023import org.apache.commons.codec.digest.DigestUtils;
024import org.apache.commons.io.IOUtils;
025import org.apache.commons.lang3.StringUtils;
026import org.apache.jena.datatypes.xsd.XSDDatatype;
027import org.apache.jena.rdf.model.Model;
028import org.apache.jena.rdf.model.ModelFactory;
029import org.apache.jena.riot.Lang;
030import org.apache.jena.riot.RDFDataMgr;
031import org.apache.tika.config.TikaConfig;
032import org.apache.tika.detect.Detector;
033import org.apache.tika.io.TikaInputStream;
034import org.apache.tika.metadata.Metadata;
035import org.apache.tika.mime.MimeType;
036import org.apache.tika.mime.MimeTypeException;
037import org.apache.tika.mime.MimeTypes;
038import org.fcrepo.migration.ContentDigest;
039import org.fcrepo.migration.DatastreamVersion;
040import org.fcrepo.migration.FedoraObjectVersionHandler;
041import org.fcrepo.migration.MigrationType;
042import org.fcrepo.migration.ObjectInfo;
043import org.fcrepo.migration.ObjectVersionReference;
044import org.fcrepo.migration.ResourceMigrationType;
045import org.fcrepo.storage.ocfl.InteractionModel;
046import org.fcrepo.storage.ocfl.OcflObjectSession;
047import org.fcrepo.storage.ocfl.OcflObjectSessionFactory;
048import org.fcrepo.storage.ocfl.ResourceHeaders;
049import org.fcrepo.storage.ocfl.ResourceHeadersVersion;
050import org.fcrepo.storage.ocfl.exception.NotFoundException;
051import org.slf4j.Logger;
052
053import java.io.BufferedInputStream;
054import java.io.ByteArrayInputStream;
055import java.io.ByteArrayOutputStream;
056import java.io.IOException;
057import java.io.InputStream;
058import java.io.UncheckedIOException;
059import java.net.URI;
060import java.nio.charset.StandardCharsets;
061import java.nio.file.Files;
062import java.security.DigestInputStream;
063import java.security.MessageDigest;
064import java.security.NoSuchAlgorithmException;
065import java.time.Instant;
066import java.time.OffsetDateTime;
067import java.time.ZoneOffset;
068import java.util.ArrayList;
069import java.util.HashMap;
070import java.util.HashSet;
071import java.util.Map;
072import java.util.Set;
073import java.util.concurrent.atomic.AtomicBoolean;
074
075import static org.slf4j.LoggerFactory.getLogger;
076
077/**
078 * Writes a Fedora object as a single ArchiveGroup.
079 * <p>
080 * All datastreams and object metadata from a fcrepo3 object are persisted to a
081 * single OCFL object (ArchiveGroup in fcrepo6 parlance).
082 * </p>
083 * <p>
084 * The contents of each datastream are written verbatim. No attempt is made to
085 * re-write the RELS-EXT to replace subjects and objects with their LDP
086 * counterparts.
087 * </p>
 * <p>
 * Note: fedora-specific OCFL serialization features (such as redirects,
 * container metadata, etc.) are not fully defined yet, so they are not included here.
 * </p>
 *
092 * @author apb@jhu.edu
093 */
094public class ArchiveGroupHandler implements FedoraObjectVersionHandler {
095
    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);

    // Root of the Fedora resource tree; also the default id prefix (matching Fedora 3 pids)
    private static final String FCREPO_ROOT = "info:fedora/";
    // Suffix appended to a binary's id to address its RDF description resource
    private static final String FCRMETA_SUFFIX = "/fcr:metadata";

    // Maps fcrepo3 external control groups ("E", "R") to fcrepo6 external content handling modes
    private static final Map<String, String> externalHandlingMap = Map.of(
            "E", "proxy",
            "R", "redirect"
    );

    // fcrepo3 control group code for inline XML datastreams
    private static final String INLINE_XML = "X";

    // fcrepo3 datastream state codes
    private static final String DS_INACTIVE = "I";
    private static final String DS_DELETED = "D";

    // fcrepo3 object property holding the object's state
    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
    // RELS-INT predicate carrying a datastream's download filename
    private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename";
    // fcrepo3 object state values
    private static final String OBJ_INACTIVE = "Inactive";
    private static final String OBJ_DELETED = "Deleted";

    // Reserved fcrepo3 datastream ids that carry relationship (RDF/XML) metadata
    private static final String RELS_EXT = "RELS-EXT";
    private static final String RELS_INT = "RELS-INT";

    private final OcflObjectSessionFactory sessionFactory;
    // true if datastream filenames should get an extension derived from their mime type
    private final boolean addDatastreamExtensions;
    // true if inactive objects/datastreams should be migrated as deleted
    private final boolean deleteInactive;
    // true if the FOXML should be migrated as a whole file instead of property files
    private final boolean foxmlFile;
    private final MigrationType migrationType;
    private final ResourceMigrationType resourceMigrationType;
    // username associated with the migrated resources
    private final String user;
    // prefix prepended to the Fedora 3 pid to form the Fedora 6 object id
    private final String idPrefix;
    // true to migrate only the head version of each resource
    private final boolean headOnly;
    // true to skip validating datastream content against its Fedora 3 checksum
    private final boolean disableChecksumValidation;
130
131    /**
132     * Create an ArchiveGroupHandler,
133     *
134     * @param sessionFactory
135     *        OCFL session factory
136     * @param migrationType
137     *        the type of migration to do
138     * @param resourceMigrationType
139     *        how resources should be migrated
140     * @param addDatastreamExtensions
141     *        true if datastreams should be written with file extensions
142     * @param deleteInactive
143     *        true if inactive objects and datastreams should be migrated as deleted
144     * @param foxmlFile
145     *        true if foxml file should be migrated as a whole file, instead of creating property files
146     * @param user
147     *        the username to associated with the migrated resources
148     * @param idPrefix
149     *        the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3)
150     * @param headOnly
151     *        flag to enable head only migrations
152     * @param disableChecksumValidation
153     */
154    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
155                               final MigrationType migrationType,
156                               final ResourceMigrationType resourceMigrationType,
157                               final boolean addDatastreamExtensions,
158                               final boolean deleteInactive,
159                               final boolean foxmlFile,
160                               final String user,
161                               final String idPrefix,
162                               final boolean headOnly,
163                               final boolean disableChecksumValidation) {
164        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
165        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
166        this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType,
167                "resourceMigrationType cannot be null");
168        this.addDatastreamExtensions = addDatastreamExtensions;
169        this.deleteInactive = deleteInactive;
170        this.foxmlFile = foxmlFile;
171        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
172        this.idPrefix = idPrefix;
173        this.headOnly = headOnly;
174        this.disableChecksumValidation = disableChecksumValidation;
175        try {
176            this.mimeDetector = new TikaConfig().getDetector();
177        } catch (Exception e) {
178            throw new RuntimeException(e);
179        }
180    }
181
    /**
     * Migrates every version of a single fcrepo3 object into one Fedora 6 object.
     * <p>
     * For each fcrepo3 version: writes changed datastream binaries, defers RDF resources
     * (object container and binary descriptions) until after the binaries so that RELS-EXT /
     * RELS-INT in the same version can affect them, then commits one OCFL version — unless
     * {@code headOnly} is set, in which case a single session is reused and committed once at
     * the end. Finally, deleted/inactive resources are handled in an extra version.
     *
     * @param versions the fcrepo3 versions, in order (first version flagged via isFirstVersion)
     * @param objectInfo pid and FOXML location for the object being migrated
     */
    @Override
    public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) {
        // We use the PID to identify the OCFL object
        final String objectId = objectInfo.getPid();
        final String f6ObjectId = idPrefix + objectId;

        // We need to manually keep track of the datastream creation dates
        final Map<String, String> dsCreateDates = new HashMap<>();

        String objectState = null;
        OffsetDateTime objectCreation = null;
        OcflObjectSession objectSession = null;

        // fcrepo3 state per f6 datastream id; consumed by handleDeletedResources at the end
        final Map<String, String> datastreamStates = new HashMap<>();
        // tracks the triples used to create containers and binary descriptions
        final Map<String, MetaHolder> metaMap = new HashMap<>();
        // tracks info about binary resources needed to construct filenames
        final Map<String, BinaryMeta> binaryMeta = new HashMap<>();
        // tracks filenames pulled from RELS-INT
        final Map<String, String> filenameMap = new HashMap<>();

        for (var ov : versions) {
            // tracks the binary descriptions that need to be written
            final Set<String> toWrite = new HashSet<>();
            // tracks the binaries that need their filename updated base on RELS-INT
            final Set<String> relsFilenameUpdates = new HashSet<>();
            // tracks the binaries that need their filename updated based on a RELS-INT removal
            final Map<String, String> relsDeletedFilenames = new HashMap<>();

            // reuse the objectSession when headOnly is set
            objectSession = (objectSession == null || !headOnly) ? newSession(f6ObjectId) : objectSession;

            if (ov.isFirstVersion()) {
                // Guard against re-migrating an object that already exists in the target repo
                if (objectSession.containsResource(f6ObjectId)) {
                    throw new RuntimeException(f6ObjectId + " already exists!");
                }
                objectCreation = OffsetDateTime.parse(ov.getVersionDate());
                objectState = getObjectState(ov, objectId);
                // Object properties are written only once (as fcrepo3 object properties were unversioned).
                if (foxmlFile) {
                    // Whole-file FOXML migration: store the FOXML as a NON_RDF child resource
                    try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) {
                        final var foxmlDsId = f6ObjectId + "/FOXML";
                        final var headers = createHeaders(foxmlDsId, f6ObjectId,
                                InteractionModel.NON_RDF).build();
                        objectSession.writeResource(headers, is);
                        //mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources()
                        datastreamStates.put(foxmlDsId, DS_DELETED);
                    } catch (IOException io) {
                        LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io);
                        throw new UncheckedIOException(io);
                    }
                } else {
                    // Property-file migration: the object container carries the fcrepo3 properties as triples
                    final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
                    final var content = getObjTriples(ov, objectId);
                    final var meta = MetaHolder.fromContent(content, objectHeaders);
                    metaMap.put(f6ObjectId, meta);
                    objectSession.writeResource(meta.headers.build(), meta.constructTriples());
                }
            }

            final var datastreamSessions = new HashMap<String, OcflObjectSession>();

            // Write datastreams and their metadata
            for (var dv : ov.listChangedDatastreams()) {
                final var mimeType = resolveMimeType(dv);
                final String dsId = dv.getDatastreamInfo().getDatastreamId();
                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId);
                final var datastreamFilename = lastPartFromId(f6DsId);

                final var datastreamSession = datastreamSession(f6DsId, objectSession);
                datastreamSessions.putIfAbsent(f6DsId, datastreamSession);

                if (dv.isFirstVersionIn(ov.getObject())) {
                    // Creation date and state are only available on the first version of a datastream
                    dsCreateDates.put(dsId, dv.getCreated());
                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
                }

                final var createDate = dsCreateDates.get(dsId);

                final var filename = resolveFilename(datastreamFilename,
                        dv.getLabel(), filenameMap.get(f6DsId), mimeType);

                // A rewritten datastream in this version supersedes any pending RELS-INT filename removal
                relsDeletedFilenames.remove(f6DsId);

                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
                        filename, mimeType, createDate);

                // Remember name/label/mime so a later RELS-INT removal can re-derive the filename
                binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel()));

                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
                    InputStream content = null;
                    // for plain OCFL migrations, write a file containing the external/redirect URL
                    if (migrationType == MigrationType.PLAIN_OCFL) {
                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8);
                    }
                    datastreamSession.writeResource(datastreamHeaders, content);
                } else {
                    // Managed/inline content: stream it in (with digest validation, unless disabled)
                    try (var contentStream = dv.getContent()) {
                        writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }

                if (!foxmlFile) {
                    // Queue the binary description (fcr:metadata) for deferred writing
                    final var f6DescId = f6DescriptionId(f6DsId);
                    final var descriptionHeaders = createDescriptionHeaders(f6DsId,
                            datastreamHeaders);
                    final var descriptionTriples = getDsTriples(dv, f6DsId, createDate);
                    metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder())
                            .setHeaders(descriptionHeaders)
                            .setContentTriples(descriptionTriples);
                    toWrite.add(f6DescId);

                    if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) {
                        final var triples = parseRdfXml(dv);
                        if (RELS_EXT.equals(dsId)) {
                            // RELS-EXT triples are merged into the object container
                            metaMap.get(f6ObjectId).setRelsTriples(triples);
                            toWrite.add(f6ObjectId);
                        } else {
                            // RELS-INT triples are split per referenced datastream and merged
                            // into each datastream's description
                            final Map<String, Model> splitModels = splitRelsInt(triples);
                            final var oldIds = new HashSet<>(filenameMap.keySet());
                            filenameMap.clear();

                            splitModels.forEach((id, model) -> {
                                final var descId = f6DescriptionId(id);
                                metaMap.computeIfAbsent(descId, k -> new MetaHolder())
                                        .setRelsTriples(model);
                                toWrite.add(descId);

                                // Check to see if there are any file names that need updated
                                for (final var it = model.listStatements(); it.hasNext(); ) {
                                    final var statement = it.next();
                                    if (DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) {
                                        filenameMap.put(id, statement.getObject().toString());
                                        relsFilenameUpdates.add(id);
                                        break;
                                    }
                                }
                            });

                            // The filename was set once but is no longer
                            final var deleted = Sets.difference(oldIds, filenameMap.keySet());
                            deleted.forEach(id -> {
                                final var meta = binaryMeta.get(id);
                                if (meta != null) {
                                    // Re-derive the filename without the RELS-INT download name
                                    relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label,
                                            null, meta.mimeType));
                                }
                            });
                        }
                    }
                }
            }

            // Deferred writes: RDF resources, then any header-only filename changes
            writeMeta(toWrite, metaMap, objectSession, datastreamSessions);
            updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions);

            if (!headOnly) {
                LOGGER.debug("Committing object <{}>", f6ObjectId);

                // Each fcrepo3 version becomes one OCFL version, stamped with the fcrepo3 version date
                final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate());

                objectSession.versionCreationTimestamp(creationTimestamp);
                objectSession.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    // Atomic migrations keep each datastream in its own OCFL object; commit them too
                    datastreamSessions.forEach((id, session) -> {
                        LOGGER.debug("Committing object <{}>", id);
                        session.versionCreationTimestamp(creationTimestamp);
                        session.commit();
                    });
                }
            }
        }

        handleDeletedResources(f6ObjectId, objectState, datastreamStates, objectSession);

        // final commit when headOnly is set
        if (headOnly && objectSession != null) {
            LOGGER.debug("Committing object <{}>", f6ObjectId);
            objectSession.versionCreationTimestamp(objectCreation);
            objectSession.commit();
        }
    }
367
368    /**
369     * Resolves the filename of the datastream based on the following precedence:
370     *
371     * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT
372     * 2. LABEL from datastream meta
373     * 3. Name of the datastream
374     *
375     * If extensions should be added, then an extension is picked based on the mime type. If the filename already
376     * includes a `.` then no extension is added.
377     *
378     * @param dsName the name of the datastream
379     * @param labelName the datastream's label
380     * @param downloadName the download name from RELS-INT
381     * @param mimeType the datastream's mime type
382     * @return the resolved filename
383     */
384    private String resolveFilename(final String dsName,
385                                   final String labelName,
386                                   final String downloadName,
387                                   final String mimeType) {
388        String filename;
389        if (StringUtils.isNotBlank(downloadName)) {
390            filename = downloadName;
391        } else if (StringUtils.isNotBlank(labelName)) {
392            filename = labelName;
393        } else {
394            filename = dsName;
395        }
396
397        if (addDatastreamExtensions
398                && StringUtils.isNotBlank(mimeType)
399                && !filename.contains(".")) {
400            filename += getExtension(mimeType);
401        }
402
403        return filename;
404    }
405
406    /**
407     * RDF resources are written after writing all other binaries in the version because they can be affected by
408     * RELS-INT or RELS-EXT updates.
409     *
410     * @param toWrite the set of resources that should be written to this version
411     * @param metaMap the map of all known rdf resources
412     * @param objectSession the ocfl session for the object
413     * @param datastreamSessions the ocfl sessions for the datastreams
414     */
415    private void writeMeta(final Set<String> toWrite,
416                           final Map<String, MetaHolder> metaMap,
417                           final OcflObjectSession objectSession,
418                           final Map<String, OcflObjectSession> datastreamSessions) {
419        for (final var id : toWrite) {
420            final var meta = metaMap.get(id);
421
422            if (meta.headers == null) {
423                // This only happens if there's a RELS-INT that references a datastream before it exists.
424                // Skip for now. The triples will be added once the datastream exists.
425                continue;
426            }
427
428            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
429                    k -> datastreamSession(k, objectSession));
430
431            // Need to copy over the memento created date from the existing headers because it may have been updated
432            // when a description's binary was updated
433            if (migrationType == MigrationType.FEDORA_OCFL) {
434                try {
435                    final var existingHeaders = session.readHeaders(id);
436                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
437                } catch (NotFoundException e) {
438                    // this just means the resource hasn't been written yet
439                }
440            }
441            session.writeResource(meta.headers.build(), meta.constructTriples());
442        }
443    }
444
445    private void updateFilenames(final Set<String> toUpdate,
446                                 final Map<String, String> filenameMap,
447                                 final Map<String, String> relsDeletedFilenames,
448                                 final OcflObjectSession objectSession,
449                                 final Map<String, OcflObjectSession> datastreamSessions) {
450        if (migrationType == MigrationType.FEDORA_OCFL) {
451            toUpdate.forEach(id -> {
452                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
453                                                                       k -> datastreamSession(k, objectSession));
454                final var origHeaders = session.readHeaders(id);
455                final var filename = filenameMap.get(id);
456                if (StringUtils.isNotBlank(filename)) {
457                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
458                    session.writeHeaders(newHeaders);
459                }
460            });
461            relsDeletedFilenames.forEach((id, filename) -> {
462                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
463                                                                       k -> datastreamSession(k, objectSession));
464                final var origHeaders = session.readHeaders(id);
465                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
466                session.writeHeaders(newHeaders);
467            });
468        }
469    }
470
471    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
472        return f3Digest != null && StringUtils.isNotBlank(f3Digest.getType()) &&
473                StringUtils.isNotBlank(f3Digest.getDigest());
474    }
475
476    private void writeDatastreamContent(final DatastreamVersion dv,
477                                        final ResourceHeaders datastreamHeaders,
478                                        final InputStream contentStream,
479                                        final OcflObjectSession session) throws IOException {
480        if (disableChecksumValidation) {
481            session.writeResource(datastreamHeaders, contentStream);
482            return;
483        }
484        final var f3Digest = dv.getContentDigest();
485        final var ocflObjectId = session.ocflObjectId();
486        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
487        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
488        if (fedora3DigestValid(f3Digest)) {
489            try {
490                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
491                if (migrationType == MigrationType.PLAIN_OCFL) {
492                    session.writeResource(datastreamHeaders, contentStream);
493                } else {
494                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
495                        session.writeResource(datastreamHeaders, digestStream);
496                        final var expectedDigest = f3Digest.getDigest();
497                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
498                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
499                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
500                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
501                            throw new RuntimeException(msg);
502                        }
503                    }
504                }
505            } catch (final NoSuchAlgorithmException e) {
506                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
507                        ocflObjectId, datastreamId, f3Digest.getType());
508                LOGGER.warn(msg);
509                session.writeResource(datastreamHeaders, contentStream);
510            }
511        } else {
512            if (datastreamControlGroup.equalsIgnoreCase("M")) {
513                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
514                        ocflObjectId, datastreamId);
515                LOGGER.warn(msg);
516            }
517            session.writeResource(datastreamHeaders, contentStream);
518        }
519    }
520
    /**
     * Applies fcrepo3 deletion semantics after all versions have been migrated. When the object
     * was Deleted — or Inactive with deleteInactive enabled — the object and all of its
     * datastreams are deleted; otherwise only individually Deleted/Inactive datastreams are.
     * Deletions are committed as one final version (unless headOnly, where the caller commits).
     *
     * @param f6ObjectId the Fedora 6 object id
     * @param objectState the fcrepo3 object state value (e.g. "Deleted", "Inactive")
     * @param datastreamStates fcrepo3 state code per Fedora 6 datastream id
     * @param objectSession the in-flight session, reused only when headOnly is set
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates,
                                        final OcflObjectSession objectSession) {
        // headOnly reuses the caller's session; otherwise deletions get their own session/version
        final OcflObjectSession session = headOnly ? objectSession : newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                // Object-level deletion cascades to every datastream
                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                // Object survives; delete only the datastreams whose own state requires it
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (!headOnly && hasDeletes.get()) {
                // Deletions are stamped with the migration time, not a fcrepo3 version date
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else if (!headOnly) {
                // Nothing was deleted: discard the session(s) opened for this step
                session.abort();
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.abort();
                    });
                }
            }
        } catch (RuntimeException e) {
            // NOTE(review): when headOnly is set this aborts the caller's shared session, and
            // datastream sessions are not aborted here — confirm both are intended.
            session.abort();
            throw e;
        }
    }
580
581    private String f6DescriptionId(final String f6ResourceId) {
582        return f6ResourceId + FCRMETA_SUFFIX;
583    }
584
585    private String lastPartFromId(final String id) {
586        return id.substring(id.lastIndexOf('/') + 1);
587    }
588
589    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) {
590        return f6ObjectId + "/" + datastreamId;
591    }
592
593    private ResourceHeaders.Builder createHeaders(final String id,
594                                                  final String parentId,
595                                                  final InteractionModel model) {
596        final var headers = ResourceHeaders.builder();
597        headers.withHeadersVersion(ResourceHeadersVersion.V1_0);
598        headers.withId(id);
599        headers.withParent(parentId);
600        headers.withInteractionModel(model.getUri());
601        return headers;
602    }
603
604    private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
605        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
606        headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL);
607        headers.withObjectRoot(true);
608        headers.withLastModifiedBy(user);
609        headers.withCreatedBy(user);
610
611        ov.getObjectProperties().listProperties().forEach(p -> {
612            if (p.getName().contains("lastModifiedDate")) {
613                final var lastModified = Instant.parse(p.getValue());
614                headers.withLastModifiedDate(lastModified);
615                headers.withMementoCreatedDate(lastModified);
616                headers.withStateToken(DigestUtils.md5Hex(
617                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
618            } else if (p.getName().contains("createdDate")) {
619                headers.withCreatedDate(Instant.parse(p.getValue()));
620            }
621        });
622
623        return headers;
624    }
625
    /**
     * Builds the complete resource headers for one datastream version (migrated as a NonRdfSource).
     *
     * @param dv the Fedora 3 datastream version being migrated
     * @param f6DsId the Fedora 6 id of the datastream resource
     * @param f6ObjectId the Fedora 6 id of the parent object (also the archival group id, when archival)
     * @param filename filename to record for the binary
     * @param mime resolved mime type of the content
     * @param createDate creation timestamp of the datastream (parseable by {@link Instant#parse})
     * @return fully built resource headers
     */
    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
                                                    final String f6DsId,
                                                    final String f6ObjectId,
                                                    final String filename,
                                                    final String mime,
                                                    final String createDate) {
        // The version's creation date in F3 is the last-modified date of the resource in F6
        final var lastModified = Instant.parse(dv.getCreated());
        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(f6ObjectId);
        }
        headers.withFilename(filename);
        headers.withCreatedDate(Instant.parse(createDate));
        headers.withLastModifiedDate(lastModified);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);
        headers.withMementoCreatedDate(lastModified);

        // Datastreams in externally handled control groups (keys of externalHandlingMap) keep
        // their content at the original external/redirect URL rather than being copied in
        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
            headers.withExternalHandling(
                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
            headers.withExternalUrl(dv.getExternalOrRedirectURL());
        }

        headers.withArchivalGroup(false);
        headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC);
        // Content size is omitted for inline XML — presumably the F3-reported size does not
        // match the extracted bytes (TODO confirm) — and when F3 reported no size (-1)
        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
            headers.withContentSize(dv.getSize());
        }

        // Record the F3 checksum as a "urn:<algorithm>:<hex>" digest URI, unless it is the
        // literal string "none" (F3's marker for no checksum)
        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
            if (!dv.getContentDigest().getDigest().equals("none")) {
                final var digest = dv.getContentDigest();
                final var digests = new ArrayList<URI>();
                digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" +
                            digest.getDigest().toLowerCase()));
                headers.withDigests(digests);
            } else {
                LOGGER.warn("Digest content 'none' found. Not adding to header");
            }
        }

        headers.withMimeType(mime);
        // State token is the uppercased md5 of the last-modified time in millis
        headers.withStateToken(DigestUtils.md5Hex(
                String.valueOf(lastModified.toEpochMilli())).toUpperCase());

        return headers.build();
    }
674
675    private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId,
676                                                             final ResourceHeaders datastreamHeaders) {
677        final var id = f6DescriptionId(f6DsId);
678        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);
679
680        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
681            headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId());
682        }
683        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
684        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
685        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
686        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
687        headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate());
688
689        headers.withArchivalGroup(false);
690        headers.withObjectRoot(false);
691        headers.withStateToken(datastreamHeaders.getStateToken());
692
693        return headers;
694    }
695
696    private String resolveMimeType(final DatastreamVersion dv) {
697        String mime = dv.getMimeType();
698
699        if (Strings.isNullOrEmpty(mime)) {
700            final var meta = new Metadata();
701            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
702            try (var content = TikaInputStream.get(dv.getContent())) {
703                mime = mimeDetector.detect(content, meta).toString();
704            } catch (IOException e) {
705                throw new UncheckedIOException(e);
706            }
707        }
708
709        return mime;
710    }
711
712    private void deleteDatastream(final String id,
713                                  final Instant lastModified,
714                                  final OcflObjectSession session) {
715        if (migrationType == MigrationType.PLAIN_OCFL) {
716            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
717            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
718        } else {
719            deleteF6MigratedResource(id, lastModified, session);
720            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
721        }
722    }
723
724    private void deleteF6MigratedResource(final String id,
725                                          final Instant lastModified,
726                                          final OcflObjectSession session) {
727        LOGGER.debug("Deleting resource {}", id);
728        final var headers = session.readHeaders(id);
729        session.deleteContentFile(ResourceHeaders.builder(headers)
730                .withDeleted(true)
731                .withLastModifiedDate(lastModified)
732                .withMementoCreatedDate(lastModified)
733                .build());
734    }
735
736    private void deleteOcflMigratedResource(final String id,
737                                            final InteractionModel interactionModel,
738                                            final OcflObjectSession session) {
739        LOGGER.debug("Deleting resource {}", id);
740        session.deleteContentFile(ResourceHeaders.builder()
741                .withId(id)
742                .withInteractionModel(interactionModel.getUri())
743                .build());
744    }
745
746    private String getObjectState(final ObjectVersionReference ov, final String pid) {
747        return ov.getObjectProperties().listProperties().stream()
748                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
749                .findFirst()
750                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
751                        pid)))
752                .getValue();
753    }
754
755    // Get object-level triples
756    private static Model getObjTriples(final ObjectVersionReference o, final String pid) {
757        final Model triples = ModelFactory.createDefaultModel();
758        final String uri = "info:fedora/" + pid;
759
760        o.getObjectProperties().listProperties().forEach(p -> {
761            if (p.getName().contains("Date")) {
762                addDateLiteral(triples, uri, p.getName(), p.getValue());
763            } else {
764                addStringLiteral(triples, uri, p.getName(), p.getValue());
765            }
766        });
767
768        return triples;
769    }
770
    /**
     * Builds the datastream-level RDF triples for a datastream version.
     *
     * For plain-OCFL migrations this also materializes triples that Fedora 6 would
     * otherwise manage server-side (created/modified dates, identifier, mime type,
     * size, and checksum). Label, state, and format URI are always added; the
     * add*Literal helpers silently skip null/absent values.
     *
     * @param dv the datastream version to describe
     * @param f6DsId the Fedora 6 id used as the triples' subject
     * @param createDate creation timestamp of the datastream
     * @return a model holding the datastream-level triples
     */
    private Model getDsTriples(final DatastreamVersion dv,
                                            final String f6DsId,
                                            final String createDate) {
        final Model triples = ModelFactory.createDefaultModel();

        if (migrationType == MigrationType.PLAIN_OCFL) {
            // These triples are server managed in F6
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#created",
                    createDate);
            // in F3 the version's creation date is the resource's modification date
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#lastModified",
                    dv.getCreated());
            addStringLiteral(triples,
                    f6DsId,
                    "http://purl.org/dc/terms/identifier",
                    dv.getDatastreamInfo().getDatastreamId());
            addStringLiteral(triples,
                    f6DsId,
                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
                    dv.getMimeType());
            // skipped by addLongLiteral when the size is the -1 "unknown" sentinel
            addLongLiteral(triples,
                    f6DsId,
                    "http://www.loc.gov/premis/rdf/v1#size",
                    dv.getSize());

            // checksum recorded as a "urn:<algorithm>:<hex>" digest URI string
            if (dv.getContentDigest() != null) {
                addStringLiteral(triples,
                        f6DsId,
                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
                                dv.getContentDigest().getDigest().toLowerCase());
            }
        }

        addStringLiteral(triples,
                f6DsId,
                "http://purl.org/dc/terms/title",
                dv.getLabel());
        addStringLiteral(triples,
                f6DsId,
                "http://fedora.info/definitions/1/0/access/objState",
                dv.getDatastreamInfo().getState());
        addStringLiteral(triples,
                f6DsId,
                "http://www.loc.gov/premis/rdf/v1#formatDesignation",
                dv.getFormatUri());

        return triples;
    }
824
825    private static void addStringLiteral(final Model m,
826                                         final String s,
827                                         final String p,
828                                         final String o) {
829        if (o != null) {
830            m.add(m.createResource(s), m.createProperty(p), o);
831        }
832    }
833
834    private static void addDateLiteral(final Model m,
835                                       final String s,
836                                       final String p,
837                                       final String date) {
838        if (date != null) {
839            m.addLiteral(m.createResource(s),
840                         m.createProperty(p),
841                         m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
842        }
843    }
844
845    private static void addLongLiteral(final Model m,
846                                       final String s,
847                                       final String p,
848                                       final long number) {
849        if (number != -1) {
850            m.addLiteral(m.createResource(s),
851                    m.createProperty(p),
852                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
853        }
854    }
855
856    /**
857     * @param mime any mimetype as String
858     * @return extension associated with arg mime, return includes '.' in extension (.txt).
859     *                  ..Empty String if unrecognized mime
860     */
861    private static String getExtension(final String mime) {
862        final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
863        MimeType type;
864        try {
865            type = allTypes.forName(mime);
866        } catch (final MimeTypeException e) {
867            type = null;
868        }
869
870        if (type != null) {
871            return type.getExtension();
872        }
873
874        LOGGER.warn("No mimetype found for '{}'", mime);
875        return "";
876    }
877
878    private Model parseRdfXml(final DatastreamVersion datastreamVersion) {
879        final var model = ModelFactory.createDefaultModel();
880        try (final var is = datastreamVersion.getContent()) {
881            RDFDataMgr.read(model, is, Lang.RDFXML);
882            return model;
883        } catch (Exception e) {
884            throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s",
885                    datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(),
886                    datastreamVersion.getDatastreamInfo().getDatastreamId()), e);
887        }
888    }
889
890    private Map<String, Model> splitRelsInt(final Model relsIntModel) {
891        final Map<String, Model> splitModels = new HashMap<>();
892        for (final var it = relsIntModel.listStatements(); it.hasNext();) {
893            final var statement = it.next();
894            final var id = statement.getSubject().getURI();
895            final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel());
896            model.add(statement);
897        }
898        return splitModels;
899    }
900
901    /**
902     * Creates a new session for the datastream when migrating as atomic resources, or returns the object session,
903     * when migrating as archival groups.
904     *
905     * @param id the datastream's id in fedora 6
906     * @param objectSession the datastream's object session
907     * @return either a new datastream session or the object session
908     */
909    private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) {
910        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
911            return objectSession;
912        } else {
913            return newSession(id);
914        }
915    }
916
    /**
     * Opens a new OCFL object session for the given id, wrapped in an
     * {@link OcflObjectSessionWrapper}.
     *
     * @param id the resource id to open the session for
     * @return the wrapped session
     */
    private OcflObjectSession newSession(final String id) {
        return new OcflObjectSessionWrapper(sessionFactory.newSession(id));
    }
920
921    /**
922     * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content
923     * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from
924     * one of the RELS-* files. They are maintained separately because it's possible for them to be updated
925     * independently and we need to be able to construct the correct set of triples when one changes.
926     */
927    private static class MetaHolder {
928        Model contentTriples;
929        Model relsTriples;
930        ResourceHeaders.Builder headers;
931
932        public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) {
933            return new MetaHolder(contentTriples, null, headers);
934        }
935
936        private MetaHolder() {
937        }
938
939        private MetaHolder(final Model contentTriples,
940                           final Model relsTriples,
941                           final ResourceHeaders.Builder headers) {
942            this.contentTriples = contentTriples;
943            this.relsTriples = relsTriples;
944            this.headers = headers;
945        }
946
947        /**
948         * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples.
949         *
950         * @return n-triples input stream
951         */
952        public InputStream constructTriples() {
953            final var output = new ByteArrayOutputStream();
954            final var triples = ModelFactory.createDefaultModel();
955
956            if (contentTriples != null) {
957                triples.add(contentTriples.listStatements());
958            }
959
960            if (relsTriples != null) {
961                triples.add(relsTriples.listStatements());
962            }
963
964            triples.write(output, Lang.NTRIPLES.getName());
965            return new ByteArrayInputStream(output.toByteArray());
966        }
967
968        public MetaHolder setHeaders(final ResourceHeaders.Builder headers) {
969            this.headers = headers;
970            return this;
971        }
972
973        public MetaHolder setContentTriples(final Model contentTriples) {
974            this.contentTriples = contentTriples;
975            return this;
976        }
977
978        public MetaHolder setRelsTriples(final Model relsTriples) {
979            this.relsTriples = relsTriples;
980            return this;
981        }
982    }
983
    /**
     * Immutable value holder describing a binary datastream: its name, mime type, and label.
     */
    private static class BinaryMeta {
        // the binary's name
        final String name;
        // the binary's mime type
        final String mimeType;
        // the binary's label
        final String label;

        public BinaryMeta(final String name, final String mimeType, final String label) {
            this.name = name;
            this.mimeType = mimeType;
            this.label = label;
        }
    }
995
996}