001/*
002 * Copyright 2019 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.fcrepo.migration.handlers.ocfl;
018
019import at.favre.lib.bytes.Bytes;
020import com.google.common.base.Preconditions;
021import com.google.common.base.Strings;
022import com.google.common.collect.Sets;
023import org.apache.commons.codec.digest.DigestUtils;
024import org.apache.commons.io.IOUtils;
025import org.apache.commons.lang3.StringUtils;
026import org.apache.jena.datatypes.xsd.XSDDatatype;
027import org.apache.jena.rdf.model.Model;
028import org.apache.jena.rdf.model.ModelFactory;
029import org.apache.jena.riot.Lang;
030import org.apache.jena.riot.RDFDataMgr;
031import org.apache.tika.config.TikaConfig;
032import org.apache.tika.detect.Detector;
033import org.apache.tika.io.TikaInputStream;
034import org.apache.tika.metadata.Metadata;
035import org.apache.tika.mime.MimeType;
036import org.apache.tika.mime.MimeTypeException;
037import org.apache.tika.mime.MimeTypes;
038import org.fcrepo.migration.ContentDigest;
039import org.fcrepo.migration.DatastreamVersion;
040import org.fcrepo.migration.FedoraObjectVersionHandler;
041import org.fcrepo.migration.MigrationType;
042import org.fcrepo.migration.ObjectInfo;
043import org.fcrepo.migration.ObjectVersionReference;
044import org.fcrepo.migration.ResourceMigrationType;
045import org.fcrepo.storage.ocfl.InteractionModel;
046import org.fcrepo.storage.ocfl.OcflObjectSession;
047import org.fcrepo.storage.ocfl.OcflObjectSessionFactory;
048import org.fcrepo.storage.ocfl.ResourceHeaders;
049import org.fcrepo.storage.ocfl.ResourceHeadersVersion;
050import org.fcrepo.storage.ocfl.exception.NotFoundException;
051import org.slf4j.Logger;
052
053import java.io.BufferedInputStream;
054import java.io.ByteArrayInputStream;
055import java.io.ByteArrayOutputStream;
056import java.io.IOException;
057import java.io.InputStream;
058import java.io.UncheckedIOException;
059import java.net.URI;
060import java.nio.charset.StandardCharsets;
061import java.nio.file.Files;
062import java.security.DigestInputStream;
063import java.security.MessageDigest;
064import java.security.NoSuchAlgorithmException;
065import java.time.Instant;
066import java.time.OffsetDateTime;
067import java.time.ZoneOffset;
068import java.util.ArrayList;
069import java.util.HashMap;
070import java.util.HashSet;
071import java.util.Map;
072import java.util.Set;
073import java.util.concurrent.atomic.AtomicBoolean;
074
075import static org.slf4j.LoggerFactory.getLogger;
076
077/**
078 * Writes a Fedora object as a single ArchiveGroup.
079 * <p>
080 * All datastreams and object metadata from a fcrepo3 object are persisted to a
081 * single OCFL object (ArchiveGroup in fcrepo6 parlance).
082 * </p>
083 * <p>
084 * The contents of each datastream are written verbatim. No attempt is made to
085 * re-write the RELS-EXT to replace subjects and objects with their LDP
086 * counterparts.
087 * </p>
 * <p>
 * Note: fedora-specific OCFL serialization features (such as redirects,
 * container metadata, etc.) are not fully defined yet, so they are not
 * included here.
 * </p>
 *
092 * @author apb@jhu.edu
093 */
094public class ArchiveGroupHandler implements FedoraObjectVersionHandler {
095
    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);

    // Prefix/suffix used when building Fedora 6 resource ids
    private static final String FCREPO_ROOT = "info:fedora/";
    private static final String FCRMETA_SUFFIX = "/fcr:metadata";

    // fcrepo3 control group codes ("E" external, "R" redirect) mapped to the
    // corresponding Fedora 6 external content handling strategies
    private static final Map<String, String> externalHandlingMap = Map.of(
            "E", "proxy",
            "R", "redirect"
    );

    // fcrepo3 control group code for inline XML datastreams
    private static final String INLINE_XML = "X";

    // fcrepo3 datastream state codes
    private static final String DS_INACTIVE = "I";
    private static final String DS_DELETED = "D";

    // fcrepo3 RDF predicates consulted during migration
    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
    private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename";
    // fcrepo3 object state values
    private static final String OBJ_INACTIVE = "Inactive";
    private static final String OBJ_DELETED = "Deleted";

    // Reserved fcrepo3 datastream ids that carry relationship RDF
    private static final String RELS_EXT = "RELS-EXT";
    private static final String RELS_INT = "RELS-INT";

    // Configuration supplied at construction time; see the constructor javadoc for semantics
    private final OcflObjectSessionFactory sessionFactory;
    private final boolean addDatastreamExtensions;
    private final boolean deleteInactive;
    private final boolean foxmlFile;
    private final MigrationType migrationType;
    private final ResourceMigrationType resourceMigrationType;
    private final String user;
    private final String idPrefix;
    // Tika detector used when resolving datastream mime types
    private final Detector mimeDetector;
    private final boolean disableChecksumValidation;
129
130    /**
131     * Create an ArchiveGroupHandler,
132     *
133     * @param sessionFactory
134     *        OCFL session factory
135     * @param migrationType
136     *        the type of migration to do
137     * @param resourceMigrationType
138     *        how resources should be migrated
139     * @param addDatastreamExtensions
140     *        true if datastreams should be written with file extensions
141     * @param deleteInactive
142     *        true if inactive objects and datastreams should be migrated as deleted
143     * @param foxmlFile
144     *        true if foxml file should be migrated as a whole file, instead of creating property files
145     * @param user
146     *        the username to associated with the migrated resources
147     * @param idPrefix
148     *        the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3)
149     * @param disableChecksumValidation
150     *        if true, migrator should not try to verify that the datastream content matches Fedora 3 checksums
151     */
152    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
153                               final MigrationType migrationType,
154                               final ResourceMigrationType resourceMigrationType,
155                               final boolean addDatastreamExtensions,
156                               final boolean deleteInactive,
157                               final boolean foxmlFile,
158                               final String user,
159                               final String idPrefix,
160                               final boolean disableChecksumValidation) {
161        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
162        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
163        this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType,
164                "resourceMigrationType cannot be null");
165        this.addDatastreamExtensions = addDatastreamExtensions;
166        this.deleteInactive = deleteInactive;
167        this.foxmlFile = foxmlFile;
168        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
169        this.idPrefix = idPrefix;
170        this.disableChecksumValidation = disableChecksumValidation;
171        try {
172            this.mimeDetector = new TikaConfig().getDetector();
173        } catch (Exception e) {
174            throw new RuntimeException(e);
175        }
176    }
177
    /**
     * Migrates all versions of a single fcrepo3 object into a single OCFL object (ArchiveGroup).
     * <p>
     * One OCFL version is committed per fcrepo3 object version. Object-level metadata is written
     * only on the first version (fcrepo3 object properties were unversioned). RDF resources are
     * written after the binaries of each version because RELS-EXT/RELS-INT updates can change
     * their triples. After all versions are committed, resources whose fcrepo3 state was Deleted
     * (or Inactive, when deleteInactive is set) are removed by handleDeletedResources.
     * </p>
     *
     * @param versions the fcrepo3 versions of the object, iterated in order
     * @param objectInfo identifying information about the fcrepo3 object (PID, FOXML location)
     */
    @Override
    public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) {
        // We use the PID to identify the OCFL object
        final String objectId = objectInfo.getPid();
        final String f6ObjectId = idPrefix + objectId;

        // We need to manually keep track of the datastream creation dates
        final Map<String, String> dsCreateDates = new HashMap<>();

        String objectState = null;
        final Map<String, String> datastreamStates = new HashMap<>();
        // tracks the triples used to create containers and binary descriptions
        final Map<String, MetaHolder> metaMap = new HashMap<>();
        // tracks info about binary resources needed to construct filenames
        final Map<String, BinaryMeta> binaryMeta = new HashMap<>();
        // tracks filenames pulled from RELS-INT
        final Map<String, String> filenameMap = new HashMap<>();

        for (var ov : versions) {
            // tracks the binary descriptions that need to be written
            final Set<String> toWrite = new HashSet<>();
            // tracks the binaries that need their filename updated based on RELS-INT
            final Set<String> relsFilenameUpdates = new HashSet<>();
            // tracks the binaries that need their filename updated based on a RELS-INT removal
            final Map<String, String> relsDeletedFilenames = new HashMap<>();

            final var objectSession = newSession(f6ObjectId);

            if (ov.isFirstVersion()) {
                // refuse to overwrite an existing OCFL object
                if (objectSession.containsResource(f6ObjectId)) {
                    throw new RuntimeException(f6ObjectId + " already exists!");
                }
                objectState = getObjectState(ov, objectId);
                // Object properties are written only once (as fcrepo3 object properties were unversioned).
                if (foxmlFile) {
                    // migrate the whole FOXML document as a single binary instead of property files
                    try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) {
                        final var foxmlDsId = f6ObjectId + "/FOXML";
                        final var headers = createHeaders(foxmlDsId, f6ObjectId,
                                InteractionModel.NON_RDF).build();
                        objectSession.writeResource(headers, is);
                        //mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources()
                        datastreamStates.put(foxmlDsId, DS_DELETED);
                    } catch (IOException io) {
                        LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io);
                        throw new UncheckedIOException(io);
                    }
                } else {
                    final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
                    final var content = getObjTriples(ov, objectId);
                    final var meta = MetaHolder.fromContent(content, objectHeaders);
                    metaMap.put(f6ObjectId, meta);
                    objectSession.writeResource(meta.headers.build(), meta.constructTriples());
                }
            }

            final var datastreamSessions = new HashMap<String, OcflObjectSession>();

            // Write datastreams and their metadata
            for (var dv : ov.listChangedDatastreams()) {
                final var mimeType = resolveMimeType(dv);
                final String dsId = dv.getDatastreamInfo().getDatastreamId();
                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId);
                final var datastreamFilename = lastPartFromId(f6DsId);

                final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                        k -> datastreamSession(f6DsId, objectSession));

                // record creation date and state the first time this datastream appears
                if (dv.isFirstVersionIn(ov.getObject())) {
                    dsCreateDates.put(dsId, dv.getCreated());
                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
                }
                final var createDate = dsCreateDates.get(dsId);

                final var filename = resolveFilename(datastreamFilename,
                        dv.getLabel(), filenameMap.get(f6DsId), mimeType);

                // a datastream rewritten in this version no longer needs a RELS-INT removal fixup
                relsDeletedFilenames.remove(f6DsId);

                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
                        filename, mimeType, createDate);

                binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel()));

                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
                    InputStream content = null;
                    // for plain OCFL migrations, write a file containing the external/redirect URL
                    if (migrationType == MigrationType.PLAIN_OCFL) {
                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8);
                    }
                    datastreamSession.writeResource(datastreamHeaders, content);
                } else {
                    try (var contentStream = dv.getContent()) {
                        writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }

                if (!foxmlFile) {
                    final var f6DescId = f6DescriptionId(f6DsId);
                    final var descriptionHeaders = createDescriptionHeaders(f6DsId,
                            datastreamHeaders);
                    final var descriptionTriples = getDsTriples(dv, f6DsId, createDate);
                    metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder())
                            .setHeaders(descriptionHeaders)
                            .setContentTriples(descriptionTriples);
                    toWrite.add(f6DescId);

                    if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) {
                        final var triples = parseRdfXml(dv);
                        if (RELS_EXT.equals(dsId)) {
                            // RELS-EXT triples are merged into the object container's RDF
                            metaMap.get(f6ObjectId).setRelsTriples(triples);
                            toWrite.add(f6ObjectId);
                        } else {
                            // RELS-INT triples are split per referenced datastream and merged
                            // into each datastream's description
                            final Map<String, Model> splitModels = splitRelsInt(triples);
                            final var oldIds = new HashSet<>(filenameMap.keySet());
                            filenameMap.clear();

                            splitModels.forEach((id, model) -> {
                                final var descId = f6DescriptionId(id);
                                metaMap.computeIfAbsent(descId, k -> new MetaHolder())
                                        .setRelsTriples(model);
                                toWrite.add(descId);

                                // Check to see if there are any file names that need updated
                                for (final var it = model.listStatements(); it.hasNext(); ) {
                                    final var statement = it.next();
                                    if (DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) {
                                        filenameMap.put(id, statement.getObject().toString());
                                        relsFilenameUpdates.add(id);
                                        break;
                                    }
                                }
                            });

                            // The filename was set once but is no longer
                            final var deleted = Sets.difference(oldIds, filenameMap.keySet());
                            deleted.forEach(id -> {
                                final var meta = binaryMeta.get(id);
                                if (meta != null) {
                                    relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label,
                                            null, meta.mimeType));
                                }
                            });
                        }
                    }
                }
            }

            // RDF resources are written last so RELS updates from this version are reflected
            writeMeta(toWrite, metaMap, objectSession, datastreamSessions);
            updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions);

            LOGGER.debug("Committing object <{}>", f6ObjectId);

            // the OCFL version timestamp mirrors the fcrepo3 version date
            final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate());

            objectSession.versionCreationTimestamp(creationTimestamp);
            objectSession.commit();

            if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                // atomic migrations place each datastream in its own OCFL object; commit them all
                datastreamSessions.forEach((id, session) -> {
                    LOGGER.debug("Committing object <{}>", id);
                    session.versionCreationTimestamp(creationTimestamp);
                    session.commit();
                });
            }
        }

        handleDeletedResources(f6ObjectId, objectState, datastreamStates);
    }
348
349    /**
350     * Resolves the filename of the datastream based on the following precedence:
351     *
352     * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT
353     * 2. LABEL from datastream meta
354     * 3. Name of the datastream
355     *
356     * If extensions should be added, then an extension is picked based on the mime type. If the filename already
357     * includes a `.` then no extension is added.
358     *
359     * @param dsName the name of the datastream
360     * @param labelName the datastream's label
361     * @param downloadName the download name from RELS-INT
362     * @param mimeType the datastream's mime type
363     * @return the resolved filename
364     */
365    private String resolveFilename(final String dsName,
366                                   final String labelName,
367                                   final String downloadName,
368                                   final String mimeType) {
369        String filename;
370        if (StringUtils.isNotBlank(downloadName)) {
371            filename = downloadName;
372        } else if (StringUtils.isNotBlank(labelName)) {
373            filename = labelName;
374        } else {
375            filename = dsName;
376        }
377
378        if (addDatastreamExtensions
379                && StringUtils.isNotBlank(mimeType)
380                && !filename.contains(".")) {
381            filename += getExtension(mimeType);
382        }
383
384        return filename;
385    }
386
387    /**
388     * RDF resources are written after writing all other binaries in the version because they can be affected by
389     * RELS-INT or RELS-EXT updates.
390     *
391     * @param toWrite the set of resources that should be written to this version
392     * @param metaMap the map of all known rdf resources
393     * @param objectSession the ocfl session for the object
394     * @param datastreamSessions the ocfl sessions for the datastreams
395     */
396    private void writeMeta(final Set<String> toWrite,
397                           final Map<String, MetaHolder> metaMap,
398                           final OcflObjectSession objectSession,
399                           final Map<String, OcflObjectSession> datastreamSessions) {
400        for (final var id : toWrite) {
401            final var meta = metaMap.get(id);
402
403            if (meta.headers == null) {
404                // This only happens if there's a RELS-INT that references a datastream before it exists.
405                // Skip for now. The triples will be added once the datastream exists.
406                continue;
407            }
408
409            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
410                    k -> datastreamSession(k, objectSession));
411
412            // Need to copy over the memento created date from the existing headers because it may have been updated
413            // when a description's binary was updated
414            if (migrationType == MigrationType.FEDORA_OCFL) {
415                try {
416                    final var existingHeaders = session.readHeaders(id);
417                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
418                } catch (NotFoundException e) {
419                    // this just means the resource hasn't been written yet
420                }
421            }
422            session.writeResource(meta.headers.build(), meta.constructTriples());
423        }
424    }
425
426    private void updateFilenames(final Set<String> toUpdate,
427                                 final Map<String, String> filenameMap,
428                                 final Map<String, String> relsDeletedFilenames,
429                                 final OcflObjectSession objectSession,
430                                 final Map<String, OcflObjectSession> datastreamSessions) {
431        if (migrationType == MigrationType.FEDORA_OCFL) {
432            toUpdate.forEach(id -> {
433                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
434                        k -> datastreamSession(k, objectSession));
435                final var origHeaders = session.readHeaders(id);
436                final var filename = filenameMap.get(id);
437                if (StringUtils.isNotBlank(filename)) {
438                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
439                    session.writeHeaders(newHeaders);
440                }
441            });
442            relsDeletedFilenames.forEach((id, filename) -> {
443                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
444                        k -> datastreamSession(k, objectSession));
445                final var origHeaders = session.readHeaders(id);
446                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
447                session.writeHeaders(newHeaders);
448            });
449        }
450    }
451
452    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
453        return f3Digest != null && StringUtils.isNotBlank(f3Digest.getType()) &&
454                StringUtils.isNotBlank(f3Digest.getDigest());
455    }
456
    /**
     * Writes a datastream's content to its OCFL session, optionally validating it against the
     * checksum recorded in Fedora 3.
     * <p>
     * Validation is skipped entirely when disableChecksumValidation is set. For plain OCFL
     * migrations the content is written without digest validation. When a valid Fedora 3 digest
     * exists and validation applies, a mismatch aborts with a RuntimeException; an unknown digest
     * algorithm or a missing digest is logged and the content is written anyway.
     * </p>
     *
     * @param dv the fcrepo3 datastream version being written
     * @param datastreamHeaders the headers to persist with the content
     * @param contentStream the datastream content; consumed exactly once by this method
     * @param session the OCFL session to write into
     * @throws IOException if reading or writing the content stream fails
     */
    private void writeDatastreamContent(final DatastreamVersion dv,
                                        final ResourceHeaders datastreamHeaders,
                                        final InputStream contentStream,
                                        final OcflObjectSession session) throws IOException {
        if (disableChecksumValidation) {
            session.writeResource(datastreamHeaders, contentStream);
            return;
        }
        final var f3Digest = dv.getContentDigest();
        final var ocflObjectId = session.ocflObjectId();
        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
        if (fedora3DigestValid(f3Digest)) {
            try {
                // created before any write so an unknown algorithm is caught up front
                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
                if (migrationType == MigrationType.PLAIN_OCFL) {
                    session.writeResource(datastreamHeaders, contentStream);
                } else {
                    // tee the content through a digest stream and compare against the fcrepo3 value
                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
                        session.writeResource(datastreamHeaders, digestStream);
                        final var expectedDigest = f3Digest.getDigest();
                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
                            throw new RuntimeException(msg);
                        }
                    }
                }
            } catch (final NoSuchAlgorithmException e) {
                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
                        ocflObjectId, datastreamId, f3Digest.getType());
                LOGGER.warn(msg);
                session.writeResource(datastreamHeaders, contentStream);
            }
        } else {
            // warning is limited to managed ("M") datastreams -- presumably other control groups
            // are not expected to carry digests; TODO confirm
            if (datastreamControlGroup.equalsIgnoreCase("M")) {
                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
                        ocflObjectId, datastreamId);
                LOGGER.warn(msg);
            }
            session.writeResource(datastreamHeaders, contentStream);
        }
    }
501
    /**
     * Writes a final OCFL version that removes resources whose fcrepo3 state requires it.
     * <p>
     * If the object's state is Deleted (or Inactive when deleteInactive is set), every datastream
     * and the object itself are deleted. Otherwise only datastreams whose own state is
     * Deleted/Inactive are deleted. If nothing needs deleting, the session is aborted so no empty
     * version is committed.
     * </p>
     *
     * @param f6ObjectId the Fedora 6 object id
     * @param objectState the fcrepo3 object state (may be null if it was never resolved)
     * @param datastreamStates map of Fedora 6 datastream id to fcrepo3 datastream state
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates) {
        final OcflObjectSession session = newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                // whole object is deleted: delete every datastream, then the object itself
                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                // object survives; delete only the datastreams flagged deleted/inactive
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (hasDeletes.get()) {
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else {
                // nothing deleted: abort so we do not create an empty version
                session.abort();
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.abort();
                    });
                }
            }
        } catch (RuntimeException e) {
            // NOTE(review): only the object session is aborted on failure; any datastream
            // sessions opened above are left un-aborted -- confirm this is intentional
            session.abort();
            throw e;
        }
    }
560
561    private String f6DescriptionId(final String f6ResourceId) {
562        return f6ResourceId + FCRMETA_SUFFIX;
563    }
564
565    private String lastPartFromId(final String id) {
566        return id.substring(id.lastIndexOf('/') + 1);
567    }
568
569    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) {
570        return f6ObjectId + "/" + datastreamId;
571    }
572
573    private ResourceHeaders.Builder createHeaders(final String id,
574                                                  final String parentId,
575                                                  final InteractionModel model) {
576        final var headers = ResourceHeaders.builder();
577        headers.withHeadersVersion(ResourceHeadersVersion.V1_0);
578        headers.withId(id);
579        headers.withParent(parentId);
580        headers.withInteractionModel(model.getUri());
581        return headers;
582    }
583
584    private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
585        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
586        headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL);
587        headers.withObjectRoot(true);
588        headers.withLastModifiedBy(user);
589        headers.withCreatedBy(user);
590
591        ov.getObjectProperties().listProperties().forEach(p -> {
592            if (p.getName().contains("lastModifiedDate")) {
593                final var lastModified = Instant.parse(p.getValue());
594                headers.withLastModifiedDate(lastModified);
595                headers.withMementoCreatedDate(lastModified);
596                headers.withStateToken(DigestUtils.md5Hex(
597                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
598            } else if (p.getName().contains("createdDate")) {
599                headers.withCreatedDate(Instant.parse(p.getValue()));
600            }
601        });
602
603        return headers;
604    }
605
606    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
607                                                    final String f6DsId,
608                                                    final String f6ObjectId,
609                                                    final String filename,
610                                                    final String mime,
611                                                    final String createDate) {
612        final var lastModified = Instant.parse(dv.getCreated());
613        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
614        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
615            headers.withArchivalGroupId(f6ObjectId);
616        }
617        headers.withFilename(filename);
618        headers.withCreatedDate(Instant.parse(createDate));
619        headers.withLastModifiedDate(lastModified);
620        headers.withLastModifiedBy(user);
621        headers.withCreatedBy(user);
622        headers.withMementoCreatedDate(lastModified);
623
624        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
625            headers.withExternalHandling(
626                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
627            headers.withExternalUrl(dv.getExternalOrRedirectURL());
628        }
629
630        headers.withArchivalGroup(false);
631        headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC);
632        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
633            headers.withContentSize(dv.getSize());
634        }
635
636        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
637            final var digest = dv.getContentDigest();
638            final var digests = new ArrayList<URI>();
639            digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" + digest.getDigest().toLowerCase()));
640            headers.withDigests(digests);
641        }
642
643        headers.withMimeType(mime);
644        headers.withStateToken(DigestUtils.md5Hex(
645                String.valueOf(lastModified.toEpochMilli())).toUpperCase());
646
647        return headers.build();
648    }
649
650    private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId,
651                                                             final ResourceHeaders datastreamHeaders) {
652        final var id = f6DescriptionId(f6DsId);
653        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);
654
655        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
656            headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId());
657        }
658        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
659        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
660        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
661        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
662        headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate());
663
664        headers.withArchivalGroup(false);
665        headers.withObjectRoot(false);
666        headers.withStateToken(datastreamHeaders.getStateToken());
667
668        return headers;
669    }
670
671    private String resolveMimeType(final DatastreamVersion dv) {
672        String mime = dv.getMimeType();
673
674        if (Strings.isNullOrEmpty(mime)) {
675            final var meta = new Metadata();
676            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
677            try (var content = TikaInputStream.get(dv.getContent())) {
678                mime = mimeDetector.detect(content, meta).toString();
679            } catch (IOException e) {
680                throw new UncheckedIOException(e);
681            }
682        }
683
684        return mime;
685    }
686
687    private void deleteDatastream(final String id,
688                                  final Instant lastModified,
689                                  final OcflObjectSession session) {
690        if (migrationType == MigrationType.PLAIN_OCFL) {
691            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
692            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
693        } else {
694            deleteF6MigratedResource(id, lastModified, session);
695            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
696        }
697    }
698
699    private void deleteF6MigratedResource(final String id,
700                                          final Instant lastModified,
701                                          final OcflObjectSession session) {
702        LOGGER.debug("Deleting resource {}", id);
703        final var headers = session.readHeaders(id);
704        session.deleteContentFile(ResourceHeaders.builder(headers)
705                .withDeleted(true)
706                .withLastModifiedDate(lastModified)
707                .withMementoCreatedDate(lastModified)
708                .build());
709    }
710
711    private void deleteOcflMigratedResource(final String id,
712                                            final InteractionModel interactionModel,
713                                            final OcflObjectSession session) {
714        LOGGER.debug("Deleting resource {}", id);
715        session.deleteContentFile(ResourceHeaders.builder()
716                .withId(id)
717                .withInteractionModel(interactionModel.getUri())
718                .build());
719    }
720
721    private String getObjectState(final ObjectVersionReference ov, final String pid) {
722        return ov.getObjectProperties().listProperties().stream()
723                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
724                .findFirst()
725                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
726                        pid)))
727                .getValue();
728    }
729
730    // Get object-level triples
731    private static Model getObjTriples(final ObjectVersionReference o, final String pid) {
732        final Model triples = ModelFactory.createDefaultModel();
733        final String uri = "info:fedora/" + pid;
734
735        o.getObjectProperties().listProperties().forEach(p -> {
736            if (p.getName().contains("Date")) {
737                addDateLiteral(triples, uri, p.getName(), p.getValue());
738            } else {
739                addStringLiteral(triples, uri, p.getName(), p.getValue());
740            }
741        });
742
743        return triples;
744    }
745
746    // Get datastream-level triples
747    private Model getDsTriples(final DatastreamVersion dv,
748                                            final String f6DsId,
749                                            final String createDate) {
750        final Model triples = ModelFactory.createDefaultModel();
751
752        if (migrationType == MigrationType.PLAIN_OCFL) {
753            // These triples are server managed in F6
754            addDateLiteral(triples,
755                    f6DsId,
756                    "http://fedora.info/definitions/v4/repository#created",
757                    createDate);
758            addDateLiteral(triples,
759                    f6DsId,
760                    "http://fedora.info/definitions/v4/repository#lastModified",
761                    dv.getCreated());
762            addStringLiteral(triples,
763                    f6DsId,
764                    "http://purl.org/dc/terms/identifier",
765                    dv.getDatastreamInfo().getDatastreamId());
766            addStringLiteral(triples,
767                    f6DsId,
768                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
769                    dv.getMimeType());
770            addLongLiteral(triples,
771                    f6DsId,
772                    "http://www.loc.gov/premis/rdf/v1#size",
773                    dv.getSize());
774
775            if (dv.getContentDigest() != null) {
776                addStringLiteral(triples,
777                        f6DsId,
778                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
779                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
780                                dv.getContentDigest().getDigest().toLowerCase());
781            }
782        }
783
784        addStringLiteral(triples,
785                f6DsId,
786                "http://purl.org/dc/terms/title",
787                dv.getLabel());
788        addStringLiteral(triples,
789                f6DsId,
790                "http://fedora.info/definitions/1/0/access/objState",
791                dv.getDatastreamInfo().getState());
792        addStringLiteral(triples,
793                f6DsId,
794                "http://www.loc.gov/premis/rdf/v1#formatDesignation",
795                dv.getFormatUri());
796
797        return triples;
798    }
799
800    private static void addStringLiteral(final Model m,
801                                         final String s,
802                                         final String p,
803                                         final String o) {
804        if (o != null) {
805            m.add(m.createResource(s), m.createProperty(p), o);
806        }
807    }
808
809    private static void addDateLiteral(final Model m,
810                                       final String s,
811                                       final String p,
812                                       final String date) {
813        if (date != null) {
814            m.addLiteral(m.createResource(s),
815                         m.createProperty(p),
816                         m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
817        }
818    }
819
820    private static void addLongLiteral(final Model m,
821                                       final String s,
822                                       final String p,
823                                       final long number) {
824        if (number != -1) {
825            m.addLiteral(m.createResource(s),
826                    m.createProperty(p),
827                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
828        }
829    }
830
831    /**
832     * @param mime any mimetype as String
833     * @return extension associated with arg mime, return includes '.' in extension (.txt).
834     *                  ..Empty String if unrecognized mime
835     */
836    private static String getExtension(final String mime) {
837        final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
838        MimeType type;
839        try {
840            type = allTypes.forName(mime);
841        } catch (final MimeTypeException e) {
842            type = null;
843        }
844
845        if (type != null) {
846            return type.getExtension();
847        }
848
849        LOGGER.warn("No mimetype found for '{}'", mime);
850        return "";
851    }
852
853    private Model parseRdfXml(final DatastreamVersion datastreamVersion) {
854        final var model = ModelFactory.createDefaultModel();
855        try (final var is = datastreamVersion.getContent()) {
856            RDFDataMgr.read(model, is, Lang.RDFXML);
857            return model;
858        } catch (Exception e) {
859            throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s",
860                    datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(),
861                    datastreamVersion.getDatastreamInfo().getDatastreamId()), e);
862        }
863    }
864
865    private Map<String, Model> splitRelsInt(final Model relsIntModel) {
866        final Map<String, Model> splitModels = new HashMap<>();
867        for (final var it = relsIntModel.listStatements(); it.hasNext();) {
868            final var statement = it.next();
869            final var id = statement.getSubject().getURI();
870            final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel());
871            model.add(statement);
872        }
873        return splitModels;
874    }
875
876    /**
877     * Creates a new session for the datastream when migrating as atomic resources, or returns the object session,
878     * when migrating as archival groups.
879     *
880     * @param id the datastream's id in fedora 6
881     * @param objectSession the datastream's object session
882     * @return either a new datastream session or the object session
883     */
884    private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) {
885        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
886            return objectSession;
887        } else {
888            return newSession(id);
889        }
890    }
891
892    private OcflObjectSession newSession(final String id) {
893        return new OcflObjectSessionWrapper(sessionFactory.newSession(id));
894    }
895
896    /**
897     * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content
898     * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from
899     * one of the RELS-* files. They are maintained separately because it's possible for them to be updated
900     * independently and we need to be able to construct the correct set of triples when one changes.
901     */
902    private static class MetaHolder {
903        Model contentTriples;
904        Model relsTriples;
905        ResourceHeaders.Builder headers;
906
907        public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) {
908            return new MetaHolder(contentTriples, null, headers);
909        }
910
911        private MetaHolder() {
912        }
913
914        private MetaHolder(final Model contentTriples,
915                           final Model relsTriples,
916                           final ResourceHeaders.Builder headers) {
917            this.contentTriples = contentTriples;
918            this.relsTriples = relsTriples;
919            this.headers = headers;
920        }
921
922        /**
923         * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples.
924         *
925         * @return n-triples input stream
926         */
927        public InputStream constructTriples() {
928            final var output = new ByteArrayOutputStream();
929            final var triples = ModelFactory.createDefaultModel();
930
931            if (contentTriples != null) {
932                triples.add(contentTriples.listStatements());
933            }
934
935            if (relsTriples != null) {
936                triples.add(relsTriples.listStatements());
937            }
938
939            triples.write(output, Lang.NTRIPLES.getName());
940            return new ByteArrayInputStream(output.toByteArray());
941        }
942
943        public MetaHolder setHeaders(final ResourceHeaders.Builder headers) {
944            this.headers = headers;
945            return this;
946        }
947
948        public MetaHolder setContentTriples(final Model contentTriples) {
949            this.contentTriples = contentTriples;
950            return this;
951        }
952
953        public MetaHolder setRelsTriples(final Model relsTriples) {
954            this.relsTriples = relsTriples;
955            return this;
956        }
957    }
958
    /**
     * Immutable value holder for a binary's basic metadata: its name, mime type,
     * and label. Fields are package-visible and final; no behavior beyond storage.
     */
    private static class BinaryMeta {
        final String name;      // the binary's name
        final String mimeType;  // the binary's mime type
        final String label;     // the binary's label

        public BinaryMeta(final String name, final String mimeType, final String label) {
            this.name = name;
            this.mimeType = mimeType;
            this.label = label;
        }
    }
970
971}