001/*
002 * Copyright 2019 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.fcrepo.migration.handlers.ocfl;
018
019import at.favre.lib.bytes.Bytes;
020import com.google.common.base.Preconditions;
021import com.google.common.base.Strings;
022import com.google.common.collect.Sets;
023import org.apache.commons.codec.digest.DigestUtils;
024import org.apache.commons.io.IOUtils;
025import org.apache.commons.lang3.StringUtils;
026import org.apache.jena.datatypes.xsd.XSDDatatype;
027import org.apache.jena.rdf.model.Model;
028import org.apache.jena.rdf.model.ModelFactory;
029import org.apache.jena.riot.Lang;
030import org.apache.jena.riot.RDFDataMgr;
031import org.apache.jena.graph.NodeFactory;
032import org.apache.jena.graph.Triple;
033import org.apache.jena.rdf.model.Statement;
034
035import org.apache.tika.config.TikaConfig;
036import org.apache.tika.detect.Detector;
037import org.apache.tika.io.TikaInputStream;
038import org.apache.tika.metadata.Metadata;
039import org.apache.tika.mime.MimeType;
040import org.apache.tika.mime.MimeTypeException;
041import org.apache.tika.mime.MimeTypes;
042import org.fcrepo.migration.ContentDigest;
043import org.fcrepo.migration.DatastreamVersion;
044import org.fcrepo.migration.FedoraObjectVersionHandler;
045import org.fcrepo.migration.MigrationType;
046import org.fcrepo.migration.ObjectInfo;
047import org.fcrepo.migration.ObjectVersionReference;
048import org.fcrepo.migration.ResourceMigrationType;
049import org.fcrepo.migration.foxml.DC;
050import org.fcrepo.storage.ocfl.InteractionModel;
051import org.fcrepo.storage.ocfl.OcflObjectSession;
052import org.fcrepo.storage.ocfl.OcflObjectSessionFactory;
053import org.fcrepo.storage.ocfl.ResourceHeaders;
054import org.fcrepo.storage.ocfl.ResourceHeadersVersion;
055import org.fcrepo.storage.ocfl.exception.NotFoundException;
056import org.slf4j.Logger;
057
058import java.io.BufferedInputStream;
059import java.io.ByteArrayInputStream;
060import java.io.ByteArrayOutputStream;
061import java.io.IOException;
062import java.io.InputStream;
063import java.io.UncheckedIOException;
064import java.net.URI;
065import java.nio.charset.StandardCharsets;
066import java.nio.file.Files;
067import java.security.DigestInputStream;
068import java.security.MessageDigest;
069import java.security.NoSuchAlgorithmException;
070import java.time.Instant;
071import java.time.OffsetDateTime;
072import java.time.ZoneOffset;
073import java.util.ArrayList;
074import java.util.HashMap;
075import java.util.HashSet;
076import java.util.Map;
077import java.util.Set;
078import java.util.concurrent.atomic.AtomicBoolean;
079import static org.slf4j.LoggerFactory.getLogger;
080
081/**
082 * Writes a Fedora object as a single ArchiveGroup.
083 * <p>
084 * All datastreams and object metadata from a fcrepo3 object are persisted to a
085 * single OCFL object (ArchiveGroup in fcrepo6 parlance).
086 * </p>
087 * <p>
088 * The contents of each datastream are written verbatim. No attempt is made to
089 * re-write the RELS-EXT to replace subjects and objects with their LDP
090 * counterparts.
091 * </p>
 * <p>
 * Note: fedora-specific OCFL serialization features (such as redirects,
 * container metadata, etc.) are not fully defined yet, so they are not
 * included here.
 * </p>
 *
096 * @author apb@jhu.edu
097 */
098public class ArchiveGroupHandler implements FedoraObjectVersionHandler {
099
    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);

    // Prefix used by Fedora 3 for all object URIs; also the parent id of migrated root containers
    private static final String FCREPO_ROOT = "info:fedora/";
    // Suffix appended to a binary's id to form the id of its RDF description resource
    private static final String FCRMETA_SUFFIX = "/fcr:metadata";

    // Maps Fedora 3 datastream control groups to their Fedora 6 external-content handling:
    // "E" (externally referenced) -> proxy, "R" (redirect) -> redirect
    private static final Map<String, String> externalHandlingMap = Map.of(
            "E", "proxy",
            "R", "redirect"
    );

    // Fedora 3 control group for inline XML datastreams
    private static final String INLINE_XML = "X";

    // Fedora 3 datastream state codes
    private static final String DS_INACTIVE = "I";
    private static final String DS_DELETED = "D";

    // Fedora 3 object/datastream property URIs
    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
    private static final String DOWNLOAD_NAME_PROP = "info:fedora/fedora-system:def/model#downloadFilename";
    // Fedora 3 object state values (objects use words; datastreams use the single-letter codes above)
    private static final String OBJ_INACTIVE = "Inactive";
    private static final String OBJ_DELETED = "Deleted";

    // Reserved datastream ids with special migration handling
    private static final String RELS_EXT = "RELS-EXT";
    private static final String RELS_INT = "RELS-INT";
    private static final String DC_DS = "DC";

    private final OcflObjectSessionFactory sessionFactory;
    private final boolean addDatastreamExtensions;
    private final boolean deleteInactive;
    private final boolean foxmlFile;
    private final MigrationType migrationType;
    private final ResourceMigrationType resourceMigrationType;
    private final String user;
    private final String idPrefix;
    private final Detector mimeDetector;
    private final boolean headOnly;
    private final boolean disableChecksumValidation;
    private final boolean disableDc;
136
137    /**
138     * Create an ArchiveGroupHandler,
139     *
140     * @param sessionFactory
141     *        OCFL session factory
142     * @param migrationType
143     *        the type of migration to do
144     * @param resourceMigrationType
145     *        how resources should be migrated
146     * @param addDatastreamExtensions
147     *        true if datastreams should be written with file extensions
148     * @param deleteInactive
149     *        true if inactive objects and datastreams should be migrated as deleted
150     * @param foxmlFile
151     *        true if foxml file should be migrated as a whole file, instead of creating property files
152     * @param user
153     *        the username to associated with the migrated resources
154     * @param idPrefix
155     *        the prefix to add to the Fedora 3 pid (default "info:fedora/", like Fedora 3)
156     * @param headOnly
157     *        flag to enable head only migrations
158     * @param disableChecksumValidation
159     *        disable Checksum validation
160     * @param disableDc
161     *        true if DC datastreams should not be migrated to RDF object properties
162     */
163    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
164                               final MigrationType migrationType,
165                               final ResourceMigrationType resourceMigrationType,
166                               final boolean addDatastreamExtensions,
167                               final boolean deleteInactive,
168                               final boolean foxmlFile,
169                               final String user,
170                               final String idPrefix,
171                               final boolean headOnly,
172                               final boolean disableChecksumValidation,
173                               final boolean disableDc) {
174        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
175        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
176        this.resourceMigrationType = Preconditions.checkNotNull(resourceMigrationType,
177                "resourceMigrationType cannot be null");
178        this.addDatastreamExtensions = addDatastreamExtensions;
179        this.deleteInactive = deleteInactive;
180        this.foxmlFile = foxmlFile;
181        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
182        this.idPrefix = idPrefix;
183        this.headOnly = headOnly;
184        this.disableChecksumValidation = disableChecksumValidation;
185        this.disableDc = disableDc;
186        try {
187            this.mimeDetector = new TikaConfig().getDetector();
188        } catch (Exception e) {
189            throw new RuntimeException(e);
190        }
191    }
192
    /**
     * Migrates every version of a single Fedora 3 object into one OCFL object (ArchiveGroup).
     * For each fcrepo3 version: writes the object container (first version only, or the raw FOXML
     * when {@code foxmlFile} is set), writes each changed datastream and its description, folds
     * DC/RELS-EXT/RELS-INT datastreams into RDF properties (unless {@code foxmlFile}), then commits
     * one OCFL version per fcrepo3 version (or a single version when {@code headOnly} is set).
     * Finally applies deletions for deleted/inactive resources.
     */
    @Override
    public void processObjectVersions(final Iterable<ObjectVersionReference> versions, final ObjectInfo objectInfo) {
        // We use the PID to identify the OCFL object
        final String objectId = objectInfo.getPid();
        final String f6ObjectId = idPrefix + objectId;

        // We need to manually keep track of the datastream creation dates
        final Map<String, String> dsCreateDates = new HashMap<>();

        String objectState = null;
        OffsetDateTime objectCreation = null;
        OcflObjectSession objectSession = null;

        // maps f6 datastream id -> fcrepo3 state code ("A"/"I"/"D"), used for deletion handling at the end
        final Map<String, String> datastreamStates = new HashMap<>();
        // tracks the triples used to create containers and binary descriptions
        final Map<String, MetaHolder> metaMap = new HashMap<>();
        // tracks info about binary resources needed to construct filenames
        final Map<String, BinaryMeta> binaryMeta = new HashMap<>();
        // tracks filenames pulled from RELS-INT
        final Map<String, String> filenameMap = new HashMap<>();

        for (var ov : versions) {
            // tracks the binary descriptions that need to be written
            final Set<String> toWrite = new HashSet<>();
            // tracks the binaries that need their filename updated base on RELS-INT
            final Set<String> relsFilenameUpdates = new HashSet<>();
            // tracks the binaries that need their filename updated based on a RELS-INT removal
            final Map<String, String> relsDeletedFilenames = new HashMap<>();

            // reuse the objectSession when headOnly is set
            objectSession = (objectSession == null || !headOnly) ? newSession(f6ObjectId) : objectSession;

            if (ov.isFirstVersion()) {
                // refuse to silently overwrite a previously migrated object
                if (objectSession.containsResource(f6ObjectId)) {
                    throw new RuntimeException(f6ObjectId + " already exists!");
                }
                objectCreation = OffsetDateTime.parse(ov.getVersionDate());
                objectState = getObjectState(ov, objectId);
                // Object properties are written only once (as fcrepo3 object properties were unversioned).
                if (foxmlFile) {
                    try (InputStream is = new BufferedInputStream(Files.newInputStream(objectInfo.getFoxmlPath()))) {
                        final var foxmlDsId = f6ObjectId + "/FOXML";
                        final var headers = createHeaders(foxmlDsId, f6ObjectId,
                                InteractionModel.NON_RDF).build();
                        objectSession.writeResource(headers, is);
                        //mark FOXML as a deleted datastream so it gets deleted in handleDeletedResources()
                        datastreamStates.put(foxmlDsId, DS_DELETED);
                    } catch (IOException io) {
                        LOGGER.error("error writing " + objectId + " FOXML file to " + f6ObjectId + ": " + io);
                        throw new UncheckedIOException(io);
                    }
                } else {
                    // write the object container with its properties as RDF triples
                    final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
                    final var content = getObjTriples(ov, objectId);
                    final var meta = MetaHolder.fromContent(content, objectHeaders);
                    metaMap.put(f6ObjectId, meta);
                    objectSession.writeResource(meta.headers.build(), meta.constructTriples());
                }
            }

            final var datastreamSessions = new HashMap<String, OcflObjectSession>();

            // Write datastreams and their metadata
            for (var dv : ov.listChangedDatastreams()) {
                final var mimeType = resolveMimeType(dv);
                final String dsId = dv.getDatastreamInfo().getDatastreamId();
                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId);
                final var datastreamFilename = lastPartFromId(f6DsId);

                final var datastreamSession = datastreamSession(f6DsId, objectSession);
                datastreamSessions.putIfAbsent(f6DsId, datastreamSession);

                // creation date and state are recorded only at the datastream's first appearance
                if (dv.isFirstVersionIn(ov.getObject())) {
                    dsCreateDates.put(dsId, dv.getCreated());
                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
                }

                final var createDate = dsCreateDates.get(dsId);

                // filename precedence: RELS-INT download name > label > datastream id (see resolveFilename)
                final var filename = resolveFilename(datastreamFilename,
                        dv.getLabel(), filenameMap.get(f6DsId), mimeType);

                // a rewritten datastream supersedes any pending RELS-INT filename removal for it
                relsDeletedFilenames.remove(f6DsId);

                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
                        filename, mimeType, createDate);

                binaryMeta.put(f6DsId, new BinaryMeta(datastreamFilename, mimeType, dv.getLabel()));

                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
                    InputStream content = null;
                    // for plain OCFL migrations, write a file containing the external/redirect URL
                    if (migrationType == MigrationType.PLAIN_OCFL) {
                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL(), StandardCharsets.UTF_8);
                    }
                    datastreamSession.writeResource(datastreamHeaders, content);
                } else {
                    try (var contentStream = dv.getContent()) {
                        writeDatastreamContent(dv, datastreamHeaders, contentStream, datastreamSession);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }

                if (!foxmlFile) {
                    // queue the binary description (fcr:metadata) triples for this datastream
                    final var f6DescId = f6DescriptionId(f6DsId);
                    final var descriptionHeaders = createDescriptionHeaders(f6DsId,
                            datastreamHeaders);
                    final var descriptionTriples = getDsTriples(dv, f6DsId, createDate);
                    metaMap.computeIfAbsent(f6DescId, k -> new MetaHolder())
                            .setHeaders(descriptionHeaders)
                            .setContentTriples(descriptionTriples);
                    toWrite.add(f6DescId);

                    // fold DC XML elements into RDF properties on the object container
                    if (DC_DS.equals(dsId) && !disableDc) {
                        DC dc = new DC();
                        try {
                            dc = DC.parseDC(dv.getContent());
                        } catch (Exception e) {
                            throw new RuntimeException(String.format("Failed to parse DC XML in %s/%s",
                                objectId,f6DsId), e);
                        }

                        final var model = ModelFactory.createDefaultModel();
                        for (String uri : dc.getRepresentedElementURIs()) {
                            for (String value : dc.getValuesForURI(uri)) {
                                final Triple dcTriple = new Triple(
                                    NodeFactory.createURI(f6ObjectId),
                                    NodeFactory.createURI(uri),
                                    NodeFactory.createLiteral(value, XSDDatatype.XSDstring));
                                final Statement statement = model.asStatement(dcTriple);
                                model.add(statement);
                                LOGGER.debug(dcTriple.toString());
                            }
                        }

                        metaMap.get(f6ObjectId).setDcTriples(model);
                        toWrite.add(f6ObjectId);

                    }

                    // RELS-EXT triples attach to the object; RELS-INT triples are split per datastream
                    if (RELS_EXT.equals(dsId) || RELS_INT.equals(dsId)) {
                        final var triples = parseRdfXml(dv);
                        if (RELS_EXT.equals(dsId)) {
                            metaMap.get(f6ObjectId).setRelsTriples(triples);
                            toWrite.add(f6ObjectId);
                        } else {
                            final Map<String, Model> splitModels = splitRelsInt(triples);
                            // RELS-INT fully replaces prior filename mappings; remember what existed before
                            final var oldIds = new HashSet<>(filenameMap.keySet());
                            filenameMap.clear();

                            splitModels.forEach((id, model) -> {
                                final var descId = f6DescriptionId(id);
                                metaMap.computeIfAbsent(descId, k -> new MetaHolder())
                                        .setRelsTriples(model);
                                toWrite.add(descId);

                                // Check to see if there are any file names that need updated
                                for (final var it = model.listStatements(); it.hasNext(); ) {
                                    final var statement = it.next();
                                    if (DOWNLOAD_NAME_PROP.equals(statement.getPredicate().getURI())) {
                                        filenameMap.put(id, statement.getObject().toString());
                                        relsFilenameUpdates.add(id);
                                        break;
                                    }
                                }
                            });

                            // The filename was set once but is no longer
                            final var deleted = Sets.difference(oldIds, filenameMap.keySet());
                            deleted.forEach(id -> {
                                final var meta = binaryMeta.get(id);
                                if (meta != null) {
                                    relsDeletedFilenames.put(id, resolveFilename(meta.name, meta.label,
                                            null, meta.mimeType));
                                }
                            });
                        }
                    }
                }
            }

            // RDF resources are written last because RELS-EXT/RELS-INT in this version may affect them
            writeMeta(toWrite, metaMap, objectSession, datastreamSessions);
            updateFilenames(relsFilenameUpdates, filenameMap, relsDeletedFilenames, objectSession, datastreamSessions);

            if (!headOnly) {
                LOGGER.debug("Committing object <{}>", f6ObjectId);

                final var creationTimestamp = OffsetDateTime.parse(ov.getVersionDate());

                objectSession.versionCreationTimestamp(creationTimestamp);
                objectSession.commit();

                // atomic migrations use one OCFL object per datastream; each needs its own commit
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, session) -> {
                        LOGGER.debug("Committing object <{}>", id);
                        session.versionCreationTimestamp(creationTimestamp);
                        session.commit();
                    });
                }
            }
        }

        handleDeletedResources(f6ObjectId, objectState, datastreamStates, objectSession);

        // final commit when headOnly is set
        if (headOnly && objectSession != null) {
            LOGGER.debug("Committing object <{}>", f6ObjectId);
            objectSession.versionCreationTimestamp(objectCreation);
            objectSession.commit();
        }
    }
405
406    /**
407     * Resolves the filename of the datastream based on the following precedence:
408     *
409     * 1. info:fedora/fedora-system:def/model#downloadFilename from RELS-INT
410     * 2. LABEL from datastream meta
411     * 3. Name of the datastream
412     *
413     * If extensions should be added, then an extension is picked based on the mime type. If the filename already
414     * includes a `.` then no extension is added.
415     *
416     * @param dsName the name of the datastream
417     * @param labelName the datastream's label
418     * @param downloadName the download name from RELS-INT
419     * @param mimeType the datastream's mime type
420     * @return the resolved filename
421     */
422    private String resolveFilename(final String dsName,
423                                   final String labelName,
424                                   final String downloadName,
425                                   final String mimeType) {
426        String filename;
427        if (StringUtils.isNotBlank(downloadName)) {
428            filename = downloadName;
429        } else if (StringUtils.isNotBlank(labelName)) {
430            filename = labelName;
431        } else {
432            filename = dsName;
433        }
434
435        if (addDatastreamExtensions
436                && StringUtils.isNotBlank(mimeType)
437                && !filename.contains(".")) {
438            filename += getExtension(mimeType);
439        }
440
441        return filename;
442    }
443
444    /**
445     * RDF resources are written after writing all other binaries in the version because they can be affected by
446     * RELS-INT or RELS-EXT updates.
447     *
448     * @param toWrite the set of resources that should be written to this version
449     * @param metaMap the map of all known rdf resources
450     * @param objectSession the ocfl session for the object
451     * @param datastreamSessions the ocfl sessions for the datastreams
452     */
453    private void writeMeta(final Set<String> toWrite,
454                           final Map<String, MetaHolder> metaMap,
455                           final OcflObjectSession objectSession,
456                           final Map<String, OcflObjectSession> datastreamSessions) {
457        for (final var id : toWrite) {
458            final var meta = metaMap.get(id);
459
460            if (meta.headers == null) {
461                // This only happens if there's a RELS-INT that references a datastream before it exists.
462                // Skip for now. The triples will be added once the datastream exists.
463                continue;
464            }
465
466            final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
467                    k -> datastreamSession(k, objectSession));
468
469            // Need to copy over the memento created date from the existing headers because it may have been updated
470            // when a description's binary was updated
471            if (migrationType == MigrationType.FEDORA_OCFL) {
472                try {
473                    final var existingHeaders = session.readHeaders(id);
474                    meta.headers.withMementoCreatedDate(existingHeaders.getMementoCreatedDate());
475                } catch (NotFoundException e) {
476                    // this just means the resource hasn't been written yet
477                }
478            }
479            session.writeResource(meta.headers.build(), meta.constructTriples());
480        }
481    }
482
483    private void updateFilenames(final Set<String> toUpdate,
484                                 final Map<String, String> filenameMap,
485                                 final Map<String, String> relsDeletedFilenames,
486                                 final OcflObjectSession objectSession,
487                                 final Map<String, OcflObjectSession> datastreamSessions) {
488        if (migrationType == MigrationType.FEDORA_OCFL) {
489            toUpdate.forEach(id -> {
490                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
491                                                                       k -> datastreamSession(k, objectSession));
492                final var origHeaders = session.readHeaders(id);
493                final var filename = filenameMap.get(id);
494                if (StringUtils.isNotBlank(filename)) {
495                    final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
496                    session.writeHeaders(newHeaders);
497                }
498            });
499            relsDeletedFilenames.forEach((id, filename) -> {
500                final var session = datastreamSessions.computeIfAbsent(id.replace(FCRMETA_SUFFIX, ""),
501                                                                       k -> datastreamSession(k, objectSession));
502                final var origHeaders = session.readHeaders(id);
503                final var newHeaders = ResourceHeaders.builder(origHeaders).withFilename(filename).build();
504                session.writeHeaders(newHeaders);
505            });
506        }
507    }
508
509    private boolean fedora3DigestValid(final ContentDigest f3Digest) {
510        return f3Digest != null && StringUtils.isNotBlank(f3Digest.getType()) &&
511                StringUtils.isNotBlank(f3Digest.getDigest());
512    }
513
    /**
     * Writes a datastream's content to the session, validating it against the Fedora 3 digest when
     * possible. Validation is skipped when disabled, when the Fedora 3 digest is missing/invalid,
     * when the digest algorithm is unknown to the JVM, or for plain OCFL migrations.
     *
     * @param dv the datastream version being written
     * @param datastreamHeaders the resource headers to write with the content
     * @param contentStream the content stream (closed by the caller)
     * @param session the ocfl session to write to
     * @throws IOException on stream failure
     * @throws RuntimeException when the computed digest does not match the Fedora 3 digest
     */
    private void writeDatastreamContent(final DatastreamVersion dv,
                                        final ResourceHeaders datastreamHeaders,
                                        final InputStream contentStream,
                                        final OcflObjectSession session) throws IOException {
        if (disableChecksumValidation) {
            session.writeResource(datastreamHeaders, contentStream);
            return;
        }
        final var f3Digest = dv.getContentDigest();
        final var ocflObjectId = session.ocflObjectId();
        final var datastreamId = dv.getDatastreamInfo().getDatastreamId();
        final var datastreamControlGroup = dv.getDatastreamInfo().getControlGroup();
        if (fedora3DigestValid(f3Digest)) {
            try {
                // getInstance() is called before branching so an unknown algorithm is caught for both paths
                final var messageDigest = MessageDigest.getInstance(f3Digest.getType());
                if (migrationType == MigrationType.PLAIN_OCFL) {
                    // NOTE(review): plain OCFL writes without stream-side validation — presumably the
                    // OCFL layer verifies fixity itself; confirm before relying on this
                    session.writeResource(datastreamHeaders, contentStream);
                } else {
                    // digest is computed while streaming, then compared to the Fedora 3 recorded value
                    try (var digestStream = new DigestInputStream(contentStream, messageDigest)) {
                        session.writeResource(datastreamHeaders, digestStream);
                        final var expectedDigest = f3Digest.getDigest();
                        final var actualDigest = Bytes.wrap(digestStream.getMessageDigest().digest()).encodeHex();
                        if (!actualDigest.equalsIgnoreCase(expectedDigest)) {
                            final var msg = String.format("%s/%s: digest %s doesn't match expected digest %s",
                                    ocflObjectId, datastreamId, actualDigest, expectedDigest);
                            throw new RuntimeException(msg);
                        }
                    }
                }
            } catch (final NoSuchAlgorithmException e) {
                // deliberately best-effort: an unknown algorithm warns and writes unvalidated
                final var msg = String.format("%s/%s: no digest algorithm %s. Writing resource & continuing.",
                        ocflObjectId, datastreamId, f3Digest.getType());
                LOGGER.warn(msg);
                session.writeResource(datastreamHeaders, contentStream);
            }
        } else {
            // only managed ("M") datastreams are expected to have digests; warn only for those
            if (datastreamControlGroup.equalsIgnoreCase("M")) {
                final var msg = String.format("%s/%s: missing/invalid digest. Writing resource & continuing.",
                        ocflObjectId, datastreamId);
                LOGGER.warn(msg);
            }
            session.writeResource(datastreamHeaders, contentStream);
        }
    }
558
    /**
     * Marks deleted (and, when {@code deleteInactive}, inactive) resources as deleted after all
     * versions have been written. A deleted/inactive object deletes all of its datastreams plus the
     * object itself; otherwise only the individual deleted/inactive datastreams are removed. Commits
     * a new version only when something was actually deleted (and not in headOnly mode, where the
     * caller performs the final commit).
     *
     * @param f6ObjectId the Fedora 6 object id
     * @param objectState the fcrepo3 object state ("Active"/"Inactive"/"Deleted")
     * @param datastreamStates map of f6 datastream id to its fcrepo3 state code
     * @param objectSession the still-open session when headOnly is set; otherwise a new session is used
     */
    private void handleDeletedResources(final String f6ObjectId,
                                        final String objectState,
                                        final Map<String, String> datastreamStates,
                                        final OcflObjectSession objectSession) {
        // headOnly reuses the caller's session so deletes land in the single final version
        final OcflObjectSession session = headOnly ? objectSession : newSession(f6ObjectId);
        final var datastreamSessions = new HashMap<String, OcflObjectSession>();

        try {
            final var now = OffsetDateTime.now().withOffsetSameInstant(ZoneOffset.UTC);
            // AtomicBoolean only because it is set from within lambdas; no concurrency involved here
            final var hasDeletes = new AtomicBoolean(false);

            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
                hasDeletes.set(true);

                // the whole object is gone: delete every datastream, then the object resource itself
                datastreamStates.keySet().forEach(f6DsId -> {
                    final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                            k -> datastreamSession(f6DsId, session));
                    deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                });

                if (migrationType == MigrationType.PLAIN_OCFL) {
                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
                } else {
                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
                }
            } else {
                // object survives: delete only the datastreams individually marked deleted/inactive
                datastreamStates.forEach((f6DsId, state) -> {
                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
                        final var datastreamSession = datastreamSessions.computeIfAbsent(f6DsId,
                                k -> datastreamSession(f6DsId, session));
                        hasDeletes.set(true);
                        deleteDatastream(f6DsId, now.toInstant(), datastreamSession);
                    }
                });
            }

            if (!headOnly && hasDeletes.get()) {
                session.versionCreationTimestamp(now);
                session.commit();

                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.versionCreationTimestamp(now);
                        dsSession.commit();
                    });
                }
            } else if (!headOnly) {
                // nothing deleted: discard the speculative session instead of writing an empty version
                session.abort();
                if (resourceMigrationType == ResourceMigrationType.ATOMIC) {
                    datastreamSessions.forEach((id, dsSession) -> {
                        dsSession.abort();
                    });
                }
            }
        } catch (RuntimeException e) {
            // NOTE(review): only the object session is aborted here; datastream sessions opened above
            // are left open on failure — confirm whether they need aborting too
            session.abort();
            throw e;
        }
    }
618
619    private String f6DescriptionId(final String f6ResourceId) {
620        return f6ResourceId + FCRMETA_SUFFIX;
621    }
622
623    private String lastPartFromId(final String id) {
624        return id.substring(id.lastIndexOf('/') + 1);
625    }
626
627    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId) {
628        return f6ObjectId + "/" + datastreamId;
629    }
630
631    private ResourceHeaders.Builder createHeaders(final String id,
632                                                  final String parentId,
633                                                  final InteractionModel model) {
634        final var headers = ResourceHeaders.builder();
635        headers.withHeadersVersion(ResourceHeadersVersion.V1_0);
636        headers.withId(id);
637        headers.withParent(parentId);
638        headers.withInteractionModel(model.getUri());
639        return headers;
640    }
641
642    private ResourceHeaders.Builder createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
643        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
644        headers.withArchivalGroup(resourceMigrationType == ResourceMigrationType.ARCHIVAL);
645        headers.withObjectRoot(true);
646        headers.withLastModifiedBy(user);
647        headers.withCreatedBy(user);
648
649        ov.getObjectProperties().listProperties().forEach(p -> {
650            if (p.getName().contains("lastModifiedDate")) {
651                final var lastModified = Instant.parse(p.getValue());
652                headers.withLastModifiedDate(lastModified);
653                headers.withMementoCreatedDate(lastModified);
654                headers.withStateToken(DigestUtils.md5Hex(
655                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
656            } else if (p.getName().contains("createdDate")) {
657                headers.withCreatedDate(Instant.parse(p.getValue()));
658            }
659        });
660
661        return headers;
662    }
663
    /**
     * Builds the complete resource headers for a migrated datastream binary.
     *
     * @param dv the F3 datastream version being migrated
     * @param f6DsId the datastream's Fedora 6 id
     * @param f6ObjectId the parent object's Fedora 6 id
     * @param filename the filename to record on the binary
     * @param mime the resolved mime type
     * @param createDate the datastream's creation timestamp as an ISO-8601 instant
     * @return the fully built headers
     */
    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
                                                    final String f6DsId,
                                                    final String f6ObjectId,
                                                    final String filename,
                                                    final String mime,
                                                    final String createDate) {
        // The F3 version's creation date doubles as the F6 last-modified/memento timestamp.
        final var lastModified = Instant.parse(dv.getCreated());
        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
            headers.withArchivalGroupId(f6ObjectId);
        }
        headers.withFilename(filename);
        headers.withCreatedDate(Instant.parse(createDate));
        headers.withLastModifiedDate(lastModified);
        headers.withLastModifiedBy(user);
        headers.withCreatedBy(user);
        headers.withMementoCreatedDate(lastModified);

        // Control groups configured for external handling store a handling mode and URL
        // rather than migrating the content itself.
        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
            headers.withExternalHandling(
                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
            headers.withExternalUrl(dv.getExternalOrRedirectURL());
        }

        headers.withArchivalGroup(false);
        headers.withObjectRoot(resourceMigrationType == ResourceMigrationType.ATOMIC);
        // Size is skipped for inline XML — presumably the FOXML-recorded size does not match
        // the extracted content; TODO confirm.
        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
            headers.withContentSize(dv.getSize());
        }

        // F3 digests of literal value "none" are placeholders and are not migrated.
        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
            if (!dv.getContentDigest().getDigest().equals("none")) {
                final var digest = dv.getContentDigest();
                final var digests = new ArrayList<URI>();
                digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" +
                            digest.getDigest().toLowerCase()));
                headers.withDigests(digests);
            } else {
                LOGGER.warn("Digest content 'none' found. Not adding to header");
            }
        }

        headers.withMimeType(mime);
        // State token is the uppercase md5 of the last-modified epoch millis.
        headers.withStateToken(DigestUtils.md5Hex(
                String.valueOf(lastModified.toEpochMilli())).toUpperCase());

        return headers.build();
    }
712
713    private ResourceHeaders.Builder createDescriptionHeaders(final String f6DsId,
714                                                             final ResourceHeaders datastreamHeaders) {
715        final var id = f6DescriptionId(f6DsId);
716        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);
717
718        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
719            headers.withArchivalGroupId(datastreamHeaders.getArchivalGroupId());
720        }
721        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
722        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
723        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
724        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
725        headers.withMementoCreatedDate(datastreamHeaders.getMementoCreatedDate());
726
727        headers.withArchivalGroup(false);
728        headers.withObjectRoot(false);
729        headers.withStateToken(datastreamHeaders.getStateToken());
730
731        return headers;
732    }
733
734    private String resolveMimeType(final DatastreamVersion dv) {
735        String mime = dv.getMimeType();
736
737        if (Strings.isNullOrEmpty(mime)) {
738            final var meta = new Metadata();
739            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
740            try (var content = TikaInputStream.get(dv.getContent())) {
741                mime = mimeDetector.detect(content, meta).toString();
742            } catch (IOException e) {
743                throw new UncheckedIOException(e);
744            }
745        }
746
747        return mime;
748    }
749
750    private void deleteDatastream(final String id,
751                                  final Instant lastModified,
752                                  final OcflObjectSession session) {
753        if (migrationType == MigrationType.PLAIN_OCFL) {
754            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
755            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
756        } else {
757            deleteF6MigratedResource(id, lastModified, session);
758            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
759        }
760    }
761
762    private void deleteF6MigratedResource(final String id,
763                                          final Instant lastModified,
764                                          final OcflObjectSession session) {
765        LOGGER.debug("Deleting resource {}", id);
766        final var headers = session.readHeaders(id);
767        session.deleteContentFile(ResourceHeaders.builder(headers)
768                .withDeleted(true)
769                .withLastModifiedDate(lastModified)
770                .withMementoCreatedDate(lastModified)
771                .build());
772    }
773
774    private void deleteOcflMigratedResource(final String id,
775                                            final InteractionModel interactionModel,
776                                            final OcflObjectSession session) {
777        LOGGER.debug("Deleting resource {}", id);
778        session.deleteContentFile(ResourceHeaders.builder()
779                .withId(id)
780                .withInteractionModel(interactionModel.getUri())
781                .build());
782    }
783
784    private String getObjectState(final ObjectVersionReference ov, final String pid) {
785        return ov.getObjectProperties().listProperties().stream()
786                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
787                .findFirst()
788                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
789                        pid)))
790                .getValue();
791    }
792
793    // Get object-level triples
794    private static Model getObjTriples(final ObjectVersionReference o, final String pid) {
795        final Model triples = ModelFactory.createDefaultModel();
796        final String uri = "info:fedora/" + pid;
797
798        o.getObjectProperties().listProperties().forEach(p -> {
799            if (p.getName().contains("Date")) {
800                addDateLiteral(triples, uri, p.getName(), p.getValue());
801            } else {
802                addStringLiteral(triples, uri, p.getName(), p.getValue());
803            }
804        });
805
806        return triples;
807    }
808
    /**
     * Builds the datastream-level RDF triples for a migrated datastream. For plain OCFL
     * migrations, triples that would otherwise be server managed in F6 (created/modified
     * dates, identifier, mime type, size, digest) are also written.
     *
     * @param dv the datastream version
     * @param f6DsId the datastream's Fedora 6 id, used as the triple subject
     * @param createDate the datastream's creation timestamp
     * @return a model containing the datastream-level triples
     */
    private Model getDsTriples(final DatastreamVersion dv,
                                            final String f6DsId,
                                            final String createDate) {
        final Model triples = ModelFactory.createDefaultModel();

        if (migrationType == MigrationType.PLAIN_OCFL) {
            // These triples are server managed in F6
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#created",
                    createDate);
            addDateLiteral(triples,
                    f6DsId,
                    "http://fedora.info/definitions/v4/repository#lastModified",
                    dv.getCreated());
            addStringLiteral(triples,
                    f6DsId,
                    "http://purl.org/dc/terms/identifier",
                    dv.getDatastreamInfo().getDatastreamId());
            addStringLiteral(triples,
                    f6DsId,
                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
                    dv.getMimeType());
            // addLongLiteral skips the -1 "unknown size" sentinel
            addLongLiteral(triples,
                    f6DsId,
                    "http://www.loc.gov/premis/rdf/v1#size",
                    dv.getSize());

            if (dv.getContentDigest() != null) {
                addStringLiteral(triples,
                        f6DsId,
                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
                                dv.getContentDigest().getDigest().toLowerCase());
            }
        }

        // Written for all migration types; addStringLiteral no-ops on null values.
        addStringLiteral(triples,
                f6DsId,
                "http://purl.org/dc/terms/title",
                dv.getLabel());
        addStringLiteral(triples,
                f6DsId,
                "http://fedora.info/definitions/1/0/access/objState",
                dv.getDatastreamInfo().getState());
        addStringLiteral(triples,
                f6DsId,
                "http://www.loc.gov/premis/rdf/v1#formatDesignation",
                dv.getFormatUri());

        return triples;
    }
862
863    private static void addStringLiteral(final Model m,
864                                         final String s,
865                                         final String p,
866                                         final String o) {
867        if (o != null) {
868            m.add(m.createResource(s), m.createProperty(p), o);
869        }
870    }
871
872    private static void addDateLiteral(final Model m,
873                                       final String s,
874                                       final String p,
875                                       final String date) {
876        if (date != null) {
877            m.addLiteral(m.createResource(s),
878                         m.createProperty(p),
879                         m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
880        }
881    }
882
883    private static void addLongLiteral(final Model m,
884                                       final String s,
885                                       final String p,
886                                       final long number) {
887        if (number != -1) {
888            m.addLiteral(m.createResource(s),
889                    m.createProperty(p),
890                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
891        }
892    }
893
894    /**
895     * @param mime any mimetype as String
896     * @return extension associated with arg mime, return includes '.' in extension (.txt).
897     *                  ..Empty String if unrecognized mime
898     */
899    private static String getExtension(final String mime) {
900        final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
901        MimeType type;
902        try {
903            type = allTypes.forName(mime);
904        } catch (final MimeTypeException e) {
905            type = null;
906        }
907
908        if (type != null) {
909            return type.getExtension();
910        }
911
912        LOGGER.warn("No mimetype found for '{}'", mime);
913        return "";
914    }
915
916    private Model parseRdfXml(final DatastreamVersion datastreamVersion) {
917        final var model = ModelFactory.createDefaultModel();
918        try (final var is = datastreamVersion.getContent()) {
919            RDFDataMgr.read(model, is, Lang.RDFXML);
920            return model;
921        } catch (Exception e) {
922            throw new RuntimeException(String.format("Failed to parse RDF XML in %s/%s",
923                    datastreamVersion.getDatastreamInfo().getObjectInfo().getPid(),
924                    datastreamVersion.getDatastreamInfo().getDatastreamId()), e);
925        }
926    }
927
928    private Map<String, Model> splitRelsInt(final Model relsIntModel) {
929        final Map<String, Model> splitModels = new HashMap<>();
930        for (final var it = relsIntModel.listStatements(); it.hasNext();) {
931            final var statement = it.next();
932            final var id = statement.getSubject().getURI();
933            final var model = splitModels.computeIfAbsent(id, k -> ModelFactory.createDefaultModel());
934            model.add(statement);
935        }
936        return splitModels;
937    }
938
939    /**
940     * Creates a new session for the datastream when migrating as atomic resources, or returns the object session,
941     * when migrating as archival groups.
942     *
943     * @param id the datastream's id in fedora 6
944     * @param objectSession the datastream's object session
945     * @return either a new datastream session or the object session
946     */
947    private OcflObjectSession datastreamSession(final String id, final OcflObjectSession objectSession) {
948        if (resourceMigrationType == ResourceMigrationType.ARCHIVAL) {
949            return objectSession;
950        } else {
951            return newSession(id);
952        }
953    }
954
955    private OcflObjectSession newSession(final String id) {
956        return new OcflObjectSessionWrapper(sessionFactory.newSession(id));
957    }
958
959    /**
960     * Wrapper class for storing a RDF resource's "content" triples, RELS triples, and resource headers. The content
961     * triples are triples that were generated based on general Fedora metadata, and the RELS triples are extracted from
962     * one of the RELS-* files. They are maintained separately because it's possible for them to be updated
963     * independently and we need to be able to construct the correct set of triples when one changes.
964     */
    private static class MetaHolder {
        // Triples generated from general Fedora metadata (object/datastream properties).
        Model contentTriples;
        // Triples extracted from a RELS-EXT/RELS-INT datastream.
        Model relsTriples;
        // Triples extracted from the DC datastream.
        Model dcTriples;
        // Resource headers builder for the resource these triples describe.
        ResourceHeaders.Builder headers;

        /**
         * Creates a holder seeded with content triples and headers only (no RELS triples yet).
         *
         * @param contentTriples triples generated from Fedora metadata
         * @param headers the resource's headers builder
         * @return a new holder
         */
        public static MetaHolder fromContent(final Model contentTriples, final ResourceHeaders.Builder headers) {
            return new MetaHolder(contentTriples, null, headers);
        }

        // Empty holder; fields are populated via the setters.
        private MetaHolder() {
        }

        private MetaHolder(final Model contentTriples,
                           final Model relsTriples,
                           final Model dcTriples,
                           final ResourceHeaders.Builder headers) {
            this.contentTriples = contentTriples;
            this.relsTriples = relsTriples;
            this.dcTriples = dcTriples;
            this.headers = headers;
        }

        // Convenience constructor that leaves dcTriples unset.
        private MetaHolder(final Model contentTriples,
                           final Model relsTriples,
                           final ResourceHeaders.Builder headers) {
            this.contentTriples = contentTriples;
            this.relsTriples = relsTriples;
            this.headers = headers;
        }


        /**
         * Constructs a complete set of triples at the current version of the resource and serializes them as n-triples.
         *
         * @return n-triples input stream
         */
        public InputStream constructTriples() {
            final var output = new ByteArrayOutputStream();
            final var triples = ModelFactory.createDefaultModel();

            // Merge whichever triple sources are present; any of them may be null.
            if (contentTriples != null) {
                triples.add(contentTriples.listStatements());
            }

            if (relsTriples != null) {
                triples.add(relsTriples.listStatements());
            }

            if (dcTriples != null) {
                triples.add(dcTriples.listStatements());
            }

            triples.write(output, Lang.NTRIPLES.getName());
            return new ByteArrayInputStream(output.toByteArray());
        }

        // Fluent setters: each replaces one triple source (or the headers) and returns this
        // holder so updates can be chained.
        public MetaHolder setHeaders(final ResourceHeaders.Builder headers) {
            this.headers = headers;
            return this;
        }

        public MetaHolder setContentTriples(final Model contentTriples) {
            this.contentTriples = contentTriples;
            return this;
        }

        public MetaHolder setRelsTriples(final Model relsTriples) {
            this.relsTriples = relsTriples;
            return this;
        }
        public MetaHolder setDcTriples(final Model dcTriples) {
            this.dcTriples = dcTriples;
            return this;
        }

    }
1042
    /**
     * Immutable holder for the descriptive attributes of a migrated binary: its name,
     * mime type, and label.
     */
    private static class BinaryMeta {
        final String name;
        final String mimeType;
        final String label;

        public BinaryMeta(final String name, final String mimeType, final String label) {
            this.name = name;
            this.mimeType = mimeType;
            this.label = label;
        }
    }
1054
1055}