001/*
002 * Copyright 2019 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.fcrepo.migration.handlers.ocfl;
018
019import com.google.common.base.Preconditions;
020import com.google.common.base.Strings;
021import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
022import com.hp.hpl.jena.rdf.model.Model;
023import com.hp.hpl.jena.rdf.model.ModelFactory;
024import org.apache.commons.codec.digest.DigestUtils;
025import org.apache.commons.io.IOUtils;
026import org.apache.tika.config.TikaConfig;
027import org.apache.tika.detect.Detector;
028import org.apache.tika.io.TikaInputStream;
029import org.apache.tika.metadata.Metadata;
030import org.apache.tika.mime.MimeType;
031import org.apache.tika.mime.MimeTypeException;
032import org.apache.tika.mime.MimeTypes;
033import org.fcrepo.migration.DatastreamVersion;
034import org.fcrepo.migration.FedoraObjectVersionHandler;
035import org.fcrepo.migration.MigrationType;
036import org.fcrepo.migration.ObjectVersionReference;
037import org.fcrepo.storage.ocfl.InteractionModel;
038import org.fcrepo.storage.ocfl.OcflObjectSession;
039import org.fcrepo.storage.ocfl.OcflObjectSessionFactory;
040import org.fcrepo.storage.ocfl.ResourceHeaders;
041import org.slf4j.Logger;
042
043import java.io.ByteArrayInputStream;
044import java.io.ByteArrayOutputStream;
045import java.io.IOException;
046import java.io.InputStream;
047import java.io.UncheckedIOException;
048import java.net.URI;
049import java.time.Instant;
050import java.time.OffsetDateTime;
051import java.util.ArrayList;
052import java.util.HashMap;
053import java.util.Map;
054import java.util.concurrent.atomic.AtomicBoolean;
055
056import static org.slf4j.LoggerFactory.getLogger;
057
058/**
059 * Writes a Fedora object as a single ArchiveGroup.
060 * <p>
061 * All datastreams and object metadata from a fcrepo3 object are persisted to a
062 * single OCFL object (ArchiveGroup in fcrepo6 parlance).
063 * </p>
064 * <p>
065 * The contents of each datastream are written verbatim. No attempt is made to
066 * re-write the RELS-EXT to replace subjects and objects with their LDP
067 * counterparts.
068 * </p>
069 * <p>
070 * Note: fedora-specific OCFL serialization features (such as redirects,
071 * container metadata, etc) is not fully defined yet, so are not included here
072 *
073 * @author apb@jhu.edu
074 */
075public class ArchiveGroupHandler implements FedoraObjectVersionHandler {
076
077    private static final Logger LOGGER = getLogger(ArchiveGroupHandler.class);
078
079    private static final String FCREPO_ROOT = "info:fedora/";
080
081    private static final Map<String, String> externalHandlingMap = Map.of(
082            "E", "proxy",
083            "R", "redirect"
084    );
085
086    private static final String INLINE_XML = "X";
087
088    private static final String DS_INACTIVE = "I";
089    private static final String DS_DELETED = "D";
090
091    private static final String OBJ_STATE_PROP = "info:fedora/fedora-system:def/model#state";
092    private static final String OBJ_INACTIVE = "Inactive";
093    private static final String OBJ_DELETED = "Deleted";
094
095    private final OcflObjectSessionFactory sessionFactory;
096    private final boolean addDatastreamExtensions;
097    private final boolean deleteInactive;
098    private final MigrationType migrationType;
099    private final String user;
100    private final Detector mimeDetector;
101
102    /**
103     * Create an ArchiveGroupHandler,
104     *
105     * @param sessionFactory
106     *        OCFL session factory
107     * @param migrationType
108     *        the type of migration to do
109     * @param addDatastreamExtensions
110     *        true if datastreams should be written with file extensions
111     * @param deleteInactive
112     *        true if inactive objects and datastreams should be migrated as deleted
113     * @param user
114     *        the username to associated with the migrated resources
115     */
116    public ArchiveGroupHandler(final OcflObjectSessionFactory sessionFactory,
117                               final MigrationType migrationType,
118                               final boolean addDatastreamExtensions,
119                               final boolean deleteInactive,
120                               final String user) {
121        this.sessionFactory = Preconditions.checkNotNull(sessionFactory, "sessionFactory cannot be null");
122        this.migrationType = Preconditions.checkNotNull(migrationType, "migrationType cannot be null");
123        this.addDatastreamExtensions = addDatastreamExtensions;
124        this.deleteInactive = deleteInactive;
125        this.user = Preconditions.checkNotNull(Strings.emptyToNull(user), "user cannot be blank");
126        try {
127            this.mimeDetector = new TikaConfig().getDetector();
128        } catch (Exception e) {
129            throw new RuntimeException(e);
130        }
131    }
132
133    @Override
134    public void processObjectVersions(final Iterable<ObjectVersionReference> versions) {
135        // We use the PID to identify the OCFL object
136        String objectId = null;
137        String f6ObjectId = null;
138
139        // We need to manually keep track of the datastream creation dates
140        final Map<String, String> dsCreateDates = new HashMap<>();
141
142        String objectState = null;
143        final Map<String, String> datastreamStates = new HashMap<>();
144
145        for (var ov : versions) {
146            if (ov.isFirstVersion()) {
147                objectId = ov.getObjectInfo().getPid();
148                f6ObjectId = FCREPO_ROOT + objectId;
149                objectState = getObjectState(ov);
150            }
151
152            final OcflObjectSession session = sessionFactory.newSession(f6ObjectId);
153
154            // Object properties are written only once (as fcrepo3 object properties were unversioned).
155            if (ov.isFirstVersion()) {
156                writeObjectFiles(f6ObjectId, ov, session);
157            }
158
159            // Write datastreams and their metadata
160            for (var dv : ov.listChangedDatastreams()) {
161                final var mimeType = resolveMimeType(dv);
162                final String dsId = dv.getDatastreamInfo().getDatastreamId();
163                final String f6DsId = resolveF6DatastreamId(dsId, f6ObjectId, mimeType);
164                final var datastreamFilename = lastPartFromId(f6DsId);
165
166                if (dv.isFirstVersionIn(ov.getObject())) {
167                    dsCreateDates.put(dsId, dv.getCreated());
168                    datastreamStates.put(f6DsId, dv.getDatastreamInfo().getState());
169                }
170                final var createDate = dsCreateDates.get(dsId);
171
172                final var datastreamHeaders = createDatastreamHeaders(dv, f6DsId, f6ObjectId,
173                        datastreamFilename, mimeType, createDate);
174
175                if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
176                    InputStream content = null;
177                    // Write a file for external content only for plain OCFL migration
178                    if (migrationType == MigrationType.PLAIN_OCFL) {
179                        content = IOUtils.toInputStream(dv.getExternalOrRedirectURL());
180                    }
181                    session.writeResource(datastreamHeaders, content);
182                } else {
183                    try (var content = dv.getContent()) {
184                        session.writeResource(datastreamHeaders, content);
185                    } catch (final IOException e) {
186                        throw new UncheckedIOException(e);
187                    }
188                }
189
190                writeDescriptionFiles(f6DsId, datastreamFilename, createDate, datastreamHeaders, dv, session);
191            }
192
193            LOGGER.debug("Committing object <{}>", f6ObjectId);
194
195            session.versionCreationTimestamp(OffsetDateTime.parse(ov.getVersionDate()));
196            session.commit();
197        }
198
199        handleDeletedResources(f6ObjectId, objectState, datastreamStates);
200    }
201
202    private void handleDeletedResources(final String f6ObjectId,
203                                        final String objectState,
204                                        final Map<String, String> datastreamStates) {
205        final OcflObjectSession session = sessionFactory.newSession(f6ObjectId);
206
207        try {
208            final var now = OffsetDateTime.now();
209            final var hasDeletes = new AtomicBoolean(false);
210
211            if (OBJ_DELETED.equals(objectState) || (deleteInactive && OBJ_INACTIVE.equals(objectState))) {
212                hasDeletes.set(true);
213
214                datastreamStates.keySet().forEach(f6DsId -> {
215                    deleteDatastream(f6DsId, now.toInstant(), session);
216                });
217
218                if (migrationType == MigrationType.PLAIN_OCFL) {
219                    deleteOcflMigratedResource(f6ObjectId, InteractionModel.BASIC_CONTAINER, session);
220                } else {
221                    deleteF6MigratedResource(f6ObjectId, now.toInstant(), session);
222                }
223            } else {
224                datastreamStates.forEach((f6DsId, state) -> {
225                    if (DS_DELETED.equals(state) || (deleteInactive && DS_INACTIVE.equals(state))) {
226                        hasDeletes.set(true);
227                        deleteDatastream(f6DsId, now.toInstant(), session);
228                    }
229                });
230            }
231
232            if (hasDeletes.get()) {
233                session.versionCreationTimestamp(now);
234                session.commit();
235            } else {
236                session.abort();
237            }
238        } catch (RuntimeException e) {
239            session.abort();
240            throw e;
241        }
242    }
243
244    private void writeObjectFiles(final String f6ObjectId,
245                                  final ObjectVersionReference ov,
246                                  final OcflObjectSession session) {
247        final var objectHeaders = createObjectHeaders(f6ObjectId, ov);
248        final var content = getObjTriples(ov);
249        session.writeResource(objectHeaders, content);
250    }
251
252    private void writeDescriptionFiles(final String f6Dsid,
253                                       final String datastreamFilename,
254                                       final String createDate,
255                                       final ResourceHeaders datastreamHeaders,
256                                       final DatastreamVersion dv,
257                                       final OcflObjectSession session) {
258        final var descriptionHeaders = createDescriptionHeaders(f6Dsid,
259                datastreamFilename,
260                datastreamHeaders);
261        session.writeResource(descriptionHeaders, getDsTriples(dv, f6Dsid, createDate));
262    }
263
264    private String f6DescriptionId(final String f6ResourceId) {
265        return f6ResourceId + "/fcr:metadata";
266    }
267
268    private String lastPartFromId(final String id) {
269        return id.substring(id.lastIndexOf('/') + 1);
270    }
271
272    private String resolveF6DatastreamId(final String datastreamId, final String f6ObjectId, final String mimeType) {
273        var id = f6ObjectId + "/" + datastreamId;
274
275        if (addDatastreamExtensions && !Strings.isNullOrEmpty(mimeType)) {
276            id += getExtension(mimeType);
277        }
278
279        return id;
280    }
281
282    private ResourceHeaders.Builder createHeaders(final String id,
283                                                  final String parentId,
284                                                  final InteractionModel model) {
285        final var headers = ResourceHeaders.builder();
286        headers.withId(id);
287        headers.withParent(parentId);
288        headers.withInteractionModel(model.getUri());
289        return headers;
290    }
291
292    private ResourceHeaders createObjectHeaders(final String f6ObjectId, final ObjectVersionReference ov) {
293        final var headers = createHeaders(f6ObjectId, FCREPO_ROOT, InteractionModel.BASIC_CONTAINER);
294        headers.withArchivalGroup(true);
295        headers.withObjectRoot(true);
296        headers.withLastModifiedBy(user);
297        headers.withCreatedBy(user);
298
299        ov.getObjectProperties().listProperties().forEach(p -> {
300            if (p.getName().contains("lastModifiedDate")) {
301                final var lastModified = Instant.parse(p.getValue());
302                headers.withLastModifiedDate(lastModified);
303                headers.withStateToken(DigestUtils.md5Hex(
304                        String.valueOf(lastModified.toEpochMilli())).toUpperCase());
305            } else if (p.getName().contains("createdDate")) {
306                headers.withCreatedDate(Instant.parse(p.getValue()));
307            }
308        });
309
310        return headers.build();
311    }
312
313    private ResourceHeaders createDatastreamHeaders(final DatastreamVersion dv,
314                                                    final String f6DsId,
315                                                    final String f6ObjectId,
316                                                    final String filename,
317                                                    final String mime,
318                                                    final String createDate) {
319        final var lastModified = Instant.parse(dv.getCreated());
320        final var headers = createHeaders(f6DsId, f6ObjectId, InteractionModel.NON_RDF);
321        headers.withFilename(filename);
322        headers.withCreatedDate(Instant.parse(createDate));
323        headers.withLastModifiedDate(lastModified);
324        headers.withLastModifiedBy(user);
325        headers.withCreatedBy(user);
326
327        if (externalHandlingMap.containsKey(dv.getDatastreamInfo().getControlGroup())) {
328            headers.withExternalHandling(
329                    externalHandlingMap.get(dv.getDatastreamInfo().getControlGroup()));
330            headers.withExternalUrl(dv.getExternalOrRedirectURL());
331        }
332
333        headers.withArchivalGroup(false);
334        headers.withObjectRoot(false);
335        if (dv.getSize() > -1 && !INLINE_XML.equals(dv.getDatastreamInfo().getControlGroup())) {
336            headers.withContentSize(dv.getSize());
337        }
338
339        if (dv.getContentDigest() != null && !Strings.isNullOrEmpty(dv.getContentDigest().getDigest())) {
340            final var digest = dv.getContentDigest();
341            final var digests = new ArrayList<URI>();
342            digests.add(URI.create("urn:" + digest.getType().toLowerCase() + ":" + digest.getDigest().toLowerCase()));
343            headers.withDigests(digests);
344        }
345
346        headers.withMimeType(mime);
347        headers.withStateToken(DigestUtils.md5Hex(
348                String.valueOf(lastModified.toEpochMilli())).toUpperCase());
349
350        return headers.build();
351    }
352
353    private ResourceHeaders createDescriptionHeaders(final String f6DsId,
354                                                     final String filename,
355                                                     final ResourceHeaders datastreamHeaders) {
356        final var id = f6DescriptionId(f6DsId);
357        final var headers = createHeaders(id, f6DsId, InteractionModel.NON_RDF_DESCRIPTION);
358
359        headers.withFilename(filename);
360        headers.withCreatedDate(datastreamHeaders.getCreatedDate());
361        headers.withLastModifiedDate(datastreamHeaders.getLastModifiedDate());
362        headers.withCreatedBy(datastreamHeaders.getCreatedBy());
363        headers.withLastModifiedBy(datastreamHeaders.getLastModifiedBy());
364
365        headers.withArchivalGroup(false);
366        headers.withObjectRoot(false);
367        headers.withStateToken(datastreamHeaders.getStateToken());
368
369        return headers.build();
370    }
371
372    private String resolveMimeType(final DatastreamVersion dv) {
373        String mime = dv.getMimeType();
374
375        if (Strings.isNullOrEmpty(mime)) {
376            final var meta = new Metadata();
377            meta.set(Metadata.RESOURCE_NAME_KEY, dv.getDatastreamInfo().getDatastreamId());
378            try (var content = TikaInputStream.get(dv.getContent())) {
379                mime = mimeDetector.detect(content, meta).toString();
380            } catch (IOException e) {
381                throw new UncheckedIOException(e);
382            }
383        }
384
385        return mime;
386    }
387
388    private void deleteDatastream(final String id,
389                                  final Instant lastModified,
390                                  final OcflObjectSession session) {
391        if (migrationType == MigrationType.PLAIN_OCFL) {
392            deleteOcflMigratedResource(id, InteractionModel.NON_RDF, session);
393            deleteOcflMigratedResource(f6DescriptionId(id), InteractionModel.NON_RDF_DESCRIPTION, session);
394        } else {
395            deleteF6MigratedResource(id, lastModified, session);
396            deleteF6MigratedResource(f6DescriptionId(id), lastModified, session);
397        }
398    }
399
400    private void deleteF6MigratedResource(final String id,
401                                          final Instant lastModified,
402                                          final OcflObjectSession session) {
403        LOGGER.debug("Deleting resource {}", id);
404        final var headers = session.readHeaders(id);
405        session.deleteContentFile(ResourceHeaders.builder(headers)
406                .withDeleted(true)
407                .withLastModifiedDate(lastModified)
408                .build());
409    }
410
411    private void deleteOcflMigratedResource(final String id,
412                                            final InteractionModel interactionModel,
413                                            final OcflObjectSession session) {
414        LOGGER.debug("Deleting resource {}", id);
415        session.deleteContentFile(ResourceHeaders.builder()
416                .withId(id)
417                .withInteractionModel(interactionModel.getUri())
418                .build());
419    }
420
421    private String getObjectState(final ObjectVersionReference ov) {
422        return ov.getObjectProperties().listProperties().stream()
423                .filter(prop -> OBJ_STATE_PROP.equals(prop.getName()))
424                .findFirst()
425                .orElseThrow(() -> new IllegalStateException(String.format("Object %s is missing state information",
426                        ov.getObjectInfo().getPid())))
427                .getValue();
428    }
429
430    // Get object-level triples
431    private static InputStream getObjTriples(final ObjectVersionReference o) {
432        final ByteArrayOutputStream out = new ByteArrayOutputStream();
433        final Model triples = ModelFactory.createDefaultModel();
434        final String uri = "info:fedora/" + o.getObjectInfo().getPid();
435
436        o.getObjectProperties().listProperties().forEach(p -> {
437            if (p.getName().contains("Date")) {
438                addDateLiteral(triples, uri, p.getName(), p.getValue());
439            } else {
440                addStringLiteral(triples, uri, p.getName(), p.getValue());
441            }
442        });
443
444        triples.write(out, "N-TRIPLES");
445        return new ByteArrayInputStream(out.toByteArray());
446    }
447
448    // Get datastream-level triples
449    private InputStream getDsTriples(final DatastreamVersion dv,
450                                            final String f6DsId,
451                                            final String createDate) {
452        final ByteArrayOutputStream out = new ByteArrayOutputStream();
453        final Model triples = ModelFactory.createDefaultModel();
454
455        if (migrationType == MigrationType.PLAIN_OCFL) {
456            // These triples are server managed in F6
457            addDateLiteral(triples,
458                    f6DsId,
459                    "http://fedora.info/definitions/v4/repository#created",
460                    createDate);
461            addDateLiteral(triples,
462                    f6DsId,
463                    "http://fedora.info/definitions/v4/repository#lastModified",
464                    dv.getCreated());
465            addStringLiteral(triples,
466                    f6DsId,
467                    "http://purl.org/dc/terms/identifier",
468                    dv.getDatastreamInfo().getDatastreamId());
469            addStringLiteral(triples,
470                    f6DsId,
471                    "http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hasMimeType",
472                    dv.getMimeType());
473            addLongLiteral(triples,
474                    f6DsId,
475                    "http://www.loc.gov/premis/rdf/v1#size",
476                    dv.getSize());
477
478            if (dv.getContentDigest() != null) {
479                addStringLiteral(triples,
480                        f6DsId,
481                        "http://www.loc.gov/premis/rdf/v1#hasMessageDigest",
482                        "urn:" + dv.getContentDigest().getType().toLowerCase() + ":" +
483                                dv.getContentDigest().getDigest().toLowerCase());
484            }
485        }
486
487        addStringLiteral(triples,
488                         f6DsId,
489                         "http://purl.org/dc/terms/title",
490                         dv.getLabel());
491        addStringLiteral(triples,
492                         f6DsId,
493                         "http://fedora.info/definitions/1/0/access/objState",
494                         dv.getDatastreamInfo().getState());
495        addStringLiteral(triples,
496                         f6DsId,
497                         "http://www.loc.gov/premis/rdf/v1#formatDesignation",
498                         dv.getFormatUri());
499
500        triples.write(out, "N-TRIPLES");
501        return new ByteArrayInputStream(out.toByteArray());
502    }
503
504    private static void addStringLiteral(final Model m,
505                                         final String s,
506                                         final String p,
507                                         final String o) {
508        if (o != null) {
509            m.add(m.createResource(s), m.createProperty(p), o);
510        }
511    }
512
513    private static void addDateLiteral(final Model m,
514                                       final String s,
515                                       final String p,
516                                       final String date) {
517        if (date != null) {
518            m.addLiteral(m.createResource(s),
519                         m.createProperty(p),
520                         m.createTypedLiteral(date, XSDDatatype.XSDdateTime));
521        }
522    }
523
524    private static void addLongLiteral(final Model m,
525                                       final String s,
526                                       final String p,
527                                       final long number) {
528        if (number != -1) {
529            m.addLiteral(m.createResource(s),
530                    m.createProperty(p),
531                    m.createTypedLiteral(number, XSDDatatype.XSDlong));
532        }
533    }
534
535    /**
536     * @param mime any mimetype as String
537     * @return extension associated with arg mime, return includes '.' in extension (.txt).
538     *                  ..Empty String if unrecognized mime
539     */
540    private static String getExtension(final String mime) {
541        final MimeTypes allTypes = MimeTypes.getDefaultMimeTypes();
542        MimeType type;
543        try {
544            type = allTypes.forName(mime);
545        } catch (final MimeTypeException e) {
546            type = null;
547        }
548
549        if (type != null) {
550            return type.getExtension();
551        }
552
553        LOGGER.warn("No mimetype found for '{}'", mime);
554        return "";
555    }
556
557}