001/**
002 * The contents of this file are subject to the license and copyright
003 * detailed in the LICENSE and NOTICE files at the root of the source
004 * tree.
005 *
006 */
007package org.fcrepo.migration.foxml;
008
009import io.micrometer.core.instrument.Metrics;
010import io.micrometer.core.instrument.Timer;
011import org.apache.commons.codec.binary.Base64OutputStream;
012import org.apache.commons.codec.binary.Hex;
013import org.apache.commons.codec.digest.DigestUtils;
014import org.apache.commons.io.FileUtils;
015import org.apache.commons.io.IOUtils;
016import org.apache.commons.lang3.StringUtils;
017import org.apache.xml.serialize.OutputFormat;
018import org.apache.xml.serialize.XMLSerializer;
019import org.codehaus.stax2.XMLInputFactory2;
020import org.fcrepo.migration.ContentDigest;
021import org.fcrepo.migration.DatastreamInfo;
022import org.fcrepo.migration.DatastreamVersion;
023import org.fcrepo.migration.DefaultContentDigest;
024import org.fcrepo.migration.DefaultObjectInfo;
025import org.fcrepo.migration.FedoraObjectProcessor;
026import org.fcrepo.migration.ObjectInfo;
027import org.fcrepo.migration.ObjectProperties;
028import org.fcrepo.migration.ObjectReference;
029import org.fcrepo.migration.StreamingFedoraObjectHandler;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032import org.w3c.dom.Document;
033import org.xml.sax.InputSource;
034import org.xml.sax.SAXException;
035
036import javax.xml.bind.JAXBContext;
037import javax.xml.bind.JAXBElement;
038import javax.xml.bind.JAXBException;
039import javax.xml.bind.Unmarshaller;
040import javax.xml.parsers.DocumentBuilder;
041import javax.xml.parsers.DocumentBuilderFactory;
042import javax.xml.parsers.ParserConfigurationException;
043import javax.xml.stream.XMLEventReader;
044import javax.xml.stream.XMLInputFactory;
045import javax.xml.stream.XMLStreamConstants;
046import javax.xml.stream.XMLStreamException;
047import javax.xml.stream.XMLStreamReader;
048import javax.xml.stream.events.XMLEvent;
049import java.io.BufferedInputStream;
050import java.io.BufferedOutputStream;
051import java.io.BufferedReader;
052import java.io.ByteArrayOutputStream;
053import java.io.File;
054import java.io.FileInputStream;
055import java.io.FileNotFoundException;
056import java.io.FileOutputStream;
057import java.io.IOException;
058import java.io.InputStream;
059import java.io.InputStreamReader;
060import java.io.OutputStreamWriter;
061import java.io.PrintWriter;
062import java.io.StringReader;
063import java.io.StringWriter;
064import java.io.UncheckedIOException;
065import java.net.MalformedURLException;
066import java.net.URL;
067import java.nio.charset.StandardCharsets;
068import java.time.Instant;
069import java.util.ArrayList;
070import java.util.Arrays;
071import java.util.HashMap;
072import java.util.HashSet;
073import java.util.LinkedList;
074import java.util.List;
075import java.util.Map;
076import java.util.Optional;
077import java.util.Set;
078import java.util.regex.Pattern;
079
080/**
 * A FedoraObjectProcessor implementation that uses the StAX API to process
 * a FOXML XML InputStream.
083 * @author mdurbin
084 */
085public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor {
086
    /** Logger for this processor. */
    private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class);

    /**
     * Matches the raw text between a {@code <foxml:xmlContent>} start tag and the nearest matching end tag.
     * DOTALL lets a single match span several lines; the reluctant quantifier keeps each match to one element.
     */
    private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>",
            Pattern.DOTALL);

    /** Namespace URI that FOXML elements are compared against while parsing. */
    private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#";

    // Micrometer timers: one measures processObject's parsing phase, the other the call to complete().
    private static final String METRIC_NAME = "fcrepo.storage.foxml.object";
    private static final String OPERATION = "operation";
    private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject");
    private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject");

    /** Handed to URLCachedContent for datastream versions whose content location is a URL. */
    private URLFetcher fetcher;

    /** Replaces the "local.fedora.server" placeholder in content location references. */
    private String localFedoraServer;

    /** Resolves content locations whose TYPE is INTERNAL_ID. */
    private InternalIDResolver idResolver;

    /** The FOXML file being processed. */
    private File file;

    /** Buffered stream over {@link #file} that backs {@link #reader}. */
    private InputStream stream;

    /** StAX cursor over the FOXML document; shared by this class and its inner classes. */
    private XMLStreamReader reader;

    /** Parses inline XML into a DOM when its checksum is validated. */
    private DocumentBuilder documentBuilder;

    /** Temporary files created for decoded binary content; deleted by cleanUpTempFiles(). */
    private List<File> tempFiles;

    /** Raw inline XML fragments in document order; one is removed per xmlContent element encountered. */
    private LinkedList<String> inlineXml;

    /**
     * The basic object information read from the XML stream at construction
     * time by processing the root XML element and its attributes.
     */
    private ObjectInfo objectInfo;
122
123    /**
124     * foxml input stream fedora object processor.
125     * @param file the FOXML file
126     * @param fetcher the fetcher
127     * @param resolver the resolver
128     * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server
129     *                          from which the content exposed by the "is" parameter comes.
130     * @throws XMLStreamException xml stream exception
131     */
132    public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher,
133                                                 final InternalIDResolver resolver, final String localFedoraServer)
134            throws XMLStreamException, FileNotFoundException {
135        this.file = file;
136        this.fetcher = fetcher;
137        this.idResolver = resolver;
138        this.localFedoraServer = localFedoraServer;
139        final XMLInputFactory factory = XMLInputFactory.newFactory();
140        stream = new BufferedInputStream(new FileInputStream(file));
141        reader = factory.createXMLStreamReader(stream);
142        reader.nextTag();
143        final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation");
144        objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath());
145        while (reader.next() == XMLStreamConstants.CHARACTERS) {
146        }
147
148        tempFiles = new ArrayList<File>();
149
150        final var builderFactory = DocumentBuilderFactory.newInstance();
151        builderFactory.setNamespaceAware(true);
152        builderFactory.setIgnoringComments(false);
153        try {
154            documentBuilder = builderFactory.newDocumentBuilder();
155        } catch (ParserConfigurationException e) {
156            throw new RuntimeException(e);
157        }
158
159        try {
160            inlineXml = new LinkedList<>();
161            final var content = FileUtils.readFileToString(file);
162            final var matcher = INLINE_PATTERN.matcher(content);
163            while (matcher.find()) {
164                inlineXml.add(matcher.group(1));
165            }
166        } catch (IOException e) {
167            throw new UncheckedIOException(e);
168        }
169    }
170
171    @Override
172    public ObjectInfo getObjectInfo() {
173        return objectInfo;
174    }
175
176    @Override
177    public void processObject(final StreamingFedoraObjectHandler handler) {
178        final var stopwatch = Timer.start();
179        handler.beginObject(objectInfo);
180        Foxml11DatastreamInfo dsInfo = null;
181        try {
182            handler.processObjectProperties(readProperties());
183            while (reader.hasNext()) {
184                if (reader.isCharacters()) {
185                    if (!reader.isWhiteSpace()) {
186                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
187                    } else {
188                        // skip whitespace...
189                    }
190                } else if (reader.isStartElement()) {
191                    if (reader.getLocalName().equals("datastream")
192                            && reader.getNamespaceURI().equals(FOXML_NS)) {
193                        dsInfo = new Foxml11DatastreamInfo(objectInfo, reader);
194                    } else if (reader.getLocalName().equals("datastreamVersion")) {
195                        final var v = new Foxml11DatastreamVersion(dsInfo, reader);
196                        try {
197                            v.validateInlineXml();
198                        } catch (RuntimeException e) {
199                            // do we need to do anyting with disabled digests?
200                            LOG.error("Inline Validation failed", e);
201                            throw new RuntimeException(e);
202                        }
203                        handler.processDatastreamVersion(v);
204                    } else {
205                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
206                    }
207                } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) {
208                    dsInfo = null;
209                } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) {
210                    // end of document....
211                } else {
212                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
213                            + reader.getLocation().getLineNumber() + ", column "
214                            + reader.getLocation().getColumnNumber()
215                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
216                }
217                reader.next();
218            }
219        } catch (Exception e) {
220            abort(handler, e);
221        } finally {
222            stopwatch.stop(processObjectTimer);
223        }
224
225        completeObjectTimer.record(() -> complete(handler));
226    }
227
228    private void complete(final StreamingFedoraObjectHandler handler) {
229        try {
230            handler.completeObject(objectInfo);
231            cleanUpTempFiles();
232        } catch (Exception e) {
233            abort(handler, e);
234        }
235    }
236
237    private void abort(final StreamingFedoraObjectHandler handler, final Exception e) {
238        try {
239            handler.abortObject(objectInfo);
240            if (e instanceof RuntimeException) {
241                throw (RuntimeException) e;
242            }
243            throw new RuntimeException(e);
244        } finally {
245            cleanUpTempFiles();
246            close();
247        }
248    }
249
250    /**
251     * Close resources associated to the processor
252     */
253    public void close() {
254        try {
255            reader.close();
256        } catch (final XMLStreamException e) {
257            LOG.warn("Failed to close reader cleanly", e);
258        }
259        try {
260            stream.close();
261        } catch (IOException e) {
262            LOG.warn("Failed to close file cleanly", e);
263        }
264    }
265
266    private void cleanUpTempFiles() {
267        for (final File f : this.tempFiles) {
268            if (f.exists()) {
269                f.delete();
270            }
271        }
272    }
273
274    private ObjectProperties readProperties() throws JAXBException, XMLStreamException {
275        final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class);
276        final Unmarshaller unmarshaller = jc.createUnmarshaller();
277        final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class);
278        final FoxmlObjectProperties properties = p.getValue();
279        return properties;
280    }
281
282    private void readUntilClosed(final String name, final String namespace) throws XMLStreamException {
283        while (reader.hasNext()) {
284            if (reader.isEndElement() && reader.getLocalName().equals(name)
285                    && reader.getNamespaceURI().equals(namespace)) {
286                return;
287            } else {
288                // skip all other stuff....
289            }
290            reader.next();
291        }
292    }
293
294    private class Foxml11DatastreamInfo implements DatastreamInfo {
295
296        private String id;
297
298        private String controlGroup;
299
300        private String fedoraUri;
301
302        private String state;
303
304        private boolean versionable;
305
306        private ObjectInfo objectInfo;
307
308        public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) {
309            this.objectInfo = objectInfo;
310            final Map<String, String> attributes
311            = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE");
312            id = attributes.get("ID");
313            controlGroup = attributes.get("CONTROL_GROUP");
314            fedoraUri = attributes.get("FEDORA_URI");
315            state = attributes.get("STATE");
316            versionable = Boolean.valueOf(attributes.get("VERSIONABLE"));
317        }
318
319        @Override
320        public ObjectInfo getObjectInfo() {
321            return objectInfo;
322        }
323
324        @Override
325        public String getDatastreamId() {
326            return id;
327        }
328
329        @Override
330        public String getControlGroup() {
331            return controlGroup;
332        }
333
334        @Override
335        public String getFedoraURI() {
336            return fedoraUri;
337        }
338
339        @Override
340        public String getState() {
341            return state;
342        }
343
344        @Override
345        public boolean getVersionable() {
346            return versionable;
347        }
348    }
349
350    public class Foxml11DatastreamVersion implements DatastreamVersion {
351
        /** The datastream this version belongs to, as passed to the constructor. */
        private DatastreamInfo dsInfo;

        // Values of the ID, LABEL, CREATED, MIMETYPE, ALT_IDS and FORMAT_URI attributes of the
        // datastreamVersion element; null when the attribute is absent.
        private String id;
        private String label;
        private String created;
        // CREATED parsed with Instant.parse, or null when CREATED is absent
        private Instant createdInstant;
        private String mimeType;
        private String altIds;
        private String formatUri;
        // SIZE attribute parsed as a long, or -1 when the attribute is absent
        private long size;
        // set from a contentDigest child element, if one is present
        private ContentDigest contentDigest;
        // set from an xmlContent, contentLocation or binaryContent child element
        private CachedContent dsContent;
        // true once an xmlContent child element has been read
        private boolean isInlineXml = false;
365
366        /**
367         * foxml datastream version.
368         * @param dsInfo the datastream information
369         * @param reader the reader
370         * @throws XMLStreamException xml stream exception
371         */
372        public Foxml11DatastreamVersion(final DatastreamInfo dsInfo,
373                final XMLStreamReader reader) throws XMLStreamException {
374            this.dsInfo = dsInfo;
375            final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL",
376                    "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE");
377            id = dsAttributes.get("ID");
378            label = dsAttributes.get("LABEL");
379            created = dsAttributes.get("CREATED");
380            createdInstant = created != null ? Instant.parse(created) : null;
381            mimeType = dsAttributes.get("MIMETYPE");
382            altIds = dsAttributes.get("ALT_IDS");
383            formatUri = dsAttributes.get("FORMAT_URI");
384            size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1;
385            reader.next();
386
387            while (reader.hasNext()) {
388                if (reader.isCharacters()) {
389                    if (!reader.isWhiteSpace()) {
390                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
391                    } else {
392                        // skip whitespace...
393                    }
394                } else if (reader.isStartElement()) {
395                    final String localName = reader.getLocalName();
396                    if (localName.equals("contentDigest")) {
397                        final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST");
398                        this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST"));
399                    } else if (localName.equals("xmlContent")) {
400                        // this XML fragment may not be valid out of context
401                        // context, so write it out as a complete XML
402                        // file...
403                        reader.next();
404
405                        isInlineXml = true;
406                        dsContent = new MemoryCachedContent(extractInlineXml());
407                    } else if (localName.equals("contentLocation")) {
408                        final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE");
409                        if (attributes.get("TYPE").equals("INTERNAL_ID")) {
410                            dsContent = idResolver.resolveInternalID(attributes.get("REF"));
411                        } else {
412                            try {
413                                String ref = attributes.get("REF");
414                                if (ref.contains("local.fedora.server")) {
415                                    ref = ref.replace("local.fedora.server", localFedoraServer);
416                                }
417                                dsContent = new URLCachedContent(new URL(ref), fetcher);
418                            } catch (final MalformedURLException e) {
419                                throw new RuntimeException(e);
420                            }
421                        }
422                    } else if (localName.equals("binaryContent")) {
423                        try {
424                            final File f = File.createTempFile("decoded", "file");
425                            tempFiles.add(f);
426                            final Base64OutputStream out = new Base64OutputStream(
427                                    new BufferedOutputStream(new FileOutputStream(f)), false);
428                            while (reader.next() == XMLStreamConstants.CHARACTERS) {
429                                out.write(reader.getText().getBytes("UTF-8"));
430                            }
431                            out.flush();
432                            out.close();
433                            dsContent = new FileCachedContent(f);
434                        } catch (final IOException e) {
435                            throw new RuntimeException(e);
436                        }
437                        readUntilClosed("binaryContent", FOXML_NS);
438                    } else {
439                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
440                    }
441                } else if (reader.isEndElement()) {
442                    if (reader.getLocalName().equals("datastreamVersion")) {
443                        return;
444                    }
445                } else {
446                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
447                            + reader.getLocation().getLineNumber() + ", column "
448                            + reader.getLocation().getColumnNumber()
449                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
450                }
451                reader.next();
452            }
453
454        }
455
456        @Override
457        public Optional<File> getFile() {
458            return dsContent.getFile();
459        }
460
461        private String extractInlineXml() throws XMLStreamException {
462            final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader);
463            while (eventReader.hasNext()) {
464                final XMLEvent event = eventReader.nextEvent();
465                if (event.isEndElement()
466                        && event.asEndElement().getName().getLocalPart().equals("xmlContent")
467                        && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) {
468                    break;
469                }
470            }
471
472            return inlineXml.removeFirst();
473        }
474
475        private void validateInlineXml() {
476            if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) {
477
478                if (StringUtils.equals(contentDigest.getType(), "DISABLED")) {
479                    LOG.warn("Datastream Digest DISABLED. Skipping digest validation");
480                    return;
481                }
482
483                final var transformedXml = transformInlineXmlForChecksum();
484                final var digest = DigestUtils.getDigest(contentDigest.getType());
485                final var digestBytes = DigestUtils.digest(digest, transformedXml);
486                final var digestHex = Hex.encodeHexString(digestBytes);
487
488                if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) {
489                    throw new RuntimeException(String.format(
490                            "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s",
491                            dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(),
492                            contentDigest.getType(), contentDigest.getDigest(), digestHex));
493                }
494            }
495        }
496
497        /**
498         * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/
499         * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/
500         * DatastreamXMLMetadata.java#L92
501         *
502         * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order.
503         *
504         * @return the xml in the format Fedora 3 used to calculate digests
505         */
506        private byte[] transformInlineXmlForChecksum() {
507            try {
508                // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :(
509                final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
510                        + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8);
511
512                final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8);
513                final var source = new InputSource(isReader);
514                source.setEncoding("UTF-8");
515
516                final Document doc = documentBuilder.parse(source);
517
518                final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false);
519                // indent == 0 means add no indenting
520                fmt.setIndent(0);
521                // default line width is 72, but only applies when indenting
522                fmt.setLineWidth(0);
523                fmt.setPreserveSpace(false);
524
525                final StringWriter out = new StringWriter();
526                final XMLSerializer ser = new XMLSerializer(out, fmt);
527                ser.serialize(doc);
528                out.close();
529
530                final var baos = new ByteArrayOutputStream();
531                final var br = new BufferedReader(new StringReader(out.toString()));
532                String line;
533                final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8));
534                while ((line = br.readLine()) != null) {
535                    line = line.trim();
536                    outStream.append(line);
537                }
538                outStream.close();
539
540                return baos.toByteArray();
541            } catch (IOException e) {
542                throw new UncheckedIOException(e);
543            } catch (SAXException e) {
544                try {
545                    LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream()));
546                } catch (IOException e2) {
547                    // swallow
548                }
549                throw new RuntimeException(e);
550            }
551        }
552
553        @Override
554        public DatastreamInfo getDatastreamInfo() {
555            return dsInfo;
556        }
557
558        @Override
559        public String getVersionId() {
560            return id;
561        }
562
563        @Override
564        public String getMimeType() {
565            return mimeType;
566        }
567
568        @Override
569        public String getLabel() {
570            return label;
571        }
572
573        @Override
574        public String getCreated() {
575            return created;
576        }
577
578        @Override
579        public Instant getCreatedInstant() {
580            return createdInstant;
581        }
582
583        @Override
584        public String getAltIds() {
585            return altIds;
586        }
587
588        @Override
589        public String getFormatUri() {
590            return formatUri;
591        }
592
593        @Override
594        public long getSize() {
595            return size;
596        }
597
598        @Override
599        public ContentDigest getContentDigest() {
600            // The digests for inline xml do not match what is stored in the FOXML and should not be returned here.
601            if (isInlineXml) {
602                return null;
603            }
604            return contentDigest;
605        }
606
607        @Override
608        public InputStream getContent() throws IOException {
609            return dsContent.getInputStream();
610        }
611
612        @Override
613        public String getExternalOrRedirectURL() {
614            if (dsContent instanceof URLCachedContent) {
615                return ((URLCachedContent) dsContent).getURL().toString();
616            } else {
617                throw new IllegalStateException();
618            }
619        }
620
621        @Override
622        public boolean isFirstVersionIn(final ObjectReference obj) {
623            final List<DatastreamVersion> datastreams =
624                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
625            return datastreams.indexOf(this) == 0;
626        }
627
628        @Override
629        public boolean isLastVersionIn(final ObjectReference obj) {
630            final List<DatastreamVersion> datastreams =
631                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
632            return datastreams.indexOf(this) == datastreams.size() - 1;
633        }
634    }
635
636    private static Map<String, String> getAttributes(final XMLStreamReader r,
637            final String ... allowedNames) {
638        final HashMap<String, String> result = new HashMap<String, String>();
639        final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames));
640        for (int i = 0; i < r.getAttributeCount(); i ++) {
641            final String localName = r.getAttributeLocalName(i);
642            final String value = r.getAttributeValue(i);
643            if (allowed.contains(localName)) {
644                result.put(localName, value);
645            } else {
646                System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\"");
647            }
648        }
649        return result;
650
651    }
652
653}