/*
 * Copyright 2015 DuraSpace, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
016package org.fcrepo.migration.foxml;
017
018import io.micrometer.core.instrument.Metrics;
019import io.micrometer.core.instrument.Timer;
020import org.apache.commons.codec.binary.Base64OutputStream;
021import org.apache.commons.codec.binary.Hex;
022import org.apache.commons.codec.digest.DigestUtils;
023import org.apache.commons.io.FileUtils;
024import org.apache.commons.io.IOUtils;
025import org.apache.commons.lang3.StringUtils;
026import org.apache.xml.serialize.OutputFormat;
027import org.apache.xml.serialize.XMLSerializer;
028import org.codehaus.stax2.XMLInputFactory2;
029import org.fcrepo.migration.ContentDigest;
030import org.fcrepo.migration.DatastreamInfo;
031import org.fcrepo.migration.DatastreamVersion;
032import org.fcrepo.migration.DefaultContentDigest;
033import org.fcrepo.migration.DefaultObjectInfo;
034import org.fcrepo.migration.FedoraObjectProcessor;
035import org.fcrepo.migration.ObjectInfo;
036import org.fcrepo.migration.ObjectProperties;
037import org.fcrepo.migration.ObjectReference;
038import org.fcrepo.migration.StreamingFedoraObjectHandler;
039import org.slf4j.Logger;
040import org.slf4j.LoggerFactory;
041import org.w3c.dom.Document;
042import org.xml.sax.InputSource;
043import org.xml.sax.SAXException;
044
045import javax.xml.bind.JAXBContext;
046import javax.xml.bind.JAXBElement;
047import javax.xml.bind.JAXBException;
048import javax.xml.bind.Unmarshaller;
049import javax.xml.parsers.DocumentBuilder;
050import javax.xml.parsers.DocumentBuilderFactory;
051import javax.xml.parsers.ParserConfigurationException;
052import javax.xml.stream.XMLEventReader;
053import javax.xml.stream.XMLInputFactory;
054import javax.xml.stream.XMLStreamConstants;
055import javax.xml.stream.XMLStreamException;
056import javax.xml.stream.XMLStreamReader;
057import javax.xml.stream.events.XMLEvent;
058import java.io.BufferedInputStream;
059import java.io.BufferedReader;
060import java.io.ByteArrayOutputStream;
061import java.io.File;
062import java.io.FileInputStream;
063import java.io.FileNotFoundException;
064import java.io.FileOutputStream;
065import java.io.IOException;
066import java.io.InputStream;
067import java.io.InputStreamReader;
068import java.io.OutputStreamWriter;
069import java.io.PrintWriter;
070import java.io.StringReader;
071import java.io.StringWriter;
072import java.io.UncheckedIOException;
073import java.net.MalformedURLException;
074import java.net.URL;
075import java.nio.charset.StandardCharsets;
076import java.util.ArrayList;
077import java.util.Arrays;
078import java.util.HashMap;
079import java.util.HashSet;
080import java.util.LinkedList;
081import java.util.List;
082import java.util.Map;
083import java.util.Optional;
084import java.util.Set;
085import java.util.regex.Pattern;
086
087/**
088 * A FedoraObjectProcessor implementation that uses the STaX API to process
089 * a FOXML XML InputStream.
090 * @author mdurbin
091 */
092public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor {
093
094    private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class);
095
096    private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>",
097            Pattern.DOTALL);
098
099    private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#";
100
101    private static final String METRIC_NAME = "fcrepo.storage.foxml.object";
102    private static final String OPERATION = "operation";
103    private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject");
104    private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject");
105
106    private URLFetcher fetcher;
107
108    private String localFedoraServer;
109
110    private InternalIDResolver idResolver;
111
112    private File file;
113
114    private InputStream stream;
115
116    private XMLStreamReader reader;
117
118    private DocumentBuilder documentBuilder;
119
120    private List<File> tempFiles;
121
122    private LinkedList<String> inlineXml;
123
124    /**
125     * The basic object information read from the XML stream at construction
126     * time by processing the root XML element and its attributes.
127     */
128    private ObjectInfo objectInfo;
129
130    /**
131     * foxml input stream fedora object processor.
132     * @param file the FOXML file
133     * @param fetcher the fetcher
134     * @param resolver the resolver
135     * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server
136     *                          from which the content exposed by the "is" parameter comes.
137     * @throws XMLStreamException xml stream exception
138     */
139    public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher,
140                                                 final InternalIDResolver resolver, final String localFedoraServer)
141            throws XMLStreamException, FileNotFoundException {
142        this.file = file;
143        this.fetcher = fetcher;
144        this.idResolver = resolver;
145        this.localFedoraServer = localFedoraServer;
146        final XMLInputFactory factory = XMLInputFactory.newFactory();
147        stream = new BufferedInputStream(new FileInputStream(file));
148        reader = factory.createXMLStreamReader(stream);
149        reader.nextTag();
150        final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation");
151        objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath());
152        while (reader.next() == XMLStreamConstants.CHARACTERS) {
153        }
154
155        tempFiles = new ArrayList<File>();
156
157        final var builderFactory = DocumentBuilderFactory.newInstance();
158        builderFactory.setNamespaceAware(true);
159        builderFactory.setIgnoringComments(false);
160        try {
161            documentBuilder = builderFactory.newDocumentBuilder();
162        } catch (ParserConfigurationException e) {
163            throw new RuntimeException(e);
164        }
165
166        try {
167            inlineXml = new LinkedList<>();
168            final var content = FileUtils.readFileToString(file);
169            final var matcher = INLINE_PATTERN.matcher(content);
170            while (matcher.find()) {
171                inlineXml.add(matcher.group(1));
172            }
173        } catch (IOException e) {
174            throw new UncheckedIOException(e);
175        }
176    }
177
178    @Override
179    public ObjectInfo getObjectInfo() {
180        return objectInfo;
181    }
182
183    @Override
184    public void processObject(final StreamingFedoraObjectHandler handler) {
185        final var stopwatch = Timer.start();
186        handler.beginObject(objectInfo);
187        Foxml11DatastreamInfo dsInfo = null;
188        try {
189            handler.processObjectProperties(readProperties());
190            while (reader.hasNext()) {
191                if (reader.isCharacters()) {
192                    if (!reader.isWhiteSpace()) {
193                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
194                    } else {
195                        // skip whitespace...
196                    }
197                } else if (reader.isStartElement()) {
198                    if (reader.getLocalName().equals("datastream")
199                            && reader.getNamespaceURI().equals(FOXML_NS)) {
200                        dsInfo = new Foxml11DatastreamInfo(objectInfo, reader);
201                    } else if (reader.getLocalName().equals("datastreamVersion")) {
202                        final var v = new Foxml11DatastreamVersion(dsInfo, reader);
203                        v.validateInlineXml();
204                        handler.processDatastreamVersion(v);
205                    } else {
206                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
207                    }
208                } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) {
209                    dsInfo = null;
210                } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) {
211                    // end of document....
212                } else {
213                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
214                            + reader.getLocation().getLineNumber() + ", column "
215                            + reader.getLocation().getColumnNumber()
216                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
217                }
218                reader.next();
219            }
220        } catch (Exception e) {
221            abort(handler, e);
222        } finally {
223            stopwatch.stop(processObjectTimer);
224        }
225
226        completeObjectTimer.record(() -> complete(handler));
227    }
228
229    private void complete(final StreamingFedoraObjectHandler handler) {
230        try {
231            handler.completeObject(objectInfo);
232            cleanUpTempFiles();
233        } catch (Exception e) {
234            abort(handler, e);
235        }
236    }
237
238    private void abort(final StreamingFedoraObjectHandler handler, final Exception e) {
239        try {
240            handler.abortObject(objectInfo);
241            if (e instanceof RuntimeException) {
242                throw (RuntimeException) e;
243            }
244            throw new RuntimeException(e);
245        } finally {
246            cleanUpTempFiles();
247            close();
248        }
249    }
250
251    /**
252     * Close resources associated to the processor
253     */
254    public void close() {
255        try {
256            reader.close();
257        } catch (final XMLStreamException e) {
258            LOG.warn("Failed to close reader cleanly", e);
259        }
260        try {
261            stream.close();
262        } catch (IOException e) {
263            LOG.warn("Failed to close file cleanly", e);
264        }
265    }
266
267    private void cleanUpTempFiles() {
268        for (final File f : this.tempFiles) {
269            if (f.exists()) {
270                f.delete();
271            }
272        }
273    }
274
275    private ObjectProperties readProperties() throws JAXBException, XMLStreamException {
276        final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class);
277        final Unmarshaller unmarshaller = jc.createUnmarshaller();
278        final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class);
279        final FoxmlObjectProperties properties = p.getValue();
280        return properties;
281    }
282
283    private void readUntilClosed(final String name, final String namespace) throws XMLStreamException {
284        while (reader.hasNext()) {
285            if (reader.isEndElement() && reader.getLocalName().equals(name)
286                    && reader.getNamespaceURI().equals(namespace)) {
287                return;
288            } else {
289                // skip all other stuff....
290            }
291            reader.next();
292        }
293    }
294
295    private class Foxml11DatastreamInfo implements DatastreamInfo {
296
297        private String id;
298
299        private String controlGroup;
300
301        private String fedoraUri;
302
303        private String state;
304
305        private boolean versionable;
306
307        private ObjectInfo objectInfo;
308
309        public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) {
310            this.objectInfo = objectInfo;
311            final Map<String, String> attributes
312            = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE");
313            id = attributes.get("ID");
314            controlGroup = attributes.get("CONTROL_GROUP");
315            fedoraUri = attributes.get("FEDORA_URI");
316            state = attributes.get("STATE");
317            versionable = Boolean.valueOf(attributes.get("VERSIONABLE"));
318        }
319
320        @Override
321        public ObjectInfo getObjectInfo() {
322            return objectInfo;
323        }
324
325        @Override
326        public String getDatastreamId() {
327            return id;
328        }
329
330        @Override
331        public String getControlGroup() {
332            return controlGroup;
333        }
334
335        @Override
336        public String getFedoraURI() {
337            return fedoraUri;
338        }
339
340        @Override
341        public String getState() {
342            return state;
343        }
344
345        @Override
346        public boolean getVersionable() {
347            return versionable;
348        }
349    }
350
351    public class Foxml11DatastreamVersion implements DatastreamVersion {
352
353        private DatastreamInfo dsInfo;
354
355        private String id;
356        private String label;
357        private String created;
358        private String mimeType;
359        private String altIds;
360        private String formatUri;
361        private long size;
362        private ContentDigest contentDigest;
363        private CachedContent dsContent;
364        private boolean isInlineXml = false;
365
366        /**
367         * foxml datastream version.
368         * @param dsInfo the datastream information
369         * @param reader the reader
370         * @throws XMLStreamException xml stream exception
371         */
372        public Foxml11DatastreamVersion(final DatastreamInfo dsInfo,
373                final XMLStreamReader reader) throws XMLStreamException {
374            this.dsInfo = dsInfo;
375            final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL",
376                    "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE");
377            id = dsAttributes.get("ID");
378            label = dsAttributes.get("LABEL");
379            created = dsAttributes.get("CREATED");
380            mimeType = dsAttributes.get("MIMETYPE");
381            altIds = dsAttributes.get("ALT_IDS");
382            formatUri = dsAttributes.get("FORMAT_URI");
383            size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1;
384            reader.next();
385
386            while (reader.hasNext()) {
387                if (reader.isCharacters()) {
388                    if (!reader.isWhiteSpace()) {
389                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
390                    } else {
391                        // skip whitespace...
392                    }
393                } else if (reader.isStartElement()) {
394                    final String localName = reader.getLocalName();
395                    if (localName.equals("contentDigest")) {
396                        final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST");
397                        this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST"));
398                    } else if (localName.equals("xmlContent")) {
399                        // this XML fragment may not be valid out of context
400                        // context, so write it out as a complete XML
401                        // file...
402                        reader.next();
403
404                        isInlineXml = true;
405                        dsContent = new MemoryCachedContent(extractInlineXml());
406                    } else if (localName.equals("contentLocation")) {
407                        final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE");
408                        if (attributes.get("TYPE").equals("INTERNAL_ID")) {
409                            dsContent = idResolver.resolveInternalID(attributes.get("REF"));
410                        } else {
411                            try {
412                                String ref = attributes.get("REF");
413                                if (ref.contains("local.fedora.server")) {
414                                    ref = ref.replace("local.fedora.server", localFedoraServer);
415                                }
416                                dsContent = new URLCachedContent(new URL(ref), fetcher);
417                            } catch (final MalformedURLException e) {
418                                throw new RuntimeException(e);
419                            }
420                        }
421                    } else if (localName.equals("binaryContent")) {
422                        try {
423                            final File f = File.createTempFile("decoded", "file");
424                            tempFiles.add(f);
425                            final Base64OutputStream out = new Base64OutputStream(new FileOutputStream(f), false);
426                            while (reader.next() == XMLStreamConstants.CHARACTERS) {
427                                out.write(reader.getText().getBytes("UTF-8"));
428                            }
429                            out.flush();
430                            out.close();
431                            dsContent = new FileCachedContent(f);
432                        } catch (final IOException e) {
433                            throw new RuntimeException(e);
434                        }
435                        readUntilClosed("binaryContent", FOXML_NS);
436                    } else {
437                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
438                    }
439                } else if (reader.isEndElement()) {
440                    if (reader.getLocalName().equals("datastreamVersion")) {
441                        return;
442                    }
443                } else {
444                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
445                            + reader.getLocation().getLineNumber() + ", column "
446                            + reader.getLocation().getColumnNumber()
447                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
448                }
449                reader.next();
450            }
451
452        }
453
454        @Override
455        public Optional<File> getFile() {
456            return dsContent.getFile();
457        }
458
459        private String extractInlineXml() throws XMLStreamException {
460            final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader);
461            while (eventReader.hasNext()) {
462                final XMLEvent event = eventReader.nextEvent();
463                if (event.isEndElement()
464                        && event.asEndElement().getName().getLocalPart().equals("xmlContent")
465                        && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) {
466                    break;
467                }
468            }
469
470            return inlineXml.removeFirst();
471        }
472
473        private void validateInlineXml() {
474            if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) {
475                final var transformedXml = transformInlineXmlForChecksum();
476                final var digest = DigestUtils.getDigest(contentDigest.getType());
477                final var digestBytes = DigestUtils.digest(digest, transformedXml);
478                final var digestHex = Hex.encodeHexString(digestBytes);
479
480                if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) {
481                    throw new RuntimeException(String.format(
482                            "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s",
483                            dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(),
484                            contentDigest.getType(), contentDigest.getDigest(), digestHex));
485                }
486            }
487        }
488
489        /**
490         * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/
491         * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/
492         * DatastreamXMLMetadata.java#L92
493         *
494         * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order.
495         *
496         * @return the xml in the format Fedora 3 used to calculate digests
497         */
498        private byte[] transformInlineXmlForChecksum() {
499            try {
500                // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :(
501                final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
502                        + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8);
503
504                final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8);
505                final var source = new InputSource(isReader);
506                source.setEncoding("UTF-8");
507
508                final Document doc = documentBuilder.parse(source);
509
510                final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false);
511                // indent == 0 means add no indenting
512                fmt.setIndent(0);
513                // default line width is 72, but only applies when indenting
514                fmt.setLineWidth(0);
515                fmt.setPreserveSpace(false);
516
517                final StringWriter out = new StringWriter();
518                final XMLSerializer ser = new XMLSerializer(out, fmt);
519                ser.serialize(doc);
520                out.close();
521
522                final var baos = new ByteArrayOutputStream();
523                final var br = new BufferedReader(new StringReader(out.toString()));
524                String line;
525                final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8));
526                while ((line = br.readLine()) != null) {
527                    line = line.trim();
528                    outStream.append(line);
529                }
530                outStream.close();
531
532                return baos.toByteArray();
533            } catch (IOException e) {
534                throw new UncheckedIOException(e);
535            } catch (SAXException e) {
536                try {
537                    LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream()));
538                } catch (IOException e2) {
539                    // swallow
540                }
541                throw new RuntimeException(e);
542            }
543        }
544
545        @Override
546        public DatastreamInfo getDatastreamInfo() {
547            return dsInfo;
548        }
549
550        @Override
551        public String getVersionId() {
552            return id;
553        }
554
555        @Override
556        public String getMimeType() {
557            return mimeType;
558        }
559
560        @Override
561        public String getLabel() {
562            return label;
563        }
564
565        @Override
566        public String getCreated() {
567            return created;
568        }
569
570        @Override
571        public String getAltIds() {
572            return altIds;
573        }
574
575        @Override
576        public String getFormatUri() {
577            return formatUri;
578        }
579
580        @Override
581        public long getSize() {
582            return size;
583        }
584
585        @Override
586        public ContentDigest getContentDigest() {
587            // The digests for inline xml do not match what is stored in the FOXML and should not be returned here.
588            if (isInlineXml) {
589                return null;
590            }
591            return contentDigest;
592        }
593
594        @Override
595        public InputStream getContent() throws IOException {
596            return dsContent.getInputStream();
597        }
598
599        @Override
600        public String getExternalOrRedirectURL() {
601            if (dsContent instanceof URLCachedContent) {
602                return ((URLCachedContent) dsContent).getURL().toString();
603            } else {
604                throw new IllegalStateException();
605            }
606        }
607
608        @Override
609        public boolean isFirstVersionIn(final ObjectReference obj) {
610            final List<DatastreamVersion> datastreams =
611                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
612            return datastreams.indexOf(this) == 0;
613        }
614
615        @Override
616        public boolean isLastVersionIn(final ObjectReference obj) {
617            final List<DatastreamVersion> datastreams =
618                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
619            return datastreams.indexOf(this) == datastreams.size() - 1;
620        }
621    }
622
623    private static Map<String, String> getAttributes(final XMLStreamReader r,
624            final String ... allowedNames) {
625        final HashMap<String, String> result = new HashMap<String, String>();
626        final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames));
627        for (int i = 0; i < r.getAttributeCount(); i ++) {
628            final String localName = r.getAttributeLocalName(i);
629            final String value = r.getAttributeValue(i);
630            if (allowed.contains(localName)) {
631                result.put(localName, value);
632            } else {
633                System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\"");
634            }
635        }
636        return result;
637
638    }
639
640}