001/**
002 * The contents of this file are subject to the license and copyright
003 * detailed in the LICENSE and NOTICE files at the root of the source
004 * tree.
005 *
006 */
007package org.fcrepo.migration.foxml;
008
009import io.micrometer.core.instrument.Metrics;
010import io.micrometer.core.instrument.Timer;
011import org.apache.commons.codec.binary.Base64OutputStream;
012import org.apache.commons.codec.binary.Hex;
013import org.apache.commons.codec.digest.DigestUtils;
014import org.apache.commons.io.FileUtils;
015import org.apache.commons.io.IOUtils;
016import org.apache.commons.lang3.StringUtils;
017import org.apache.xml.serialize.OutputFormat;
018import org.apache.xml.serialize.XMLSerializer;
019import org.codehaus.stax2.XMLInputFactory2;
020import org.fcrepo.migration.ContentDigest;
021import org.fcrepo.migration.DatastreamInfo;
022import org.fcrepo.migration.DatastreamVersion;
023import org.fcrepo.migration.DefaultContentDigest;
024import org.fcrepo.migration.DefaultObjectInfo;
025import org.fcrepo.migration.FedoraObjectProcessor;
026import org.fcrepo.migration.ObjectInfo;
027import org.fcrepo.migration.ObjectProperties;
028import org.fcrepo.migration.ObjectReference;
029import org.fcrepo.migration.StreamingFedoraObjectHandler;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032import org.w3c.dom.Document;
033import org.xml.sax.InputSource;
034import org.xml.sax.SAXException;
035
036import javax.xml.bind.JAXBContext;
037import javax.xml.bind.JAXBElement;
038import javax.xml.bind.JAXBException;
039import javax.xml.bind.Unmarshaller;
040import javax.xml.parsers.DocumentBuilder;
041import javax.xml.parsers.DocumentBuilderFactory;
042import javax.xml.parsers.ParserConfigurationException;
043import javax.xml.stream.XMLEventReader;
044import javax.xml.stream.XMLInputFactory;
045import javax.xml.stream.XMLStreamConstants;
046import javax.xml.stream.XMLStreamException;
047import javax.xml.stream.XMLStreamReader;
048import javax.xml.stream.events.XMLEvent;
049import java.io.BufferedInputStream;
050import java.io.BufferedOutputStream;
051import java.io.BufferedReader;
052import java.io.ByteArrayOutputStream;
053import java.io.File;
054import java.io.FileInputStream;
055import java.io.FileNotFoundException;
056import java.io.FileOutputStream;
057import java.io.IOException;
058import java.io.InputStream;
059import java.io.InputStreamReader;
060import java.io.OutputStreamWriter;
061import java.io.PrintWriter;
062import java.io.StringReader;
063import java.io.StringWriter;
064import java.io.UncheckedIOException;
065import java.net.MalformedURLException;
066import java.net.URL;
067import java.nio.charset.StandardCharsets;
068import java.util.ArrayList;
069import java.util.Arrays;
070import java.util.HashMap;
071import java.util.HashSet;
072import java.util.LinkedList;
073import java.util.List;
074import java.util.Map;
075import java.util.Optional;
076import java.util.Set;
077import java.util.regex.Pattern;
078
079/**
080 * A FedoraObjectProcessor implementation that uses the STaX API to process
081 * a FOXML XML InputStream.
082 * @author mdurbin
083 */
084public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor {
085
086    private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class);
087
088    private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>",
089            Pattern.DOTALL);
090
091    private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#";
092
093    private static final String METRIC_NAME = "fcrepo.storage.foxml.object";
094    private static final String OPERATION = "operation";
095    private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject");
096    private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject");
097
098    private URLFetcher fetcher;
099
100    private String localFedoraServer;
101
102    private InternalIDResolver idResolver;
103
104    private File file;
105
106    private InputStream stream;
107
108    private XMLStreamReader reader;
109
110    private DocumentBuilder documentBuilder;
111
112    private List<File> tempFiles;
113
114    private LinkedList<String> inlineXml;
115
116    /**
117     * The basic object information read from the XML stream at construction
118     * time by processing the root XML element and its attributes.
119     */
120    private ObjectInfo objectInfo;
121
122    /**
123     * foxml input stream fedora object processor.
124     * @param file the FOXML file
125     * @param fetcher the fetcher
126     * @param resolver the resolver
127     * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server
128     *                          from which the content exposed by the "is" parameter comes.
129     * @throws XMLStreamException xml stream exception
130     */
131    public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher,
132                                                 final InternalIDResolver resolver, final String localFedoraServer)
133            throws XMLStreamException, FileNotFoundException {
134        this.file = file;
135        this.fetcher = fetcher;
136        this.idResolver = resolver;
137        this.localFedoraServer = localFedoraServer;
138        final XMLInputFactory factory = XMLInputFactory.newFactory();
139        stream = new BufferedInputStream(new FileInputStream(file));
140        reader = factory.createXMLStreamReader(stream);
141        reader.nextTag();
142        final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation");
143        objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath());
144        while (reader.next() == XMLStreamConstants.CHARACTERS) {
145        }
146
147        tempFiles = new ArrayList<File>();
148
149        final var builderFactory = DocumentBuilderFactory.newInstance();
150        builderFactory.setNamespaceAware(true);
151        builderFactory.setIgnoringComments(false);
152        try {
153            documentBuilder = builderFactory.newDocumentBuilder();
154        } catch (ParserConfigurationException e) {
155            throw new RuntimeException(e);
156        }
157
158        try {
159            inlineXml = new LinkedList<>();
160            final var content = FileUtils.readFileToString(file);
161            final var matcher = INLINE_PATTERN.matcher(content);
162            while (matcher.find()) {
163                inlineXml.add(matcher.group(1));
164            }
165        } catch (IOException e) {
166            throw new UncheckedIOException(e);
167        }
168    }
169
170    @Override
171    public ObjectInfo getObjectInfo() {
172        return objectInfo;
173    }
174
175    @Override
176    public void processObject(final StreamingFedoraObjectHandler handler) {
177        final var stopwatch = Timer.start();
178        handler.beginObject(objectInfo);
179        Foxml11DatastreamInfo dsInfo = null;
180        try {
181            handler.processObjectProperties(readProperties());
182            while (reader.hasNext()) {
183                if (reader.isCharacters()) {
184                    if (!reader.isWhiteSpace()) {
185                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
186                    } else {
187                        // skip whitespace...
188                    }
189                } else if (reader.isStartElement()) {
190                    if (reader.getLocalName().equals("datastream")
191                            && reader.getNamespaceURI().equals(FOXML_NS)) {
192                        dsInfo = new Foxml11DatastreamInfo(objectInfo, reader);
193                    } else if (reader.getLocalName().equals("datastreamVersion")) {
194                        final var v = new Foxml11DatastreamVersion(dsInfo, reader);
195                        v.validateInlineXml();
196                        handler.processDatastreamVersion(v);
197                    } else {
198                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
199                    }
200                } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) {
201                    dsInfo = null;
202                } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) {
203                    // end of document....
204                } else {
205                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
206                            + reader.getLocation().getLineNumber() + ", column "
207                            + reader.getLocation().getColumnNumber()
208                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
209                }
210                reader.next();
211            }
212        } catch (Exception e) {
213            abort(handler, e);
214        } finally {
215            stopwatch.stop(processObjectTimer);
216        }
217
218        completeObjectTimer.record(() -> complete(handler));
219    }
220
221    private void complete(final StreamingFedoraObjectHandler handler) {
222        try {
223            handler.completeObject(objectInfo);
224            cleanUpTempFiles();
225        } catch (Exception e) {
226            abort(handler, e);
227        }
228    }
229
230    private void abort(final StreamingFedoraObjectHandler handler, final Exception e) {
231        try {
232            handler.abortObject(objectInfo);
233            if (e instanceof RuntimeException) {
234                throw (RuntimeException) e;
235            }
236            throw new RuntimeException(e);
237        } finally {
238            cleanUpTempFiles();
239            close();
240        }
241    }
242
243    /**
244     * Close resources associated to the processor
245     */
246    public void close() {
247        try {
248            reader.close();
249        } catch (final XMLStreamException e) {
250            LOG.warn("Failed to close reader cleanly", e);
251        }
252        try {
253            stream.close();
254        } catch (IOException e) {
255            LOG.warn("Failed to close file cleanly", e);
256        }
257    }
258
259    private void cleanUpTempFiles() {
260        for (final File f : this.tempFiles) {
261            if (f.exists()) {
262                f.delete();
263            }
264        }
265    }
266
267    private ObjectProperties readProperties() throws JAXBException, XMLStreamException {
268        final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class);
269        final Unmarshaller unmarshaller = jc.createUnmarshaller();
270        final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class);
271        final FoxmlObjectProperties properties = p.getValue();
272        return properties;
273    }
274
275    private void readUntilClosed(final String name, final String namespace) throws XMLStreamException {
276        while (reader.hasNext()) {
277            if (reader.isEndElement() && reader.getLocalName().equals(name)
278                    && reader.getNamespaceURI().equals(namespace)) {
279                return;
280            } else {
281                // skip all other stuff....
282            }
283            reader.next();
284        }
285    }
286
287    private class Foxml11DatastreamInfo implements DatastreamInfo {
288
289        private String id;
290
291        private String controlGroup;
292
293        private String fedoraUri;
294
295        private String state;
296
297        private boolean versionable;
298
299        private ObjectInfo objectInfo;
300
301        public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) {
302            this.objectInfo = objectInfo;
303            final Map<String, String> attributes
304            = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE");
305            id = attributes.get("ID");
306            controlGroup = attributes.get("CONTROL_GROUP");
307            fedoraUri = attributes.get("FEDORA_URI");
308            state = attributes.get("STATE");
309            versionable = Boolean.valueOf(attributes.get("VERSIONABLE"));
310        }
311
312        @Override
313        public ObjectInfo getObjectInfo() {
314            return objectInfo;
315        }
316
317        @Override
318        public String getDatastreamId() {
319            return id;
320        }
321
322        @Override
323        public String getControlGroup() {
324            return controlGroup;
325        }
326
327        @Override
328        public String getFedoraURI() {
329            return fedoraUri;
330        }
331
332        @Override
333        public String getState() {
334            return state;
335        }
336
337        @Override
338        public boolean getVersionable() {
339            return versionable;
340        }
341    }
342
343    public class Foxml11DatastreamVersion implements DatastreamVersion {
344
345        private DatastreamInfo dsInfo;
346
347        private String id;
348        private String label;
349        private String created;
350        private String mimeType;
351        private String altIds;
352        private String formatUri;
353        private long size;
354        private ContentDigest contentDigest;
355        private CachedContent dsContent;
356        private boolean isInlineXml = false;
357
358        /**
359         * foxml datastream version.
360         * @param dsInfo the datastream information
361         * @param reader the reader
362         * @throws XMLStreamException xml stream exception
363         */
364        public Foxml11DatastreamVersion(final DatastreamInfo dsInfo,
365                final XMLStreamReader reader) throws XMLStreamException {
366            this.dsInfo = dsInfo;
367            final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL",
368                    "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE");
369            id = dsAttributes.get("ID");
370            label = dsAttributes.get("LABEL");
371            created = dsAttributes.get("CREATED");
372            mimeType = dsAttributes.get("MIMETYPE");
373            altIds = dsAttributes.get("ALT_IDS");
374            formatUri = dsAttributes.get("FORMAT_URI");
375            size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1;
376            reader.next();
377
378            while (reader.hasNext()) {
379                if (reader.isCharacters()) {
380                    if (!reader.isWhiteSpace()) {
381                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
382                    } else {
383                        // skip whitespace...
384                    }
385                } else if (reader.isStartElement()) {
386                    final String localName = reader.getLocalName();
387                    if (localName.equals("contentDigest")) {
388                        final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST");
389                        this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST"));
390                    } else if (localName.equals("xmlContent")) {
391                        // this XML fragment may not be valid out of context
392                        // context, so write it out as a complete XML
393                        // file...
394                        reader.next();
395
396                        isInlineXml = true;
397                        dsContent = new MemoryCachedContent(extractInlineXml());
398                    } else if (localName.equals("contentLocation")) {
399                        final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE");
400                        if (attributes.get("TYPE").equals("INTERNAL_ID")) {
401                            dsContent = idResolver.resolveInternalID(attributes.get("REF"));
402                        } else {
403                            try {
404                                String ref = attributes.get("REF");
405                                if (ref.contains("local.fedora.server")) {
406                                    ref = ref.replace("local.fedora.server", localFedoraServer);
407                                }
408                                dsContent = new URLCachedContent(new URL(ref), fetcher);
409                            } catch (final MalformedURLException e) {
410                                throw new RuntimeException(e);
411                            }
412                        }
413                    } else if (localName.equals("binaryContent")) {
414                        try {
415                            final File f = File.createTempFile("decoded", "file");
416                            tempFiles.add(f);
417                            final Base64OutputStream out = new Base64OutputStream(
418                                    new BufferedOutputStream(new FileOutputStream(f)), false);
419                            while (reader.next() == XMLStreamConstants.CHARACTERS) {
420                                out.write(reader.getText().getBytes("UTF-8"));
421                            }
422                            out.flush();
423                            out.close();
424                            dsContent = new FileCachedContent(f);
425                        } catch (final IOException e) {
426                            throw new RuntimeException(e);
427                        }
428                        readUntilClosed("binaryContent", FOXML_NS);
429                    } else {
430                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
431                    }
432                } else if (reader.isEndElement()) {
433                    if (reader.getLocalName().equals("datastreamVersion")) {
434                        return;
435                    }
436                } else {
437                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
438                            + reader.getLocation().getLineNumber() + ", column "
439                            + reader.getLocation().getColumnNumber()
440                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
441                }
442                reader.next();
443            }
444
445        }
446
447        @Override
448        public Optional<File> getFile() {
449            return dsContent.getFile();
450        }
451
452        private String extractInlineXml() throws XMLStreamException {
453            final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader);
454            while (eventReader.hasNext()) {
455                final XMLEvent event = eventReader.nextEvent();
456                if (event.isEndElement()
457                        && event.asEndElement().getName().getLocalPart().equals("xmlContent")
458                        && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) {
459                    break;
460                }
461            }
462
463            return inlineXml.removeFirst();
464        }
465
466        private void validateInlineXml() {
467            if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) {
468                final var transformedXml = transformInlineXmlForChecksum();
469                final var digest = DigestUtils.getDigest(contentDigest.getType());
470                final var digestBytes = DigestUtils.digest(digest, transformedXml);
471                final var digestHex = Hex.encodeHexString(digestBytes);
472
473                if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) {
474                    throw new RuntimeException(String.format(
475                            "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s",
476                            dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(),
477                            contentDigest.getType(), contentDigest.getDigest(), digestHex));
478                }
479            }
480        }
481
482        /**
483         * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/
484         * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/
485         * DatastreamXMLMetadata.java#L92
486         *
487         * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order.
488         *
489         * @return the xml in the format Fedora 3 used to calculate digests
490         */
491        private byte[] transformInlineXmlForChecksum() {
492            try {
493                // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :(
494                final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
495                        + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8);
496
497                final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8);
498                final var source = new InputSource(isReader);
499                source.setEncoding("UTF-8");
500
501                final Document doc = documentBuilder.parse(source);
502
503                final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false);
504                // indent == 0 means add no indenting
505                fmt.setIndent(0);
506                // default line width is 72, but only applies when indenting
507                fmt.setLineWidth(0);
508                fmt.setPreserveSpace(false);
509
510                final StringWriter out = new StringWriter();
511                final XMLSerializer ser = new XMLSerializer(out, fmt);
512                ser.serialize(doc);
513                out.close();
514
515                final var baos = new ByteArrayOutputStream();
516                final var br = new BufferedReader(new StringReader(out.toString()));
517                String line;
518                final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8));
519                while ((line = br.readLine()) != null) {
520                    line = line.trim();
521                    outStream.append(line);
522                }
523                outStream.close();
524
525                return baos.toByteArray();
526            } catch (IOException e) {
527                throw new UncheckedIOException(e);
528            } catch (SAXException e) {
529                try {
530                    LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream()));
531                } catch (IOException e2) {
532                    // swallow
533                }
534                throw new RuntimeException(e);
535            }
536        }
537
538        @Override
539        public DatastreamInfo getDatastreamInfo() {
540            return dsInfo;
541        }
542
543        @Override
544        public String getVersionId() {
545            return id;
546        }
547
548        @Override
549        public String getMimeType() {
550            return mimeType;
551        }
552
553        @Override
554        public String getLabel() {
555            return label;
556        }
557
558        @Override
559        public String getCreated() {
560            return created;
561        }
562
563        @Override
564        public String getAltIds() {
565            return altIds;
566        }
567
568        @Override
569        public String getFormatUri() {
570            return formatUri;
571        }
572
573        @Override
574        public long getSize() {
575            return size;
576        }
577
578        @Override
579        public ContentDigest getContentDigest() {
580            // The digests for inline xml do not match what is stored in the FOXML and should not be returned here.
581            if (isInlineXml) {
582                return null;
583            }
584            return contentDigest;
585        }
586
587        @Override
588        public InputStream getContent() throws IOException {
589            return dsContent.getInputStream();
590        }
591
592        @Override
593        public String getExternalOrRedirectURL() {
594            if (dsContent instanceof URLCachedContent) {
595                return ((URLCachedContent) dsContent).getURL().toString();
596            } else {
597                throw new IllegalStateException();
598            }
599        }
600
601        @Override
602        public boolean isFirstVersionIn(final ObjectReference obj) {
603            final List<DatastreamVersion> datastreams =
604                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
605            return datastreams.indexOf(this) == 0;
606        }
607
608        @Override
609        public boolean isLastVersionIn(final ObjectReference obj) {
610            final List<DatastreamVersion> datastreams =
611                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
612            return datastreams.indexOf(this) == datastreams.size() - 1;
613        }
614    }
615
616    private static Map<String, String> getAttributes(final XMLStreamReader r,
617            final String ... allowedNames) {
618        final HashMap<String, String> result = new HashMap<String, String>();
619        final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames));
620        for (int i = 0; i < r.getAttributeCount(); i ++) {
621            final String localName = r.getAttributeLocalName(i);
622            final String value = r.getAttributeValue(i);
623            if (allowed.contains(localName)) {
624                result.put(localName, value);
625            } else {
626                System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\"");
627            }
628        }
629        return result;
630
631    }
632
633}