001/**
002 * The contents of this file are subject to the license and copyright
003 * detailed in the LICENSE and NOTICE files at the root of the source
004 * tree.
005 *
006 */
007package org.fcrepo.migration.foxml;
008
009import io.micrometer.core.instrument.Metrics;
010import io.micrometer.core.instrument.Timer;
011import org.apache.commons.codec.binary.Base64OutputStream;
012import org.apache.commons.codec.binary.Hex;
013import org.apache.commons.codec.digest.DigestUtils;
014import org.apache.commons.io.FileUtils;
015import org.apache.commons.io.IOUtils;
016import org.apache.commons.lang3.StringUtils;
017import org.apache.xml.serialize.OutputFormat;
018import org.apache.xml.serialize.XMLSerializer;
019import org.codehaus.stax2.XMLInputFactory2;
020import org.fcrepo.migration.ContentDigest;
021import org.fcrepo.migration.DatastreamInfo;
022import org.fcrepo.migration.DatastreamVersion;
023import org.fcrepo.migration.DefaultContentDigest;
024import org.fcrepo.migration.DefaultObjectInfo;
025import org.fcrepo.migration.FedoraObjectProcessor;
026import org.fcrepo.migration.ObjectInfo;
027import org.fcrepo.migration.ObjectProperties;
028import org.fcrepo.migration.ObjectReference;
029import org.fcrepo.migration.StreamingFedoraObjectHandler;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032import org.w3c.dom.Document;
033import org.xml.sax.InputSource;
034import org.xml.sax.SAXException;
035
036import javax.xml.bind.JAXBContext;
037import javax.xml.bind.JAXBElement;
038import javax.xml.bind.JAXBException;
039import javax.xml.bind.Unmarshaller;
040import javax.xml.parsers.DocumentBuilder;
041import javax.xml.parsers.DocumentBuilderFactory;
042import javax.xml.parsers.ParserConfigurationException;
043import javax.xml.stream.XMLEventReader;
044import javax.xml.stream.XMLInputFactory;
045import javax.xml.stream.XMLStreamConstants;
046import javax.xml.stream.XMLStreamException;
047import javax.xml.stream.XMLStreamReader;
048import javax.xml.stream.events.XMLEvent;
049import java.io.BufferedInputStream;
050import java.io.BufferedOutputStream;
051import java.io.BufferedReader;
052import java.io.ByteArrayOutputStream;
053import java.io.File;
054import java.io.FileInputStream;
055import java.io.FileNotFoundException;
056import java.io.FileOutputStream;
057import java.io.IOException;
058import java.io.InputStream;
059import java.io.InputStreamReader;
060import java.io.OutputStreamWriter;
061import java.io.PrintWriter;
062import java.io.StringReader;
063import java.io.StringWriter;
064import java.io.UncheckedIOException;
065import java.net.MalformedURLException;
066import java.net.URL;
067import java.nio.charset.StandardCharsets;
068import java.time.Instant;
069import java.util.ArrayList;
070import java.util.Arrays;
071import java.util.HashMap;
072import java.util.HashSet;
073import java.util.LinkedList;
074import java.util.List;
075import java.util.Map;
076import java.util.Optional;
077import java.util.Set;
078import java.util.regex.Pattern;
079
080/**
 * A FedoraObjectProcessor implementation that uses the StAX API to process
082 * a FOXML XML InputStream.
083 * @author mdurbin
084 */
085public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor {
086
087    private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class);
088
089    private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>",
090            Pattern.DOTALL);
091
092    private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#";
093
094    private static final String METRIC_NAME = "fcrepo.storage.foxml.object";
095    private static final String OPERATION = "operation";
096    private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject");
097    private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject");
098
099    private URLFetcher fetcher;
100
101    private String localFedoraServer;
102
103    private InternalIDResolver idResolver;
104
105    private File file;
106
107    private InputStream stream;
108
109    private XMLStreamReader reader;
110
111    private DocumentBuilder documentBuilder;
112
113    private List<File> tempFiles;
114
115    private LinkedList<String> inlineXml;
116
117    /**
118     * The basic object information read from the XML stream at construction
119     * time by processing the root XML element and its attributes.
120     */
121    private ObjectInfo objectInfo;
122
123    /**
124     * foxml input stream fedora object processor.
125     * @param file the FOXML file
126     * @param fetcher the fetcher
127     * @param resolver the resolver
128     * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server
129     *                          from which the content exposed by the "is" parameter comes.
130     * @throws XMLStreamException xml stream exception
131     */
132    public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher,
133                                                 final InternalIDResolver resolver, final String localFedoraServer)
134            throws XMLStreamException, FileNotFoundException {
135        this.file = file;
136        this.fetcher = fetcher;
137        this.idResolver = resolver;
138        this.localFedoraServer = localFedoraServer;
139        final XMLInputFactory factory = XMLInputFactory.newFactory();
140        stream = new BufferedInputStream(new FileInputStream(file));
141        reader = factory.createXMLStreamReader(stream);
142        reader.nextTag();
143        final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation");
144        objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath());
145        while (reader.next() == XMLStreamConstants.CHARACTERS) {
146        }
147
148        tempFiles = new ArrayList<File>();
149
150        final var builderFactory = DocumentBuilderFactory.newInstance();
151        builderFactory.setNamespaceAware(true);
152        builderFactory.setIgnoringComments(false);
153        try {
154            documentBuilder = builderFactory.newDocumentBuilder();
155        } catch (ParserConfigurationException e) {
156            throw new RuntimeException(e);
157        }
158
159        try {
160            inlineXml = new LinkedList<>();
161            final var content = FileUtils.readFileToString(file);
162            final var matcher = INLINE_PATTERN.matcher(content);
163            while (matcher.find()) {
164                inlineXml.add(matcher.group(1));
165            }
166        } catch (IOException e) {
167            throw new UncheckedIOException(e);
168        }
169    }
170
171    @Override
172    public ObjectInfo getObjectInfo() {
173        return objectInfo;
174    }
175
176    @Override
177    public void processObject(final StreamingFedoraObjectHandler handler) {
178        final var stopwatch = Timer.start();
179        handler.beginObject(objectInfo);
180        Foxml11DatastreamInfo dsInfo = null;
181        try {
182            handler.processObjectProperties(readProperties());
183            while (reader.hasNext()) {
184                if (reader.isCharacters()) {
185                    if (!reader.isWhiteSpace()) {
186                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
187                    } else {
188                        // skip whitespace...
189                    }
190                } else if (reader.isStartElement()) {
191                    if (reader.getLocalName().equals("datastream")
192                            && reader.getNamespaceURI().equals(FOXML_NS)) {
193                        dsInfo = new Foxml11DatastreamInfo(objectInfo, reader);
194                    } else if (reader.getLocalName().equals("datastreamVersion")) {
195                        final var v = new Foxml11DatastreamVersion(dsInfo, reader);
196                        v.validateInlineXml();
197                        handler.processDatastreamVersion(v);
198                    } else {
199                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
200                    }
201                } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) {
202                    dsInfo = null;
203                } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) {
204                    // end of document....
205                } else {
206                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
207                            + reader.getLocation().getLineNumber() + ", column "
208                            + reader.getLocation().getColumnNumber()
209                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
210                }
211                reader.next();
212            }
213        } catch (Exception e) {
214            abort(handler, e);
215        } finally {
216            stopwatch.stop(processObjectTimer);
217        }
218
219        completeObjectTimer.record(() -> complete(handler));
220    }
221
222    private void complete(final StreamingFedoraObjectHandler handler) {
223        try {
224            handler.completeObject(objectInfo);
225            cleanUpTempFiles();
226        } catch (Exception e) {
227            abort(handler, e);
228        }
229    }
230
231    private void abort(final StreamingFedoraObjectHandler handler, final Exception e) {
232        try {
233            handler.abortObject(objectInfo);
234            if (e instanceof RuntimeException) {
235                throw (RuntimeException) e;
236            }
237            throw new RuntimeException(e);
238        } finally {
239            cleanUpTempFiles();
240            close();
241        }
242    }
243
244    /**
245     * Close resources associated to the processor
246     */
247    public void close() {
248        try {
249            reader.close();
250        } catch (final XMLStreamException e) {
251            LOG.warn("Failed to close reader cleanly", e);
252        }
253        try {
254            stream.close();
255        } catch (IOException e) {
256            LOG.warn("Failed to close file cleanly", e);
257        }
258    }
259
260    private void cleanUpTempFiles() {
261        for (final File f : this.tempFiles) {
262            if (f.exists()) {
263                f.delete();
264            }
265        }
266    }
267
268    private ObjectProperties readProperties() throws JAXBException, XMLStreamException {
269        final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class);
270        final Unmarshaller unmarshaller = jc.createUnmarshaller();
271        final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class);
272        final FoxmlObjectProperties properties = p.getValue();
273        return properties;
274    }
275
276    private void readUntilClosed(final String name, final String namespace) throws XMLStreamException {
277        while (reader.hasNext()) {
278            if (reader.isEndElement() && reader.getLocalName().equals(name)
279                    && reader.getNamespaceURI().equals(namespace)) {
280                return;
281            } else {
282                // skip all other stuff....
283            }
284            reader.next();
285        }
286    }
287
288    private class Foxml11DatastreamInfo implements DatastreamInfo {
289
290        private String id;
291
292        private String controlGroup;
293
294        private String fedoraUri;
295
296        private String state;
297
298        private boolean versionable;
299
300        private ObjectInfo objectInfo;
301
302        public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) {
303            this.objectInfo = objectInfo;
304            final Map<String, String> attributes
305            = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE");
306            id = attributes.get("ID");
307            controlGroup = attributes.get("CONTROL_GROUP");
308            fedoraUri = attributes.get("FEDORA_URI");
309            state = attributes.get("STATE");
310            versionable = Boolean.valueOf(attributes.get("VERSIONABLE"));
311        }
312
313        @Override
314        public ObjectInfo getObjectInfo() {
315            return objectInfo;
316        }
317
318        @Override
319        public String getDatastreamId() {
320            return id;
321        }
322
323        @Override
324        public String getControlGroup() {
325            return controlGroup;
326        }
327
328        @Override
329        public String getFedoraURI() {
330            return fedoraUri;
331        }
332
333        @Override
334        public String getState() {
335            return state;
336        }
337
338        @Override
339        public boolean getVersionable() {
340            return versionable;
341        }
342    }
343
344    public class Foxml11DatastreamVersion implements DatastreamVersion {
345
346        private DatastreamInfo dsInfo;
347
348        private String id;
349        private String label;
350        private String created;
351        private Instant createdInstant;
352        private String mimeType;
353        private String altIds;
354        private String formatUri;
355        private long size;
356        private ContentDigest contentDigest;
357        private CachedContent dsContent;
358        private boolean isInlineXml = false;
359
360        /**
361         * foxml datastream version.
362         * @param dsInfo the datastream information
363         * @param reader the reader
364         * @throws XMLStreamException xml stream exception
365         */
366        public Foxml11DatastreamVersion(final DatastreamInfo dsInfo,
367                final XMLStreamReader reader) throws XMLStreamException {
368            this.dsInfo = dsInfo;
369            final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL",
370                    "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE");
371            id = dsAttributes.get("ID");
372            label = dsAttributes.get("LABEL");
373            created = dsAttributes.get("CREATED");
374            createdInstant = created != null ? Instant.parse(created) : null;
375            mimeType = dsAttributes.get("MIMETYPE");
376            altIds = dsAttributes.get("ALT_IDS");
377            formatUri = dsAttributes.get("FORMAT_URI");
378            size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1;
379            reader.next();
380
381            while (reader.hasNext()) {
382                if (reader.isCharacters()) {
383                    if (!reader.isWhiteSpace()) {
384                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
385                    } else {
386                        // skip whitespace...
387                    }
388                } else if (reader.isStartElement()) {
389                    final String localName = reader.getLocalName();
390                    if (localName.equals("contentDigest")) {
391                        final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST");
392                        this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST"));
393                    } else if (localName.equals("xmlContent")) {
394                        // this XML fragment may not be valid out of context
395                        // context, so write it out as a complete XML
396                        // file...
397                        reader.next();
398
399                        isInlineXml = true;
400                        dsContent = new MemoryCachedContent(extractInlineXml());
401                    } else if (localName.equals("contentLocation")) {
402                        final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE");
403                        if (attributes.get("TYPE").equals("INTERNAL_ID")) {
404                            dsContent = idResolver.resolveInternalID(attributes.get("REF"));
405                        } else {
406                            try {
407                                String ref = attributes.get("REF");
408                                if (ref.contains("local.fedora.server")) {
409                                    ref = ref.replace("local.fedora.server", localFedoraServer);
410                                }
411                                dsContent = new URLCachedContent(new URL(ref), fetcher);
412                            } catch (final MalformedURLException e) {
413                                throw new RuntimeException(e);
414                            }
415                        }
416                    } else if (localName.equals("binaryContent")) {
417                        try {
418                            final File f = File.createTempFile("decoded", "file");
419                            tempFiles.add(f);
420                            final Base64OutputStream out = new Base64OutputStream(
421                                    new BufferedOutputStream(new FileOutputStream(f)), false);
422                            while (reader.next() == XMLStreamConstants.CHARACTERS) {
423                                out.write(reader.getText().getBytes("UTF-8"));
424                            }
425                            out.flush();
426                            out.close();
427                            dsContent = new FileCachedContent(f);
428                        } catch (final IOException e) {
429                            throw new RuntimeException(e);
430                        }
431                        readUntilClosed("binaryContent", FOXML_NS);
432                    } else {
433                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
434                    }
435                } else if (reader.isEndElement()) {
436                    if (reader.getLocalName().equals("datastreamVersion")) {
437                        return;
438                    }
439                } else {
440                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
441                            + reader.getLocation().getLineNumber() + ", column "
442                            + reader.getLocation().getColumnNumber()
443                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
444                }
445                reader.next();
446            }
447
448        }
449
450        @Override
451        public Optional<File> getFile() {
452            return dsContent.getFile();
453        }
454
455        private String extractInlineXml() throws XMLStreamException {
456            final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader);
457            while (eventReader.hasNext()) {
458                final XMLEvent event = eventReader.nextEvent();
459                if (event.isEndElement()
460                        && event.asEndElement().getName().getLocalPart().equals("xmlContent")
461                        && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) {
462                    break;
463                }
464            }
465
466            return inlineXml.removeFirst();
467        }
468
469        private void validateInlineXml() {
470            if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) {
471                final var transformedXml = transformInlineXmlForChecksum();
472                final var digest = DigestUtils.getDigest(contentDigest.getType());
473                final var digestBytes = DigestUtils.digest(digest, transformedXml);
474                final var digestHex = Hex.encodeHexString(digestBytes);
475
476                if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) {
477                    throw new RuntimeException(String.format(
478                            "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s",
479                            dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(),
480                            contentDigest.getType(), contentDigest.getDigest(), digestHex));
481                }
482            }
483        }
484
485        /**
486         * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/
487         * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/
488         * DatastreamXMLMetadata.java#L92
489         *
490         * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order.
491         *
492         * @return the xml in the format Fedora 3 used to calculate digests
493         */
494        private byte[] transformInlineXmlForChecksum() {
495            try {
496                // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :(
497                final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
498                        + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8);
499
500                final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8);
501                final var source = new InputSource(isReader);
502                source.setEncoding("UTF-8");
503
504                final Document doc = documentBuilder.parse(source);
505
506                final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false);
507                // indent == 0 means add no indenting
508                fmt.setIndent(0);
509                // default line width is 72, but only applies when indenting
510                fmt.setLineWidth(0);
511                fmt.setPreserveSpace(false);
512
513                final StringWriter out = new StringWriter();
514                final XMLSerializer ser = new XMLSerializer(out, fmt);
515                ser.serialize(doc);
516                out.close();
517
518                final var baos = new ByteArrayOutputStream();
519                final var br = new BufferedReader(new StringReader(out.toString()));
520                String line;
521                final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8));
522                while ((line = br.readLine()) != null) {
523                    line = line.trim();
524                    outStream.append(line);
525                }
526                outStream.close();
527
528                return baos.toByteArray();
529            } catch (IOException e) {
530                throw new UncheckedIOException(e);
531            } catch (SAXException e) {
532                try {
533                    LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream()));
534                } catch (IOException e2) {
535                    // swallow
536                }
537                throw new RuntimeException(e);
538            }
539        }
540
541        @Override
542        public DatastreamInfo getDatastreamInfo() {
543            return dsInfo;
544        }
545
546        @Override
547        public String getVersionId() {
548            return id;
549        }
550
551        @Override
552        public String getMimeType() {
553            return mimeType;
554        }
555
556        @Override
557        public String getLabel() {
558            return label;
559        }
560
561        @Override
562        public String getCreated() {
563            return created;
564        }
565
566        @Override
567        public Instant getCreatedInstant() {
568            return createdInstant;
569        }
570
571        @Override
572        public String getAltIds() {
573            return altIds;
574        }
575
576        @Override
577        public String getFormatUri() {
578            return formatUri;
579        }
580
581        @Override
582        public long getSize() {
583            return size;
584        }
585
586        @Override
587        public ContentDigest getContentDigest() {
588            // The digests for inline xml do not match what is stored in the FOXML and should not be returned here.
589            if (isInlineXml) {
590                return null;
591            }
592            return contentDigest;
593        }
594
595        @Override
596        public InputStream getContent() throws IOException {
597            return dsContent.getInputStream();
598        }
599
600        @Override
601        public String getExternalOrRedirectURL() {
602            if (dsContent instanceof URLCachedContent) {
603                return ((URLCachedContent) dsContent).getURL().toString();
604            } else {
605                throw new IllegalStateException();
606            }
607        }
608
609        @Override
610        public boolean isFirstVersionIn(final ObjectReference obj) {
611            final List<DatastreamVersion> datastreams =
612                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
613            return datastreams.indexOf(this) == 0;
614        }
615
616        @Override
617        public boolean isLastVersionIn(final ObjectReference obj) {
618            final List<DatastreamVersion> datastreams =
619                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
620            return datastreams.indexOf(this) == datastreams.size() - 1;
621        }
622    }
623
624    private static Map<String, String> getAttributes(final XMLStreamReader r,
625            final String ... allowedNames) {
626        final HashMap<String, String> result = new HashMap<String, String>();
627        final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames));
628        for (int i = 0; i < r.getAttributeCount(); i ++) {
629            final String localName = r.getAttributeLocalName(i);
630            final String value = r.getAttributeValue(i);
631            if (allowed.contains(localName)) {
632                result.put(localName, value);
633            } else {
634                System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\"");
635            }
636        }
637        return result;
638
639    }
640
641}