001/*
002 * Copyright 2015 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.fcrepo.migration.foxml;
017
018import org.apache.commons.codec.binary.Base64OutputStream;
019import org.apache.commons.codec.binary.Hex;
020import org.apache.commons.codec.digest.DigestUtils;
021import org.apache.commons.io.FileUtils;
022import org.apache.commons.io.IOUtils;
023import org.apache.commons.lang3.StringUtils;
024import org.apache.xml.serialize.OutputFormat;
025import org.apache.xml.serialize.XMLSerializer;
026import org.codehaus.stax2.XMLInputFactory2;
027import org.fcrepo.migration.ContentDigest;
028import org.fcrepo.migration.DatastreamInfo;
029import org.fcrepo.migration.DatastreamVersion;
030import org.fcrepo.migration.DefaultContentDigest;
031import org.fcrepo.migration.DefaultObjectInfo;
032import org.fcrepo.migration.FedoraObjectProcessor;
033import org.fcrepo.migration.ObjectInfo;
034import org.fcrepo.migration.ObjectProperties;
035import org.fcrepo.migration.ObjectReference;
036import org.fcrepo.migration.StreamingFedoraObjectHandler;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039import org.w3c.dom.Document;
040import org.xml.sax.InputSource;
041import org.xml.sax.SAXException;
042
043import javax.xml.bind.JAXBContext;
044import javax.xml.bind.JAXBElement;
045import javax.xml.bind.JAXBException;
046import javax.xml.bind.Unmarshaller;
047import javax.xml.parsers.DocumentBuilder;
048import javax.xml.parsers.DocumentBuilderFactory;
049import javax.xml.parsers.ParserConfigurationException;
050import javax.xml.stream.XMLEventReader;
051import javax.xml.stream.XMLInputFactory;
052import javax.xml.stream.XMLStreamConstants;
053import javax.xml.stream.XMLStreamException;
054import javax.xml.stream.XMLStreamReader;
055import javax.xml.stream.events.XMLEvent;
056import java.io.BufferedInputStream;
057import java.io.BufferedReader;
058import java.io.ByteArrayOutputStream;
059import java.io.File;
060import java.io.FileInputStream;
061import java.io.FileNotFoundException;
062import java.io.FileOutputStream;
063import java.io.IOException;
064import java.io.InputStream;
065import java.io.InputStreamReader;
066import java.io.OutputStreamWriter;
067import java.io.PrintWriter;
068import java.io.StringReader;
069import java.io.StringWriter;
070import java.io.UncheckedIOException;
071import java.net.MalformedURLException;
072import java.net.URL;
073import java.nio.charset.StandardCharsets;
074import java.util.ArrayList;
075import java.util.Arrays;
076import java.util.HashMap;
077import java.util.HashSet;
078import java.util.LinkedList;
079import java.util.List;
080import java.util.Map;
081import java.util.Set;
082import java.util.regex.Pattern;
083
084/**
085 * A FedoraObjectProcessor implementation that uses the STaX API to process
086 * a FOXML XML InputStream.
087 * @author mdurbin
088 */
089public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor {
090
091    private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class);
092
093    private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>",
094            Pattern.DOTALL);
095
096    private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#";
097
098    private URLFetcher fetcher;
099
100    private String localFedoraServer;
101
102    private InternalIDResolver idResolver;
103
104    private File file;
105
106    private InputStream stream;
107
108    private XMLStreamReader reader;
109
110    private DocumentBuilder documentBuilder;
111
112    private List<File> tempFiles;
113
114    private LinkedList<String> inlineXml;
115
116    /**
117     * The basic object information read from the XML stream at construction
118     * time by processing the root XML element and its attributes.
119     */
120    private ObjectInfo objectInfo;
121
122    /**
123     * foxml input stream fedora object processor.
124     * @param file the FOXML file
125     * @param fetcher the fetcher
126     * @param resolver the resolver
127     * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server
128     *                          from which the content exposed by the "is" parameter comes.
129     * @throws XMLStreamException xml stream exception
130     */
131    public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher,
132                                                 final InternalIDResolver resolver, final String localFedoraServer)
133            throws XMLStreamException, FileNotFoundException {
134        this.file = file;
135        this.fetcher = fetcher;
136        this.idResolver = resolver;
137        this.localFedoraServer = localFedoraServer;
138        final XMLInputFactory factory = XMLInputFactory.newFactory();
139        stream = new BufferedInputStream(new FileInputStream(file));
140        reader = factory.createXMLStreamReader(stream);
141        reader.nextTag();
142        final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation");
143        objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath());
144        while (reader.next() == XMLStreamConstants.CHARACTERS) {
145        }
146
147        tempFiles = new ArrayList<File>();
148
149        final var builderFactory = DocumentBuilderFactory.newInstance();
150        builderFactory.setNamespaceAware(true);
151        builderFactory.setIgnoringComments(false);
152        try {
153            documentBuilder = builderFactory.newDocumentBuilder();
154        } catch (ParserConfigurationException e) {
155            throw new RuntimeException(e);
156        }
157
158        try {
159            inlineXml = new LinkedList<>();
160            final var content = FileUtils.readFileToString(file);
161            final var matcher = INLINE_PATTERN.matcher(content);
162            while (matcher.find()) {
163                inlineXml.add(matcher.group(1));
164            }
165        } catch (IOException e) {
166            throw new UncheckedIOException(e);
167        }
168    }
169
170    @Override
171    public ObjectInfo getObjectInfo() {
172        return objectInfo;
173    }
174
175    @Override
176    public void processObject(final StreamingFedoraObjectHandler handler) {
177        handler.beginObject(objectInfo);
178        Foxml11DatastreamInfo dsInfo = null;
179        try {
180            handler.processObjectProperties(readProperties());
181            while (reader.hasNext()) {
182                if (reader.isCharacters()) {
183                    if (!reader.isWhiteSpace()) {
184                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
185                    } else {
186                        // skip whitespace...
187                    }
188                } else if (reader.isStartElement()) {
189                    if (reader.getLocalName().equals("datastream")
190                            && reader.getNamespaceURI().equals(FOXML_NS)) {
191                        dsInfo = new Foxml11DatastreamInfo(objectInfo, reader);
192                    } else if (reader.getLocalName().equals("datastreamVersion")) {
193                        final var v = new Foxml11DatastreamVersion(dsInfo, reader);
194                        v.validateInlineXml();
195                        handler.processDatastreamVersion(v);
196                    } else {
197                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
198                    }
199                } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) {
200                    dsInfo = null;
201                } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) {
202                    // end of document....
203                    handler.completeObject(objectInfo);
204                    cleanUpTempFiles();
205                } else {
206                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
207                            + reader.getLocation().getLineNumber() + ", column "
208                            + reader.getLocation().getColumnNumber()
209                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
210                }
211                reader.next();
212            }
213
214        } catch (Exception e) {
215            handler.abortObject(objectInfo);
216            if (e instanceof RuntimeException) {
217                throw (RuntimeException) e;
218            }
219            throw new RuntimeException(e);
220        } finally {
221            cleanUpTempFiles();
222            close();
223        }
224    }
225
226    /**
227     * Close resources associated to the processor
228     */
229    public void close() {
230        try {
231            reader.close();
232        } catch (final XMLStreamException e) {
233            LOG.warn("Failed to close reader cleanly", e);
234        }
235        try {
236            stream.close();
237        } catch (IOException e) {
238            LOG.warn("Failed to close file cleanly", e);
239        }
240    }
241
242    private void cleanUpTempFiles() {
243        for (final File f : this.tempFiles) {
244            if (f.exists()) {
245                f.delete();
246            }
247        }
248    }
249
250    private ObjectProperties readProperties() throws JAXBException, XMLStreamException {
251        final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class);
252        final Unmarshaller unmarshaller = jc.createUnmarshaller();
253        final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class);
254        final FoxmlObjectProperties properties = p.getValue();
255        return properties;
256    }
257
258    private void readUntilClosed(final String name, final String namespace) throws XMLStreamException {
259        while (reader.hasNext()) {
260            if (reader.isEndElement() && reader.getLocalName().equals(name)
261                    && reader.getNamespaceURI().equals(namespace)) {
262                return;
263            } else {
264                // skip all other stuff....
265            }
266            reader.next();
267        }
268    }
269
270    private class Foxml11DatastreamInfo implements DatastreamInfo {
271
272        private String id;
273
274        private String controlGroup;
275
276        private String fedoraUri;
277
278        private String state;
279
280        private boolean versionable;
281
282        private ObjectInfo objectInfo;
283
284        public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) {
285            this.objectInfo = objectInfo;
286            final Map<String, String> attributes
287            = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE");
288            id = attributes.get("ID");
289            controlGroup = attributes.get("CONTROL_GROUP");
290            fedoraUri = attributes.get("FEDORA_URI");
291            state = attributes.get("STATE");
292            versionable = Boolean.valueOf(attributes.get("VERSIONABLE"));
293        }
294
295        @Override
296        public ObjectInfo getObjectInfo() {
297            return objectInfo;
298        }
299
300        @Override
301        public String getDatastreamId() {
302            return id;
303        }
304
305        @Override
306        public String getControlGroup() {
307            return controlGroup;
308        }
309
310        @Override
311        public String getFedoraURI() {
312            return fedoraUri;
313        }
314
315        @Override
316        public String getState() {
317            return state;
318        }
319
320        @Override
321        public boolean getVersionable() {
322            return versionable;
323        }
324    }
325
326    public class Foxml11DatastreamVersion implements DatastreamVersion {
327
328        private DatastreamInfo dsInfo;
329
330        private String id;
331        private String label;
332        private String created;
333        private String mimeType;
334        private String altIds;
335        private String formatUri;
336        private long size;
337        private ContentDigest contentDigest;
338        private CachedContent dsContent;
339        private boolean isInlineXml = false;
340
341        /**
342         * foxml datastream version.
343         * @param dsInfo the datastream information
344         * @param reader the reader
345         * @throws XMLStreamException xml stream exception
346         */
347        public Foxml11DatastreamVersion(final DatastreamInfo dsInfo,
348                final XMLStreamReader reader) throws XMLStreamException {
349            this.dsInfo = dsInfo;
350            final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL",
351                    "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE");
352            id = dsAttributes.get("ID");
353            label = dsAttributes.get("LABEL");
354            created = dsAttributes.get("CREATED");
355            mimeType = dsAttributes.get("MIMETYPE");
356            altIds = dsAttributes.get("ALT_IDS");
357            formatUri = dsAttributes.get("FORMAT_URI");
358            size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1;
359            reader.next();
360
361            while (reader.hasNext()) {
362                if (reader.isCharacters()) {
363                    if (!reader.isWhiteSpace()) {
364                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
365                    } else {
366                        // skip whitespace...
367                    }
368                } else if (reader.isStartElement()) {
369                    final String localName = reader.getLocalName();
370                    if (localName.equals("contentDigest")) {
371                        final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST");
372                        this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST"));
373                    } else if (localName.equals("xmlContent")) {
374                        // this XML fragment may not be valid out of context
375                        // context, so write it out as a complete XML
376                        // file...
377                        reader.next();
378
379                        isInlineXml = true;
380                        dsContent = new MemoryCachedContent(extractInlineXml());
381                    } else if (localName.equals("contentLocation")) {
382                        final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE");
383                        if (attributes.get("TYPE").equals("INTERNAL_ID")) {
384                            dsContent = idResolver.resolveInternalID(attributes.get("REF"));
385                        } else {
386                            try {
387                                String ref = attributes.get("REF");
388                                if (ref.contains("local.fedora.server")) {
389                                    ref = ref.replace("local.fedora.server", localFedoraServer);
390                                }
391                                dsContent = new URLCachedContent(new URL(ref), fetcher);
392                            } catch (final MalformedURLException e) {
393                                throw new RuntimeException(e);
394                            }
395                        }
396                    } else if (localName.equals("binaryContent")) {
397                        try {
398                            final File f = File.createTempFile("decoded", "file");
399                            tempFiles.add(f);
400                            final Base64OutputStream out = new Base64OutputStream(new FileOutputStream(f), false);
401                            while (reader.next() == XMLStreamConstants.CHARACTERS) {
402                                out.write(reader.getText().getBytes("UTF-8"));
403                            }
404                            out.flush();
405                            out.close();
406                            dsContent = new FileCachedContent(f);
407                        } catch (final IOException e) {
408                            throw new RuntimeException(e);
409                        }
410                        readUntilClosed("binaryContent", FOXML_NS);
411                    } else {
412                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
413                    }
414                } else if (reader.isEndElement()) {
415                    if (reader.getLocalName().equals("datastreamVersion")) {
416                        return;
417                    }
418                } else {
419                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
420                            + reader.getLocation().getLineNumber() + ", column "
421                            + reader.getLocation().getColumnNumber()
422                            + "!" + (reader.isCharacters() ? "  \"" + reader.getText() + "\"" : ""));
423                }
424                reader.next();
425            }
426
427        }
428
429        private String extractInlineXml() throws XMLStreamException {
430            final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader);
431            while (eventReader.hasNext()) {
432                final XMLEvent event = eventReader.nextEvent();
433                if (event.isEndElement()
434                        && event.asEndElement().getName().getLocalPart().equals("xmlContent")
435                        && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) {
436                    break;
437                }
438            }
439
440            return inlineXml.removeFirst();
441        }
442
443        private void validateInlineXml() {
444            if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) {
445                final var transformedXml = transformInlineXmlForChecksum();
446                final var digest = DigestUtils.getDigest(contentDigest.getType());
447                final var digestBytes = DigestUtils.digest(digest, transformedXml);
448                final var digestHex = Hex.encodeHexString(digestBytes);
449
450                if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) {
451                    throw new RuntimeException(String.format(
452                            "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s",
453                            dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(),
454                            contentDigest.getType(), contentDigest.getDigest(), digestHex));
455                }
456            }
457        }
458
459        /**
460         * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/
461         * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/
462         * DatastreamXMLMetadata.java#L92
463         *
464         * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order.
465         *
466         * @return the xml in the format Fedora 3 used to calculate digests
467         */
468        private byte[] transformInlineXmlForChecksum() {
469            try {
470                // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :(
471                final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
472                        + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8);
473
474                final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8);
475                final var source = new InputSource(isReader);
476                source.setEncoding("UTF-8");
477
478                final Document doc = documentBuilder.parse(source);
479
480                final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false);
481                // indent == 0 means add no indenting
482                fmt.setIndent(0);
483                // default line width is 72, but only applies when indenting
484                fmt.setLineWidth(0);
485                fmt.setPreserveSpace(false);
486
487                final StringWriter out = new StringWriter();
488                final XMLSerializer ser = new XMLSerializer(out, fmt);
489                ser.serialize(doc);
490                out.close();
491
492                final var baos = new ByteArrayOutputStream();
493                final var br = new BufferedReader(new StringReader(out.toString()));
494                String line;
495                final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8));
496                while ((line = br.readLine()) != null) {
497                    line = line.trim();
498                    outStream.append(line);
499                }
500                outStream.close();
501
502                return baos.toByteArray();
503            } catch (IOException e) {
504                throw new UncheckedIOException(e);
505            } catch (SAXException e) {
506                try {
507                    LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream()));
508                } catch (IOException e2) {
509                    // swallow
510                }
511                throw new RuntimeException(e);
512            }
513        }
514
515        @Override
516        public DatastreamInfo getDatastreamInfo() {
517            return dsInfo;
518        }
519
520        @Override
521        public String getVersionId() {
522            return id;
523        }
524
525        @Override
526        public String getMimeType() {
527            return mimeType;
528        }
529
530        @Override
531        public String getLabel() {
532            return label;
533        }
534
535        @Override
536        public String getCreated() {
537            return created;
538        }
539
540        @Override
541        public String getAltIds() {
542            return altIds;
543        }
544
545        @Override
546        public String getFormatUri() {
547            return formatUri;
548        }
549
550        @Override
551        public long getSize() {
552            return size;
553        }
554
555        @Override
556        public ContentDigest getContentDigest() {
557            // The digests for inline xml do not match what is stored in the FOXML and should not be returned here.
558            if (isInlineXml) {
559                return null;
560            }
561            return contentDigest;
562        }
563
564        @Override
565        public InputStream getContent() throws IOException {
566            return dsContent.getInputStream();
567        }
568
569        @Override
570        public String getExternalOrRedirectURL() {
571            if (dsContent instanceof URLCachedContent) {
572                return ((URLCachedContent) dsContent).getURL().toString();
573            } else {
574                throw new IllegalStateException();
575            }
576        }
577
578        @Override
579        public boolean isFirstVersionIn(final ObjectReference obj) {
580            final List<DatastreamVersion> datastreams =
581                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
582            return datastreams.indexOf(this) == 0;
583        }
584
585        @Override
586        public boolean isLastVersionIn(final ObjectReference obj) {
587            final List<DatastreamVersion> datastreams =
588                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
589            return datastreams.indexOf(this) == datastreams.size() - 1;
590        }
591    }
592
593    private static Map<String, String> getAttributes(final XMLStreamReader r,
594            final String ... allowedNames) {
595        final HashMap<String, String> result = new HashMap<String, String>();
596        final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames));
597        for (int i = 0; i < r.getAttributeCount(); i ++) {
598            final String localName = r.getAttributeLocalName(i);
599            final String value = r.getAttributeValue(i);
600            if (allowed.contains(localName)) {
601                result.put(localName, value);
602            } else {
603                System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\"");
604            }
605        }
606        return result;
607
608    }
609
610}