001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.sequencer.pdf;
017
018import java.io.ByteArrayInputStream;
019import java.io.IOException;
020import java.io.InputStream;
021import java.util.Calendar;
022import java.util.List;
023import java.util.TimeZone;
024import java.util.stream.Collectors;
025import javax.jcr.Binary;
026import javax.jcr.NamespaceRegistry;
027import javax.jcr.Node;
028import javax.jcr.Property;
029import javax.jcr.RepositoryException;
030import javax.jcr.Value;
031import javax.jcr.ValueFactory;
032import org.modeshape.common.util.CheckArg;
033import org.modeshape.common.util.StringUtil;
034import org.modeshape.jcr.api.JcrConstants;
035import org.modeshape.jcr.api.nodetype.NodeTypeManager;
036import org.modeshape.jcr.api.sequencer.Sequencer;
037
038/**
 * A sequencer that processes the binary content of a PDF file, extracts the metadata, and then writes that
 * metadata to the repository.
041 * <p>
042 * This sequencer produces data that corresponds to the following structure:
043 * <ul>
044 * <li><strong>pdf:metadata</strong> node of type <code>pdf:metadata</code>
045 *  <ul>
 *   <li><strong>jcr:mimeType</strong> - optional string property for the mime type of the document</li>
047 *   <li><strong>pdf:pageCount</strong> - mandatory long property specifying number of pages</li>
048 *   <li><strong>pdf:encrypted</strong> - mandatory boolean property specifying whether the document is encrypted</li>
049 *   <li><strong>pdf:version</strong> - mandatory string property for the version of the PDF format</li>
050 *   <li><strong>pdf:orientation</strong> - mandatory string property specifying the orientation of the paper (landscape, portrait, reverse landscape)</li>
051 *   <li><strong>pdf:author</strong> - optional string property for the author of the document</li>
052 *   <li><strong>pdf:creationDate</strong> - optional date property for the creation date of the document</li>
053 *   <li><strong>pdf:creator</strong> - optional string property for the creator of the document</li>
054 *   <li><strong>pdf:keywords</strong> - optional string property for the keywords of the document (comma delimited)</li>
055 *   <li><strong>pdf:modificationDate</strong> - optional date property for the modification date of document</li>
056 *   <li><strong>pdf:producer</strong> - optional string property for the producer of the document</li>
057 *   <li><strong>pdf:subject</strong> - optional string property for the subject of the document</li>
058 *   <li><strong>pdf:title</strong> - optional string property for the title of the document</li>
059 *   <li><strong>pdf:xmp</strong> - optional child node for the metadata fields from XMP block
060 *    <ul>
061 *     <li><strong>xmp:baseURL</strong> - optional string property for the baseURL</li>
 *     <li><strong>xmp:createDate</strong> - optional date property for creation date of this object</li>
 *     <li><strong>xmp:creatorTool</strong> - optional string property specifying the creator tool used to make this document</li>
064 *     <li><strong>xmp:identifier</strong> - optional multi-valued string property for the identifiers of the object</li>
065 *     <li><strong>xmp:label</strong> - optional string property for the label of the object</li>
066 *     <li><strong>xmp:metadataDate</strong> - optional date property for creation date of this metadata</li>
067 *     <li><strong>xmp:modifyDate</strong> - optional date property for modification date of this object</li>
068 *     <li><strong>xmp:nickname</strong> - optional string property for the nickname</li>
 *     <li><strong>xmp:rating</strong> - optional string property for the rating</li>
070 *     <li><strong>xmp:label</strong> - optional string property for the label</li>
071 *     </ul>
072 *   </li>
073 *   <li><strong>pdf:page</strong> - optional child node for the metadata fields related to individual pages
074 *    <ul>
075 *     <li><strong>pdf:pageNumber</strong> - mandatory long property for the number of this page</li>
076 *     <li><strong>pdf:attachement</strong> - optional child node for the metadata fields related to attachment
077 *      <ul>
078 *       <li><strong>pdf:creationDate</strong> - optional date property for creation date of this attachment</li>
079 *       <li><strong>pdf:modificationDate</strong> - optional date property for modification date of this attachment</li>
080 *       <li><strong>pdf:subject</strong> - optional string property for the subject of this attachment</li>
081 *       <li><strong>pdf:name</strong> - optional string property for the name of this attachment</li>
082 *       <li><strong>jcr:mimeType</strong> - optional string property for the mime type of this attachment</li>
083 *       <li><strong>jcr:data</strong> - optional binary property for the content of this attachment</li>
084 *      </ul>
085 *     </li>
086 *    </ul>
087 *   </li>
 *  </ul>
 * </li>
 * </ul>
 * </p>
090 * 
091 * @since 5.1
092 */
093public class PdfMetadataSequencer extends Sequencer {
094
095    @Override
096    public void initialize( NamespaceRegistry registry,
097                            NodeTypeManager nodeTypeManager ) throws RepositoryException, IOException {
098        super.registerNodeTypes("pdf.cnd", nodeTypeManager, true);
099        registerDefaultMimeTypes(PdfBasicMetadata.MIME_TYPE_STRING);
100    }
101
102    @Override
103    public boolean execute( Property inputProperty,
104                            Node outputNode,
105                            Context context ) throws Exception {
106        Binary binaryValue = inputProperty.getBinary();
107        CheckArg.isNotNull(binaryValue, "binary");
108        Node sequencedNode = getPdfMetadataNode(outputNode);
109        try {
110            if (processBasicMetadata(sequencedNode, binaryValue)) {
111                processXMPMetadata(sequencedNode, binaryValue);
112                return true;
113            } else {
114                getLogger().warn("Ignoring pdf from node {0} because basic metadata cannot be extracted",
115                                 inputProperty.getParent().getPath());
116                return false;
117            }
118        } catch (java.lang.NoClassDefFoundError ncdfe) {
119            if (ncdfe.getMessage().toLowerCase().contains("bouncycastle")) {
120                getLogger().warn("Ignoring pdf from node {0} because it's encrypted and encrypted PDFs are not supported", 
121                                 inputProperty.getParent().getPath());
122                return false;
123            }
124            throw ncdfe;
125        }
126    }
127
128    private boolean processBasicMetadata( Node sequencedNode,
129                                          Binary binaryValue) {
130        PdfBasicMetadata metadata = null;
131        try (InputStream stream = binaryValue.getStream()) {
132            metadata = new PdfBasicMetadata(stream);
133            if (metadata.check()) {
134                setPropertyIfMetadataPresent(sequencedNode, JcrConstants.JCR_MIME_TYPE, PdfBasicMetadata.MIME_TYPE_STRING);
135
136                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.PAGE_COUNT, metadata.getPageCount());
137                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.ORIENTATION, metadata.getOrientation());
138                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.ENCRYPTED, metadata.isEncrypted());
139                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.VERSION, metadata.getVersion());
140
141                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.AUTHOR, metadata.getAuthor());
142                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.CREATION_DATE, metadata.getCreationDate());
143                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.CREATOR, metadata.getCreator());
144                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.KEYWORDS, metadata.getKeywords());
145                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.MODIFICATION_DATE, metadata.getModificationDate());
146                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.PRODUCER, metadata.getProducer());
147                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.SUBJECT, metadata.getSubject());
148                setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.TITLE, metadata.getTitle());
149
150                for (PdfPageMetadata pageMetadata : metadata.getPages()) {
151                    Node pageNode = sequencedNode.addNode(PdfMetadataLexicon.PAGE_NODE, PdfMetadataLexicon.PAGE_NODE);
152
153                    setPropertyIfMetadataPresent(pageNode, PdfMetadataLexicon.PAGE_NUMBER, pageMetadata.getPageNumber());
154                    for (PdfAttachmentMetadata attachmentMetadata : pageMetadata.getAttachments()) {
155                        Node attachmentNode = pageNode.addNode(PdfMetadataLexicon.ATTACHMENT_NODE, PdfMetadataLexicon.ATTACHMENT_NODE);
156
157                        setPropertyIfMetadataPresent(attachmentNode, JcrConstants.JCR_MIME_TYPE, attachmentMetadata.getMimeType());
158                        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.CREATION_DATE, attachmentMetadata.getCreationDate());
159                        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.MODIFICATION_DATE, attachmentMetadata.getModificationDate());
160                        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.SUBJECT, attachmentMetadata.getSubject());
161                        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.NAME, attachmentMetadata.getName());
162                        setPropertyIfMetadataPresent(attachmentNode, JcrConstants.JCR_DATA, attachmentMetadata.getData());
163                    }
164                }
165                return true;
166            }
167        } catch (Exception e) {
168            getLogger().error(e, "Couldn't process stream.");
169        }
170        return false;
171    }
172
173    private boolean processXMPMetadata( Node sequencedNode,
174                                        Binary binaryValue) {
175        PdfXmpMetadata metadata = null;
176        try (InputStream stream = binaryValue.getStream()) {
177            metadata = new PdfXmpMetadata(stream);
178            if (metadata.check()) {
179                Node xmpNode = sequencedNode.addNode(PdfMetadataLexicon.XMP_NODE, PdfMetadataLexicon.XMP_NODE);
180
181                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.BASE_URL, metadata.getBaseURL());
182                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.CREATE_DATE, metadata.getCreateDate());
183                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.CREATOR_TOOL, metadata.getCreatorTool());
184                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.IDENTIFIER, metadata.getIdentifier());
185                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.METADATA_DATE, metadata.getMetadataDate());
186                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.MODIFY_DATE, metadata.getModifyDate());
187                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.NICKNAME, metadata.getNickname());
188                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.RATING, metadata.getRating());
189                setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.LABEL, metadata.getLabel());
190                return true;
191            }
192        } catch (Exception e) {
193            getLogger().error(e, "Couldn't process stream.");
194        }
195
196        return false;
197    }
198
199    private Node getPdfMetadataNode( Node outputNode ) throws RepositoryException {
200        if (outputNode.isNew()) {
201            outputNode.setPrimaryType(PdfMetadataLexicon.METADATA_NODE);
202            return outputNode;
203        }
204        return outputNode.addNode(PdfMetadataLexicon.METADATA_NODE, PdfMetadataLexicon.METADATA_NODE);
205    }
206
207    private void setPropertyIfMetadataPresent( Node node,
208                                               String propertyName,
209                                               Object value ) throws RepositoryException {
210        if (value != null) {
211            if (value instanceof String && !StringUtil.isBlank((String) value)) {
212                node.setProperty(propertyName, (String) value);
213            } else if (value instanceof Boolean) {
214                node.setProperty(propertyName, (Boolean) value);
215            } else if (value instanceof Long) {
216                node.setProperty(propertyName, (Long) value);
217            } else if (value instanceof Integer) {
218                node.setProperty(propertyName, new Long((Integer) value));
219            } else if (value instanceof Calendar) {
220                // pdfbox 1.8.x doesn't parse the timezones correctly...
221                // see PDFBOX-3352
222                Calendar calendarValue = (Calendar) value;
223                if (calendarValue.getTimeZone().getID().toLowerCase().equals("unknown")) {
224                    calendarValue.setTimeZone(TimeZone.getDefault());
225                }
226                node.setProperty(propertyName, calendarValue);
227            } else if (value instanceof byte[]) {
228                InputStream is = new ByteArrayInputStream((byte []) value);
229                Binary binaryProperty = node.getSession().getValueFactory().createBinary(is);
230                node.setProperty(propertyName, binaryProperty);
231            } else if (value instanceof List<?>) {
232                ValueFactory vf = node.getSession().getValueFactory();
233                List<Value> values = ((List<?>) value).stream()
234                                                      .filter(val -> val instanceof String)
235                                                      .map(val -> vf.createValue((String) val))
236                                                      .collect(Collectors.toList());
237                if (!values.isEmpty()) {
238                    node.setProperty(propertyName, values.toArray(new Value[values.size()]));
239                }
240            } else {
241                throw new IllegalArgumentException(String.format("The value of the property %s has unknown type and couldn't be saved.", propertyName));
242            }
243        }
244    }
245}