/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.sequencer.pdf;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.stream.Collectors;
import javax.jcr.Binary;
import javax.jcr.NamespaceRegistry;
import javax.jcr.Node;
import javax.jcr.Property;
import javax.jcr.RepositoryException;
import javax.jcr.Value;
import javax.jcr.ValueFactory;
import org.modeshape.common.util.CheckArg;
import org.modeshape.common.util.StringUtil;
import org.modeshape.jcr.api.JcrConstants;
import org.modeshape.jcr.api.nodetype.NodeTypeManager;
import org.modeshape.jcr.api.sequencer.Sequencer;

/**
 * A sequencer that processes the binary content of a PDF file, extracts the metadata, and then writes that
 * metadata to the repository.
 * <p>
 * This sequencer produces data that corresponds to the following structure:
 * </p>
 * <ul>
 * <li><strong>pdf:metadata</strong> node of type <code>pdf:metadata</code>
 * <ul>
 * <li><strong>jcr:mimeType</strong> - optional string property for the mime type of the document</li>
 * <li><strong>pdf:pageCount</strong> - mandatory long property specifying number of pages</li>
 * <li><strong>pdf:encrypted</strong> - mandatory boolean property specifying whether the document is encrypted</li>
 * <li><strong>pdf:version</strong> - mandatory string property for the version of the PDF format</li>
 * <li><strong>pdf:orientation</strong> - mandatory string property specifying the orientation of the paper (landscape,
 * portrait, reverse landscape)</li>
 * <li><strong>pdf:author</strong> - optional string property for the author of the document</li>
 * <li><strong>pdf:creationDate</strong> - optional date property for the creation date of the document</li>
 * <li><strong>pdf:creator</strong> - optional string property for the creator of the document</li>
 * <li><strong>pdf:keywords</strong> - optional string property for the keywords of the document (comma delimited)</li>
 * <li><strong>pdf:modificationDate</strong> - optional date property for the modification date of the document</li>
 * <li><strong>pdf:producer</strong> - optional string property for the producer of the document</li>
 * <li><strong>pdf:subject</strong> - optional string property for the subject of the document</li>
 * <li><strong>pdf:title</strong> - optional string property for the title of the document</li>
 * <li><strong>pdf:xmp</strong> - optional child node for the metadata fields from XMP block
 * <ul>
 * <li><strong>xmp:baseURL</strong> - optional string property for the baseURL</li>
 * <li><strong>xmp:createDate</strong> - optional date property for creation date of this object</li>
 * <li><strong>xmp:creatorTool</strong> - optional string property specifying the creator tool used to make this
 * document</li>
 * <li><strong>xmp:identifier</strong> - optional multi-valued string property for the identifiers of the object</li>
 * <li><strong>xmp:label</strong> - optional string property for the label of the object</li>
 * <li><strong>xmp:metadataDate</strong> - optional date property for creation date of this metadata</li>
 * <li><strong>xmp:modifyDate</strong> - optional date property for modification date of this object</li>
 * <li><strong>xmp:nickname</strong> - optional string property for the nickname</li>
 * <li><strong>xmp:rating</strong> - optional string property for the rating</li>
 * </ul>
 * </li>
 * <li><strong>pdf:page</strong> - optional child node for the metadata fields related to individual pages
 * <ul>
 * <li><strong>pdf:pageNumber</strong> - mandatory long property for the number of this page</li>
 * <li><strong>pdf:attachement</strong> - optional child node for the metadata fields related to attachment
 * <ul>
 * <li><strong>pdf:creationDate</strong> - optional date property for creation date of this attachment</li>
 * <li><strong>pdf:modificationDate</strong> - optional date property for modification date of this attachment</li>
 * <li><strong>pdf:subject</strong> - optional string property for the subject of this attachment</li>
 * <li><strong>pdf:name</strong> - optional string property for the name of this attachment</li>
 * <li><strong>jcr:mimeType</strong> - optional string property for the mime type of this attachment</li>
 * <li><strong>jcr:data</strong> - optional binary property for the content of this attachment</li>
 * </ul>
 * </li>
 * </ul>
 * </li>
 * </ul>
 * </li>
 * </ul>
 *
 * @since 5.1
 */
public class PdfMetadataSequencer extends Sequencer {

    /**
     * Registers the node types declared in <code>pdf.cnd</code> and declares the PDF mime type as the default
     * mime type handled by this sequencer.
     *
     * @param registry the namespace registry supplied by the repository; not used directly here
     * @param nodeTypeManager the manager with which the node types are registered
     * @throws RepositoryException if the node types cannot be registered
     * @throws IOException if the node type definition file cannot be read
     */
    @Override
    public void initialize( NamespaceRegistry registry,
                            NodeTypeManager nodeTypeManager ) throws RepositoryException, IOException {
        super.registerNodeTypes("pdf.cnd", nodeTypeManager, true);
        registerDefaultMimeTypes(PdfBasicMetadata.MIME_TYPE_STRING);
    }

    /**
     * Extracts the basic and XMP metadata from the binary held by the input property and writes it below the
     * output node.
     * <p>
     * The result reflects only the basic metadata: a failure to extract the XMP block is logged by
     * {@link #processXMPMetadata(Node, Binary)} and does not change the returned value.
     * </p>
     *
     * @param inputProperty the property holding the PDF content; its binary value must not be null
     * @param outputNode the node that receives the metadata (see {@link #getPdfMetadataNode(Node)})
     * @param context the sequencing context; not used by this sequencer
     * @return <code>true</code> if the basic metadata was written, <code>false</code> if the document was ignored
     * @throws Exception if the repository cannot be accessed, or if a class required for parsing is missing for a
     *         reason other than the absent BouncyCastle library
     */
    @Override
    public boolean execute( Property inputProperty,
                            Node outputNode,
                            Context context ) throws Exception {
        Binary binaryValue = inputProperty.getBinary();
        CheckArg.isNotNull(binaryValue, "binary");
        Node sequencedNode = getPdfMetadataNode(outputNode);
        try {
            if (!processBasicMetadata(sequencedNode, binaryValue)) {
                getLogger().warn("Ignoring pdf from node {0} because basic metadata cannot be extracted",
                                 inputProperty.getParent().getPath());
                return false;
            }
            processXMPMetadata(sequencedNode, binaryValue);
            return true;
        } catch (java.lang.NoClassDefFoundError ncdfe) {
            // The message may legitimately be null; without the guard a NullPointerException would hide the real error.
            String message = ncdfe.getMessage();
            if (message != null && message.toLowerCase(Locale.ROOT).contains("bouncycastle")) {
                getLogger().warn("Ignoring pdf from node {0} because it's encrypted and encrypted PDFs are not supported",
                                 inputProperty.getParent().getPath());
                return false;
            }
            throw ncdfe;
        }
    }

    /**
     * Reads the basic document metadata from the binary and stores it, together with one child node per page
     * (and per attachment), on the supplied node.
     * <p>
     * Any exception is logged and reported as <code>false</code>; properties written before the failure are not
     * removed by this method.
     * </p>
     *
     * @param sequencedNode the node that receives the metadata properties and the page child nodes
     * @param binaryValue the PDF content
     * @return <code>true</code> if the metadata passed its check and was written completely, <code>false</code> otherwise
     */
    private boolean processBasicMetadata( Node sequencedNode,
                                          Binary binaryValue ) {
        try (InputStream stream = binaryValue.getStream()) {
            PdfBasicMetadata metadata = new PdfBasicMetadata(stream);
            if (!metadata.check()) {
                return false;
            }
            setPropertyIfMetadataPresent(sequencedNode, JcrConstants.JCR_MIME_TYPE, PdfBasicMetadata.MIME_TYPE_STRING);

            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.PAGE_COUNT, metadata.getPageCount());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.ORIENTATION, metadata.getOrientation());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.ENCRYPTED, metadata.isEncrypted());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.VERSION, metadata.getVersion());

            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.AUTHOR, metadata.getAuthor());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.CREATION_DATE, metadata.getCreationDate());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.CREATOR, metadata.getCreator());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.KEYWORDS, metadata.getKeywords());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.MODIFICATION_DATE, metadata.getModificationDate());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.PRODUCER, metadata.getProducer());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.SUBJECT, metadata.getSubject());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.TITLE, metadata.getTitle());

            for (PdfPageMetadata pageMetadata : metadata.getPages()) {
                writePageMetadata(sequencedNode, pageMetadata);
            }
            return true;
        } catch (Exception e) {
            getLogger().error(e, "Couldn't process stream.");
        }
        return false;
    }

    /**
     * Adds one page child node below the metadata node and fills it with the page number and its attachments.
     *
     * @param sequencedNode the metadata node that becomes the parent of the page node
     * @param pageMetadata the metadata of the page to write
     * @throws RepositoryException if a node or property cannot be created
     */
    private void writePageMetadata( Node sequencedNode,
                                    PdfPageMetadata pageMetadata ) throws RepositoryException {
        Node pageNode = sequencedNode.addNode(PdfMetadataLexicon.PAGE_NODE, PdfMetadataLexicon.PAGE_NODE);
        setPropertyIfMetadataPresent(pageNode, PdfMetadataLexicon.PAGE_NUMBER, pageMetadata.getPageNumber());
        for (PdfAttachmentMetadata attachmentMetadata : pageMetadata.getAttachments()) {
            writeAttachmentMetadata(pageNode, attachmentMetadata);
        }
    }

    /**
     * Adds one attachment child node below the page node and fills it with the attachment's metadata and content.
     *
     * @param pageNode the page node that becomes the parent of the attachment node
     * @param attachmentMetadata the metadata of the attachment to write
     * @throws RepositoryException if a node or property cannot be created
     */
    private void writeAttachmentMetadata( Node pageNode,
                                          PdfAttachmentMetadata attachmentMetadata ) throws RepositoryException {
        Node attachmentNode = pageNode.addNode(PdfMetadataLexicon.ATTACHMENT_NODE, PdfMetadataLexicon.ATTACHMENT_NODE);
        setPropertyIfMetadataPresent(attachmentNode, JcrConstants.JCR_MIME_TYPE, attachmentMetadata.getMimeType());
        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.CREATION_DATE, attachmentMetadata.getCreationDate());
        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.MODIFICATION_DATE, attachmentMetadata.getModificationDate());
        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.SUBJECT, attachmentMetadata.getSubject());
        setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.NAME, attachmentMetadata.getName());
        setPropertyIfMetadataPresent(attachmentNode, JcrConstants.JCR_DATA, attachmentMetadata.getData());
    }

    /**
     * Reads the XMP metadata from the binary and, if it passes its check, stores it on a new XMP child node of the
     * supplied node. Any exception is logged and reported as <code>false</code>.
     *
     * @param sequencedNode the metadata node that becomes the parent of the XMP node
     * @param binaryValue the PDF content
     * @return <code>true</code> if the XMP node was created and filled, <code>false</code> otherwise
     */
    private boolean processXMPMetadata( Node sequencedNode,
                                        Binary binaryValue ) {
        try (InputStream stream = binaryValue.getStream()) {
            PdfXmpMetadata metadata = new PdfXmpMetadata(stream);
            if (!metadata.check()) {
                return false;
            }
            Node xmpNode = sequencedNode.addNode(PdfMetadataLexicon.XMP_NODE, PdfMetadataLexicon.XMP_NODE);

            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.BASE_URL, metadata.getBaseURL());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.CREATE_DATE, metadata.getCreateDate());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.CREATOR_TOOL, metadata.getCreatorTool());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.IDENTIFIER, metadata.getIdentifier());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.METADATA_DATE, metadata.getMetadataDate());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.MODIFY_DATE, metadata.getModifyDate());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.NICKNAME, metadata.getNickname());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.RATING, metadata.getRating());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.LABEL, metadata.getLabel());
            return true;
        } catch (Exception e) {
            getLogger().error(e, "Couldn't process stream.");
        }
        return false;
    }

    /**
     * Determines the node that receives the PDF metadata: a new output node is retyped and used directly,
     * otherwise a metadata child node is added to the existing output node.
     *
     * @param outputNode the output node handed to the sequencer
     * @return the node on which the metadata is to be stored
     * @throws RepositoryException if the primary type cannot be set or the child node cannot be added
     */
    private Node getPdfMetadataNode( Node outputNode ) throws RepositoryException {
        if (outputNode.isNew()) {
            outputNode.setPrimaryType(PdfMetadataLexicon.METADATA_NODE);
            return outputNode;
        }
        return outputNode.addNode(PdfMetadataLexicon.METADATA_NODE, PdfMetadataLexicon.METADATA_NODE);
    }

    /**
     * Stores the value as a property of the node, choosing the JCR property type from the runtime type of the value.
     * <p>
     * Nothing is written for a <code>null</code> value, a blank string, or a list that contains no strings.
     * Non-string elements of a list are dropped.
     * </p>
     *
     * @param node the node on which the property is set
     * @param propertyName the name of the property
     * @param value the value to store; may be null
     * @throws RepositoryException if the property cannot be set
     * @throws IllegalArgumentException if the value is of a type this method does not handle
     */
    private void setPropertyIfMetadataPresent( Node node,
                                               String propertyName,
                                               Object value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        if (value instanceof String) {
            // A blank string means "no metadata"; it must be skipped here rather than fall through to the
            // unknown-type error at the end of the chain.
            String stringValue = (String) value;
            if (!StringUtil.isBlank(stringValue)) {
                node.setProperty(propertyName, stringValue);
            }
        } else if (value instanceof Boolean) {
            node.setProperty(propertyName, (Boolean) value);
        } else if (value instanceof Long) {
            node.setProperty(propertyName, (Long) value);
        } else if (value instanceof Integer) {
            node.setProperty(propertyName, ((Integer) value).longValue());
        } else if (value instanceof Calendar) {
            // pdfbox 1.8.x doesn't parse the timezones correctly...
            // see PDFBOX-3352
            Calendar calendarValue = (Calendar) value;
            if ("unknown".equalsIgnoreCase(calendarValue.getTimeZone().getID())) {
                calendarValue.setTimeZone(TimeZone.getDefault());
            }
            node.setProperty(propertyName, calendarValue);
        } else if (value instanceof byte[]) {
            InputStream is = new ByteArrayInputStream((byte[]) value);
            Binary binaryProperty = node.getSession().getValueFactory().createBinary(is);
            try {
                node.setProperty(propertyName, binaryProperty);
            } finally {
                // The property keeps its own reference to the content; release this handle.
                binaryProperty.dispose();
            }
        } else if (value instanceof List<?>) {
            ValueFactory vf = node.getSession().getValueFactory();
            List<Value> values = ((List<?>) value).stream()
                                                  .filter(val -> val instanceof String)
                                                  .map(val -> vf.createValue((String) val))
                                                  .collect(Collectors.toList());
            if (!values.isEmpty()) {
                node.setProperty(propertyName, values.toArray(new Value[0]));
            }
        } else {
            throw new IllegalArgumentException(String.format("The value of the property %s has unknown type and couldn't be saved.", propertyName));
        }
    }
}