/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.sequencer.pdf;

import java.awt.print.PageFormat;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.printing.PDFPageable;

/**
 * Utility for extracting Document Information Dictionary metadata from PDF files, together with the
 * file attachments found in the annotations of each page.
 * <p>
 * An instance wraps a single input stream. All getters return their initial values ({@code null},
 * {@code false} or an empty list) until {@link #check()} has completed successfully.
 * </p>
 *
 * @since 5.1
 */
public class PdfBasicMetadata {

    static final String MIME_TYPE_STRING = "application/pdf";

    /** Orientation names indexed by the value returned from {@link PageFormat#getOrientation()}. */
    static final String[] ORIENTATION_STRINGS = {"landscape", "portrait", "reverse landscape"};

    private Integer pageCount;
    private String author;
    private Calendar creationDate;
    private String creator;
    private String keywords;
    private Calendar modificationDate;
    private String producer;
    private String subject;
    private String title;
    private String orientation;
    private Boolean encrypted;
    private String version;

    private final List<PdfPageMetadata> pages = new ArrayList<>();

    private final InputStream in;

    /**
     * Creates a metadata extractor for the given stream. Nothing is read until {@link #check()} is called.
     *
     * @param inputStream the stream containing the PDF content; this class reads it but does not close it itself
     */
    public PdfBasicMetadata( InputStream inputStream ) {
        this.in = inputStream;
    }

    /**
     * Loads the stream as a PDF document and populates this object's fields from it: encryption flag, page count,
     * orientation of the first page, PDF version, the document information entries (only when the document is not
     * encrypted) and one {@link PdfPageMetadata} per page holding that page's file attachments.
     *
     * @return {@code true} when the document was loaded and processed; failures are reported by exception
     * @throws Exception if the stream cannot be loaded as a PDF document or its content cannot be read
     */
    public boolean check() throws Exception {
        try (PDDocument document = PDDocument.load(in)) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();

            encrypted = document.isEncrypted();
            pageCount = document.getNumberOfPages();
            // A document without pages has no first page to ask for its format.
            orientation = pageCount > 0 ? readOrientation(document) : null;
            version = readVersion(document, catalog);

            if (!encrypted) {
                readDocumentInformation(document.getDocumentInformation());
            }

            // extract all attached files from all pages
            int pageNumber = 0;
            for (PDPage page : catalog.getPages()) {
                pageNumber += 1;
                pages.add(readPage(page, pageNumber));
            }
            return true;
        }
    }

    /** Returns the orientation name of the first page of the given document. */
    private static String readOrientation( PDDocument document ) {
        PageFormat firstPage = new PDFPageable(document).getPageFormat(0);
        return ORIENTATION_STRINGS[firstPage.getOrientation()];
    }

    /** Returns the catalog version when one is set, otherwise the version of the underlying document. */
    private static String readVersion( PDDocument document,
                                       PDDocumentCatalog catalog ) {
        String catalogVersion = catalog.getVersion();
        if (catalogVersion != null && !catalogVersion.isEmpty()) {
            // According to the specification the version stored in the catalog takes precedence
            // over the one in the header. It is rarely set, though.
            return catalogVersion;
        }
        return String.valueOf(document.getDocument().getVersion());
    }

    /** Copies the document information entries into the corresponding fields of this object. */
    private void readDocumentInformation( PDDocumentInformation metadata ) {
        author = metadata.getAuthor();
        creationDate = metadata.getCreationDate();
        creator = metadata.getCreator();
        keywords = metadata.getKeywords();
        modificationDate = metadata.getModificationDate();
        producer = metadata.getProducer();
        subject = metadata.getSubject();
        title = metadata.getTitle();
    }

    /** Builds the metadata of one page, collecting every usable file attachment annotation on it. */
    private static PdfPageMetadata readPage( PDPage page,
                                             int pageNumber ) throws Exception {
        PdfPageMetadata pageMetadata = new PdfPageMetadata();
        pageMetadata.setPageNumber(pageNumber);
        for (PDAnnotation annotation : page.getAnnotations()) {
            if (annotation instanceof PDAnnotationFileAttachment) {
                PdfAttachmentMetadata attachmentMetadata = readAttachment((PDAnnotationFileAttachment)annotation);
                if (attachmentMetadata != null) {
                    pageMetadata.addAttachment(attachmentMetadata);
                }
            }
        }
        return pageMetadata;
    }

    /**
     * Builds the metadata of one file attachment annotation, or returns {@code null} when the annotation does not
     * carry an embedded file (its file specification is missing, is not a complex one, or has no embedded file).
     */
    private static PdfAttachmentMetadata readAttachment( PDAnnotationFileAttachment annotation ) throws Exception {
        Object file = annotation.getFile();
        if (!(file instanceof PDComplexFileSpecification)) {
            // Only complex file specifications can hold embedded content.
            return null;
        }
        PDComplexFileSpecification fileSpec = (PDComplexFileSpecification)file;
        PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();
        if (embeddedFile == null) {
            return null;
        }

        PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata();
        attachmentMetadata.setSubject(annotation.getSubject());
        attachmentMetadata.setName(fileSpec.getFilename());
        attachmentMetadata.setCreationDate(embeddedFile.getCreationDate());
        attachmentMetadata.setModificationDate(embeddedFile.getModDate());
        attachmentMetadata.setMimeType(embeddedFile.getSubtype());
        attachmentMetadata.setData(embeddedFile.toByteArray());
        return attachmentMetadata;
    }

    /**
     * @return the number of pages, or {@code null} if {@link #check()} has not populated it
     */
    public Integer getPageCount() {
        return pageCount;
    }

    /**
     * @return the author from the document information; may be {@code null}
     */
    public String getAuthor() {
        return author;
    }

    /**
     * @return the creation date from the document information; may be {@code null}
     */
    public Calendar getCreationDate() {
        return creationDate;
    }

    /**
     * @return the creator from the document information; may be {@code null}
     */
    public String getCreator() {
        return creator;
    }

    /**
     * @return the keywords from the document information; may be {@code null}
     */
    public String getKeywords() {
        return keywords;
    }

    /**
     * @return the modification date from the document information; may be {@code null}
     */
    public Calendar getModificationDate() {
        return modificationDate;
    }

    /**
     * @return the producer from the document information; may be {@code null}
     */
    public String getProducer() {
        return producer;
    }

    /**
     * @return the subject from the document information; may be {@code null}
     */
    public String getSubject() {
        return subject;
    }

    /**
     * @return the title from the document information; may be {@code null}
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return one of {@link #ORIENTATION_STRINGS} describing the first page, or {@code null} if it has not been
     *         determined
     */
    public String getOrientation() {
        return orientation;
    }

    /**
     * @return {@code true} if the document was found to be encrypted; {@code false} if it was not, or if
     *         {@link #check()} has not populated the flag yet
     */
    public boolean isEncrypted() {
        // The flag is a nullable Boolean; comparing against TRUE avoids unboxing null.
        return Boolean.TRUE.equals(encrypted);
    }

    /**
     * @return the PDF version, or {@code null} if {@link #check()} has not populated it
     */
    public String getVersion() {
        return version;
    }

    /**
     * @return the list of page metadata held by this object (the internal list itself, not a copy); never
     *         {@code null}
     */
    public List<PdfPageMetadata> getPages() {
        return pages;
    }

    /**
     * Appends the given page metadata to the list returned by {@link #getPages()}.
     *
     * @param page the page metadata to add
     */
    public void addPage( PdfPageMetadata page ) {
        this.pages.add(page);
    }

}