001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.sequencer.pdf;
017
018import java.awt.print.PageFormat;
019import java.io.InputStream;
020import java.util.ArrayList;
021import java.util.Calendar;
022import java.util.List;
023import org.apache.pdfbox.pdmodel.PDDocument;
024import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
025import org.apache.pdfbox.pdmodel.PDDocumentInformation;
026import org.apache.pdfbox.pdmodel.PDPage;
027import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
028import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
029import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
030import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
031import org.apache.pdfbox.printing.PDFPageable;
032
033/**
034 * Utility for extracting Document Information Directory metadata from PDF files.
035 * 
036 * @since 5.1
037 */
038public class PdfBasicMetadata {
039
040    static final String MIME_TYPE_STRING = "application/pdf";
041
042    static final String[] ORIENTATION_STRINGS = {"landscape", "portrait", "reverse landscape"};
043
044    private Integer pageCount;
045    private String author;
046    private Calendar creationDate;
047    private String creator;
048    private String keywords;
049    private Calendar modificationDate;
050    private String producer;
051    private String subject;
052    private String title;
053    private String orientation;
054    private Boolean encrypted;
055    private String version;
056
057    private List<PdfPageMetadata> pages = new ArrayList<>();
058
059    private InputStream in;
060
061    public PdfBasicMetadata( InputStream inputStream ) {
062        this.in = inputStream;
063    }
064
065    /*
066     * Check that given file is supported by this sequencer.
067     */
068    public boolean check() throws Exception {
069        try (PDDocument document = PDDocument.load(in)) {
070            PDDocumentCatalog catalog = document.getDocumentCatalog();
071            PDFPageable pageable = new PDFPageable(document);
072            PageFormat firstPage = pageable.getPageFormat(0);
073
074            encrypted = document.isEncrypted();
075            pageCount = document.getNumberOfPages();
076            orientation = ORIENTATION_STRINGS[firstPage.getOrientation()];
077            version = String.valueOf(document.getDocument().getVersion());
078            String catalogVersion = catalog.getVersion();
079            if (catalogVersion != null && !catalogVersion.isEmpty()) {
080                // According to specs version saved here should be determining instead
081                // the version in header. It is barely used, though.
082                version = catalogVersion;
083            }
084
085            if (!encrypted) {
086                PDDocumentInformation metadata = document.getDocumentInformation();
087                author = metadata.getAuthor();
088                creationDate = metadata.getCreationDate();
089                creator = metadata.getCreator();
090                keywords = metadata.getKeywords();
091                modificationDate = metadata.getModificationDate();
092                producer = metadata.getProducer();
093                subject = metadata.getSubject();
094                title = metadata.getTitle();
095            }
096
097            // extract all attached files from all pages
098            int pageNumber = 0;
099            for (Object page : catalog.getPages()) {
100                pageNumber += 1;
101                PdfPageMetadata pageMetadata = new PdfPageMetadata();
102                pageMetadata.setPageNumber(pageNumber);
103                for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) {
104                    if (annotation instanceof PDAnnotationFileAttachment) {
105                        PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata();
106
107                        PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
108                        PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
109                        PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();
110
111                        attachmentMetadata.setSubject(fann.getSubject());
112                        attachmentMetadata.setName(fileSpec.getFilename());
113                        attachmentMetadata.setCreationDate(embeddedFile.getCreationDate());
114                        attachmentMetadata.setModificationDate(embeddedFile.getModDate());
115                        attachmentMetadata.setMimeType(embeddedFile.getSubtype());
116                        attachmentMetadata.setData(embeddedFile.toByteArray());
117
118                        pageMetadata.addAttachment(attachmentMetadata);
119                    }
120                }
121                pages.add(pageMetadata);
122            }
123            return true;
124        }
125    }
126
127
128    public Integer getPageCount() {
129        return pageCount;
130    }
131
132    public String getAuthor() {
133        return author;
134    }
135
136    public Calendar getCreationDate() {
137        return creationDate;
138    }
139
140    public String getCreator() {
141        return creator;
142    }
143
144    public String getKeywords() {
145        return keywords;
146    }
147
148    public Calendar getModificationDate() {
149        return modificationDate;
150    }
151
152    public String getProducer() {
153        return producer;
154    }
155
156    public String getSubject() {
157        return subject;
158    }
159
160    public String getTitle() {
161        return title;
162    }
163
164    public String getOrientation() {
165        return orientation;
166    }
167
168    public boolean isEncrypted() {
169        return encrypted;
170    }
171
172    public String getVersion() {
173        return version;
174    }
175
176    public List<PdfPageMetadata> getPages() {
177        return pages;
178    }
179
180    public void addPage( PdfPageMetadata page ) {
181        this.pages.add(page);
182    }
183
184}