/*
 * Decompiled with CFR 0.152.
 */
package org.allenai.scienceparse.pdfapi;

import com.gs.collections.api.list.primitive.FloatList;
import com.gs.collections.impl.list.mutable.primitive.FloatArrayList;
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.function.ToDoubleFunction;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.allenai.scienceparse.pdfapi.PDFDoc;
import org.allenai.scienceparse.pdfapi.PDFFontMetrics;
import org.allenai.scienceparse.pdfapi.PDFLine;
import org.allenai.scienceparse.pdfapi.PDFMetadata;
import org.allenai.scienceparse.pdfapi.PDFPage;
import org.allenai.scienceparse.pdfapi.PDFToken;
import org.allenai.scienceparse.pdfapi.PdfDocExtractionResult;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.DateConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PDFExtractor {
    private static final Logger log = LoggerFactory.getLogger(PDFExtractor.class);
    private final Options opts;
    public boolean DEBUG = false;

    /**
     * Creates an extractor that uses the supplied options.
     *
     * @param options extraction options; stored as-is and read later during extraction
     */
    public PDFExtractor(Options options) {
        this.opts = options;
    }

    /**
     * Creates an extractor with the options produced by an unconfigured
     * {@link Options#builder()}.
     */
    public PDFExtractor() {
        this(Options.builder().build());
    }

    /**
     * Splits a raw keyword metadata string into individual keywords.
     *
     * <p>The string is split on commas and every entry is trimmed, matching how
     * {@code guessAuthorList} treats its entries. Empty entries are kept.
     *
     * @param string raw keyword string, may be {@code null}
     * @return the trimmed keywords in their original order, or an empty list when
     *         the input is {@code null} or empty
     */
    private static List<String> guessKeywordList(String string) {
        if (string == null || string.isEmpty()) {
            return Collections.emptyList();
        }
        String[] parts = string.split(",");
        List<String> keywords = new ArrayList<String>(parts.length);
        for (String part : parts) {
            // Without trimming, "a, b" would yield " b" with a leading space.
            keywords.add(part.trim());
        }
        return keywords;
    }

    /**
     * Splits a raw author metadata string into individual author names.
     *
     * <p>If the string contains a semicolon it is split on semicolons, otherwise on
     * commas. Every entry is trimmed; empty entries are kept.
     *
     * @param string raw author string, may be {@code null}
     * @return the trimmed author names in their original order, or an empty list
     *         when the input is {@code null} or empty
     */
    private static List<String> guessAuthorList(String string) {
        if (string == null || string.isEmpty()) {
            return Collections.emptyList();
        }
        String separator = string.indexOf(';') >= 0 ? ";" : ",";
        return Arrays.stream(string.split(separator))
                .map(String::trim)
                .collect(Collectors.toCollection(ArrayList::new));
    }

    /**
     * Computes the difference between two values relative to the smaller magnitude:
     * {@code |d - d2| / min(|d|, |d2|)}.
     *
     * @param d  first value
     * @param d2 second value
     * @return {@code 0.0} when the values are equal (including both zero);
     *         otherwise the relative difference, which is infinite when exactly
     *         one of the values is zero
     */
    private static double relDiff(double d, double d2) {
        if (d == d2) {
            // Avoids 0/0 = NaN when both values are zero.
            return 0.0;
        }
        return Math.abs(d - d2) / Math.min(Math.abs(d), Math.abs(d2));
    }

    /**
     * Cheap plausibility check for a candidate title that needs no page content.
     *
     * <p>A title is rejected when, after trimming, it:
     * <ul>
     *   <li>ends with {@code .pdf}, {@code .doc} or {@code ...};</li>
     *   <li>starts with "proceedings of" (case-insensitive) or {@code arXiv:};</li>
     *   <li>contains no whitespace-separated word starting with an upper-case
     *       character.</li>
     * </ul>
     *
     * @param string2 candidate title, may be {@code null}
     * @return {@code true} if the title is {@code null} or looks bad
     */
    private boolean badPDFTitleFast(String string2) {
        if (string2 == null) {
            return true;
        }
        // Trim once so that prefix and suffix checks see the same text.
        String title = string2.trim();
        String proceedingsPrefix = "proceedings of";
        if (title.endsWith(".pdf")
                || title.endsWith(".doc")
                || title.endsWith("...")
                // regionMatches(ignoreCase) does not depend on the default locale,
                // unlike toLowerCase().startsWith(...).
                || title.regionMatches(true, 0, proceedingsPrefix, 0, proceedingsPrefix.length())
                || title.startsWith("arXiv:")) {
            return true;
        }
        for (String word : title.split("\\s+")) {
            if (!word.isEmpty() && Character.isUpperCase(word.charAt(0))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks a candidate title against both the fast heuristics and the page text.
     *
     * <p>The title is accepted only if it passes {@code badPDFTitleFast} and at
     * least one line on the page either starts with the title or is a prefix of it.
     *
     * @param pDFPage page whose lines are searched for the title
     * @param string  candidate title, may be {@code null}
     * @return {@code true} if the title should be rejected
     */
    private boolean badPDFTitle(PDFPage pDFPage, String string) {
        if (this.badPDFTitleFast(string)) {
            return true;
        }
        boolean foundOnPage = pDFPage.lines.stream()
                .map(line -> line.lineText())
                .anyMatch(text -> text.startsWith(string) || string.startsWith(text));
        return !foundOnPage;
    }

    /**
     * Converts a PDF date string into a {@link Date}.
     *
     * <p>A leading {@code "D:"} prefix is removed before the text is handed to
     * {@code DateConverter.toCalendar}.
     *
     * @param string raw date string, may be {@code null}
     * @return the parsed date, or {@code null} if the input is {@code null} or the
     *         converter returns {@code null}
     */
    private Date toDate(String string) {
        if (string == null) {
            return null;
        }
        // String.replace("^D:", "") treats the pattern literally and never matched;
        // strip the prefix explicitly instead.
        String text = string.startsWith("D:") ? string.substring(2) : string;
        Calendar calendar = DateConverter.toCalendar(text);
        return calendar == null ? null : calendar.getTime();
    }

    /**
     * Loads a PDF document from the given stream and extracts its content.
     *
     * <p>The loaded {@code PDDocument} is closed before this method returns.
     *
     * <p>NOTE(review): {@code PDDocument.load} and the implicit {@code close()}
     * appear to declare {@code IOException}, which this method neither catches nor
     * declares. Presumably the pre-decompilation source used a sneaky-throw
     * mechanism; confirm before compiling from this source.
     *
     * @param inputStream stream containing the PDF bytes
     * @return the extraction result for the loaded document
     */
    public PdfDocExtractionResult extractResultFromInputStream(InputStream inputStream) {
        try (PDDocument document = PDDocument.load(inputStream)) {
            return this.extractResultFromPDDocument(document);
        }
    }

    /**
     * Extracts metadata and page content from an already loaded PDF document.
     *
     * <p>Metadata (keywords, authors, creator, dates) is read from the document
     * information dictionary. The creation date falls back to a four-digit year in
     * the custom "Date" or "Created" entries when no creation-date entry exists.
     * The metadata title is kept only if it passes {@code badPDFTitle} against the
     * first page. Otherwise, when {@code useHeuristicTitle} is enabled, a title is
     * guessed from the first page layout.
     *
     * <p>NOTE(review): {@code PDFTextStripper.getText} appears to declare
     * {@code IOException}, which this method neither catches nor declares.
     * Presumably the pre-decompilation source used a sneaky-throw mechanism;
     * confirm before compiling from this source.
     *
     * @param pDDocument the loaded document; not closed by this method
     * @return the extracted document, with {@code highPrecision} set when the title
     *         came from validated metadata rather than the heuristic
     */
    public PdfDocExtractionResult extractResultFromPDDocument(PDDocument pDDocument) {
        PDDocumentInformation info = pDDocument.getDocumentInformation();
        List<String> keywords = PDFExtractor.guessKeywordList(info.getKeywords());
        List<String> authors = PDFExtractor.guessAuthorList(info.getAuthor());
        PDFMetadata.PDFMetadataBuilder meta = PDFMetadata.builder()
                .keywords(keywords)
                .authors(authors)
                .creator(info.getCreator());

        String createDate = info.getCustomMetadataValue(COSName.CREATION_DATE.getName());
        if (createDate != null) {
            meta.createDate(this.toDate(createDate));
        } else {
            // Fall back to a bare four-digit year stored under non-standard keys.
            OptionalInt guessedYear = Stream.of("Date", "Created")
                    .map(info::getCustomMetadataValue)
                    .filter(value -> value != null && value.matches("\\d\\d\\d\\d"))
                    .mapToInt(Integer::parseInt)
                    .findFirst();
            if (guessedYear.isPresent()) {
                Calendar calendar = Calendar.getInstance();
                calendar.clear();
                calendar.set(Calendar.YEAR, guessedYear.getAsInt());
                meta.createDate(calendar.getTime());
            }
        }

        // The modification date lives under its own key; reading the creation-date
        // key here would just duplicate the creation date.
        String modDate = info.getCustomMetadataValue(COSName.MOD_DATE.getName());
        if (modDate != null) {
            meta.lastModifiedDate(this.toDate(modDate));
        }

        PDFCaptureTextStripper stripper = new PDFCaptureTextStripper();
        // The returned text is ignored; the stripper records pages as a side effect.
        stripper.getText(pDDocument);

        // Trim so the stored title and the validated title are the same string.
        String title = info.getTitle() != null ? info.getTitle().trim() : null;
        if (stripper.pages.isEmpty() || this.badPDFTitle(stripper.pages.get(0), title)) {
            title = null;
        }
        // Only a metadata title that survived validation counts as high precision.
        boolean highPrecision = title != null;
        if (this.opts.useHeuristicTitle && title == null) {
            try {
                String guessedTitle = this.getHeuristicTitle(stripper);
                if (!this.badPDFTitleFast(guessedTitle)) {
                    title = guessedTitle;
                }
            }
            catch (Exception exception) {
                // Best effort: a failed guess leaves the title unset.
                log.warn("Exception while guessing heuristic title", (Throwable)exception);
            }
        }
        meta.title(title);

        PDFDoc doc = PDFDoc.builder().pages(stripper.pages).meta(meta.build()).build();
        return PdfDocExtractionResult.builder().document(doc).highPrecision(highPrecision).build();
    }

    /**
     * Convenience wrapper that extracts a PDF from a stream and returns only the
     * document part of the result.
     *
     * @param inputStream stream containing the PDF bytes
     * @return the {@code document} field of the extraction result
     */
    public PDFDoc extractFromInputStream(InputStream inputStream) {
        PdfDocExtractionResult result = this.extractResultFromInputStream(inputStream);
        return result.document;
    }

    /**
     * Guesses the title from the layout of the first captured page.
     *
     * <p>The candidate is the run of lines that starts at the first line with the
     * largest average token font size and ends before the next line with a smaller
     * size. A multi-line candidate is rejected if it starts more than 66% of the
     * way down the page or after line index 5. It is cut at the first line gap that
     * exceeds 1.5 line heights or differs from the previous gap by more than 10%.
     *
     * @param pDFCaptureTextStripper stripper whose captured pages are inspected
     * @return the guessed title with lines joined by single spaces, or
     *         {@code null} if there are no pages, no lines, or no plausible
     *         candidate
     */
    private String getHeuristicTitle(PDFCaptureTextStripper pDFCaptureTextStripper) {
        if (pDFCaptureTextStripper.pages.isEmpty()) {
            return null;
        }
        PDFPage firstPage = pDFCaptureTextStripper.pages.get(0);
        List<PDFLine> lines = firstPage.lines;

        // Average point size of a line; NaN for a line without tokens so that such
        // a line never compares equal to or smaller than the largest size.
        ToDoubleFunction<PDFLine> lineFontSize = line -> line.getTokens().stream()
                .mapToDouble(token -> token.getFontMetrics().getPtSize())
                .average()
                .orElse(Double.NaN);

        double largestSize = lines.stream()
                .filter(line -> !line.getTokens().isEmpty())
                .mapToDouble(lineFontSize)
                .max()
                .orElse(Double.NaN);
        if (Double.isNaN(largestSize)) {
            // No line with tokens on the first page.
            return null;
        }

        int startIdx = IntStream.range(0, lines.size())
                .filter(i -> lineFontSize.applyAsDouble(lines.get(i)) == largestSize)
                .findFirst()
                .orElse(-1);
        if (startIdx < 0) {
            return null;
        }
        int endIdx = IntStream.range(startIdx + 1, lines.size())
                .filter(i -> lineFontSize.applyAsDouble(lines.get(i)) < largestSize)
                .findFirst()
                .orElse(lines.size() - 1);
        if (startIdx == endIdx) {
            return null;
        }

        List<PDFLine> titleLines = lines.subList(startIdx, endIdx);
        if (titleLines.size() == 1) {
            return titleLines.get(0).lineText();
        }

        PDFLine firstLine = titleLines.get(0);
        float fractionDownPage = firstLine.bounds().get(1) / (float)firstPage.getPageHeight();
        if ((double)fractionDownPage > 0.66 || startIdx > 5) {
            return null;
        }

        double lastGap = Double.NaN;
        for (int i = 0; i + 1 < titleLines.size(); ++i) {
            PDFLine current = titleLines.get(i);
            PDFLine next = titleLines.get(i + 1);
            double gap = next.bounds().get(1) - current.bounds().get(3);
            double gapInLineHeights = gap / (double)current.height();
            // lastGap is only meaningful from the second pair onwards.
            if (gapInLineHeights > 1.5 || i > 0 && PDFExtractor.relDiff(gap, lastGap) > 0.1) {
                titleLines = titleLines.subList(0, i + 1);
                break;
            }
            lastGap = gap;
        }
        return titleLines.stream().map(PDFLine::lineText).collect(Collectors.joining(" "));
    }

    private class PDFCaptureTextStripper
    extends PDFTextStripper {
        private List<PDFPage> pages = new ArrayList<PDFPage>();
        private List<PDFLine> curLines;
        private List<PDFToken> curLineTokens;
        private PDFToken lastToken;

        protected void writeString(String string, List<TextPosition> list) throws IOException {
            ArrayList<TextPosition> arrayList = new ArrayList<TextPosition>();
            ArrayList<PDFToken> arrayList2 = new ArrayList<PDFToken>();
            double d = -1.0;
            for (TextPosition object : list) {
                ArrayList<Object> arrayList3;
                if (d > 0.0 && (double)object.getX() < d) {
                    arrayList3 = new ArrayList(arrayList);
                    if (arrayList3.size() > 0) {
                        arrayList2.add(RawChunk.of(arrayList3).toPDFToken());
                    }
                    arrayList.clear();
                    arrayList.add(object);
                } else if (object.getUnicode().trim().isEmpty()) {
                    arrayList3 = new ArrayList<TextPosition>(arrayList);
                    if (arrayList3.size() > 0) {
                        arrayList2.add(RawChunk.of(arrayList3).toPDFToken());
                    }
                    arrayList.clear();
                } else {
                    arrayList.add(object);
                }
                d = object.getX();
            }
            if (!arrayList.isEmpty()) {
                arrayList2.add(RawChunk.of(new ArrayList<TextPosition>(arrayList)).toPDFToken());
            }
            for (PDFToken pDFToken : arrayList2) {
                this.updateFromToken(pDFToken);
            }
        }

        private void updateFromToken(PDFToken pDFToken) {
            if (this.curLineTokens.isEmpty()) {
                this.curLineTokens.add(pDFToken);
            } else {
                boolean bl;
                double d = pDFToken.bounds.get(1);
                double d2 = pDFToken.bounds.get(3);
                assert (d <= d2);
                double d3 = this.lastToken.bounds.get(1);
                double d4 = this.lastToken.bounds.get(3);
                assert (d3 <= d4);
                boolean bl2 = d >= d3 && d <= d4 || d2 >= d3 && d2 <= d4;
                float f = Math.max(pDFToken.getFontMetrics().getSpaceWidth(), pDFToken.getFontMetrics().ptSize);
                float f2 = pDFToken.bounds.get(0) - this.lastToken.bounds.get(2);
                boolean bl3 = bl = f2 > 0.0f && f2 < 4.0f * f;
                if (bl2 && bl) {
                    this.curLineTokens.add(pDFToken);
                } else {
                    this.curLines.add(this.toLine(this.curLineTokens));
                    this.curLineTokens.clear();
                    this.curLineTokens.add(pDFToken);
                }
            }
            this.lastToken = pDFToken;
        }

        protected void startPage(PDPage pDPage) {
            this.curLines = new ArrayList<PDFLine>();
            this.curLineTokens = new ArrayList<PDFToken>();
        }

        private PDFLine toLine(List<PDFToken> list) {
            return PDFLine.builder().tokens(new ArrayList<PDFToken>(list)).build();
        }

        protected void endPage(PDPage pDPage) {
            if (!this.curLineTokens.isEmpty()) {
                this.curLines.add(this.toLine(this.curLineTokens));
            }
            PDRectangle pDRectangle = pDPage.getMediaBox() == null ? pDPage.getArtBox() : pDPage.getMediaBox();
            PDFPage pDFPage = PDFPage.builder().lines(new ArrayList<PDFLine>(this.curLines)).pageNumber(this.pages.size()).pageWidth((int)pDRectangle.getWidth()).pageHeight((int)pDRectangle.getHeight()).build();
            this.pages.add(pDFPage);
        }
    }

    /**
     * A run of text positions that together form one token.
     */
    private static final class RawChunk {
        /** The positions making up this chunk; stored without copying. */
        public final List<TextPosition> textPositions;

        private RawChunk(List<TextPosition> list) {
            this.textPositions = list;
        }

        /** Creates a chunk wrapping the given positions. */
        public static RawChunk of(List<TextPosition> list) {
            return new RawChunk(list);
        }

        public List<TextPosition> getTextPositions() {
            return this.textPositions;
        }

        /**
         * Rebuilds the chunk text from the positions that pass a vertical filter.
         *
         * <p>A position is kept when its Y is greater than the vertical midpoint of
         * the bounds, or when it lies more than one sixth of the bounds height on
         * the other side of the midpoint. Presumably this drops superscript glyphs;
         * the thresholds are taken as-is from the code.
         *
         * @param string    the unfiltered chunk text; not used
         * @param floatList bounds; indices 1 and 3 are read as the vertical extent
         * @return the concatenated text of the kept positions
         */
        public String discardSuperscripts(String string, FloatList floatList) {
            double midY = (double)(floatList.get(3) + floatList.get(1)) / 2.0;
            double height = floatList.get(3) - floatList.get(1);
            return this.textPositions.stream()
                    .filter(position -> (double)position.getY() > midY
                            || midY - (double)position.getY() > height / 6.0)
                    .map(TextPosition::getUnicode)
                    .collect(Collectors.joining());
        }

        /**
         * Converts this chunk into a token carrying font metrics, bounds and text.
         *
         * <p>Font metrics come from the first position. A point size above 45 is
         * divided by 10 (NOTE(review): presumably compensates for mis-scaled font
         * sizes; confirm). Bounds are stored as [min X, min (Y - height),
         * max (X + width), max Y]. The text is filtered by
         * {@code discardSuperscripts} and NFKC-normalised.
         *
         * @return the token for this chunk
         */
        public PDFToken toPDFToken() {
            PDFToken.PDFTokenBuilder builder = PDFToken.builder();
            String rawText = this.textPositions.stream().map(TextPosition::getUnicode).collect(Collectors.joining(""));

            TextPosition first = this.textPositions.get(0);
            PDFontDescriptor descriptor = first.getFont().getFontDescriptor();
            String baseName;
            if (descriptor == null) {
                baseName = PDFFontMetrics.UNKNWON_FONT_FAMILY;
            } else {
                baseName = descriptor.getFontName();
            }
            float ptSize = first.getFontSizeInPt();
            if (ptSize > 45.0f) {
                ptSize /= 10.0f;
            }
            String fontKey = baseName + "_" + ptSize + "_" + first.getWidthOfSpace();
            builder.fontMetrics(PDFFontMetrics.of(fontKey, ptSize, first.getWidthOfSpace()));

            float minX = Float.POSITIVE_INFINITY;
            float maxX = Float.NEGATIVE_INFINITY;
            float minY = Float.POSITIVE_INFINITY;
            float maxY = Float.NEGATIVE_INFINITY;
            for (TextPosition position : this.textPositions) {
                float left = position.getX();
                float right = left + position.getWidth();
                float upper = position.getY() - position.getHeight();
                float lower = position.getY();
                // Plain comparisons (not Math.min/max) so NaN values are ignored.
                if (left < minX) {
                    minX = left;
                }
                if (right > maxX) {
                    maxX = right;
                }
                if (upper < minY) {
                    minY = upper;
                }
                if (lower > maxY) {
                    maxY = lower;
                }
            }
            FloatArrayList bounds = FloatArrayList.newListWith(minX, minY, maxX, maxY);
            builder.bounds((FloatList)bounds);

            String filtered = this.discardSuperscripts(rawText, (FloatList)bounds);
            builder.token(Normalizer.normalize(filtered, Normalizer.Form.NFKC));
            return builder.build();
        }

        /** Two chunks are equal when their position lists are equal. */
        @Override
        public boolean equals(Object object) {
            if (object == this) {
                return true;
            }
            if (!(object instanceof RawChunk)) {
                return false;
            }
            List<TextPosition> mine = this.getTextPositions();
            List<TextPosition> theirs = ((RawChunk)object).getTextPositions();
            return mine == null ? theirs == null : mine.equals(theirs);
        }

        @Override
        public int hashCode() {
            List<TextPosition> positions = this.getTextPositions();
            return 59 + (positions == null ? 43 : positions.hashCode());
        }

        @Override
        public String toString() {
            return "PDFExtractor.RawChunk(textPositions=" + this.getTextPositions() + ")";
        }
    }

    /**
     * Configuration for {@code PDFExtractor}, created through {@link #builder()}.
     */
    public static class Options {
        /** Whether a title may be guessed from page layout when metadata gives none. */
        public boolean useHeuristicTitle;

        Options(boolean useHeuristicTitle) {
            this.useHeuristicTitle = useHeuristicTitle;
        }

        /**
         * @return a new builder with {@code useHeuristicTitle} unset ({@code false})
         */
        public static OptionsBuilder builder() {
            return new OptionsBuilder();
        }

        /**
         * Fluent builder for {@link Options}.
         */
        public static class OptionsBuilder {
            private boolean useHeuristicTitle;

            OptionsBuilder() {
            }

            /**
             * Sets the heuristic-title flag.
             *
             * @param enabled value for {@code useHeuristicTitle}
             * @return this builder
             */
            public OptionsBuilder useHeuristicTitle(boolean enabled) {
                useHeuristicTitle = enabled;
                return this;
            }

            /**
             * @return a new {@link Options} carrying the builder's current values
             */
            public Options build() {
                return new Options(useHeuristicTitle);
            }

            @Override
            public String toString() {
                return "PDFExtractor.Options.OptionsBuilder(useHeuristicTitle=" + useHeuristicTitle + ")";
            }
        }
    }
}

