/*
 * Decompiled with CFR 0.152.
 */
package org.allenai.scienceparse;

import com.gs.collections.api.block.function.Function;
import com.gs.collections.api.block.predicate.primitive.FloatIntPredicate;
import com.gs.collections.api.map.primitive.MutableFloatIntMap;
import com.gs.collections.api.tuple.Pair;
import com.gs.collections.api.tuple.primitive.FloatIntPair;
import com.gs.collections.impl.factory.primitive.FloatIntMaps;
import com.gs.collections.impl.tuple.Tuples;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.allenai.scienceparse.PDFToCRFInput;
import org.allenai.scienceparse.RegexWithTimeout;
import org.allenai.scienceparse.pdfapi.PDFDoc;
import org.allenai.scienceparse.pdfapi.PDFLine;
import org.allenai.scienceparse.pdfapi.PDFPage;
import org.allenai.scienceparse.pdfapi.PDFToken;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PDFDocToPartitionedText {
    /** Logger shared by all static helpers of this class. */
    private static final Logger log = LoggerFactory.getLogger(PDFDocToPartitionedText.class);

    /**
     * Matches the word "abstract" at the very start of a block, optionally followed by one
     * space and one punctuation character. Case-insensitive (Unicode-aware).
     */
    private static final Pattern inLineAbstractPattern =
            Pattern.compile("^abstract ?\\p{P}?", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /**
     * Patterns whose first match (through the end of the text) is removed from a candidate
     * abstract. Entries compiled with {@code UNICODE_CASE} alone remain case-sensitive.
     * NOTE(review): the leading "0" of the {@code "0 [1-2][0-9]{3}"} entry may be a mangled
     * symbol; confirm against the upstream source.
     */
    private static final Pattern[] generalAbstractCleaners = {
            Pattern.compile("Key ?words(:| |\\.).*$", Pattern.UNICODE_CASE),
            Pattern.compile("KEY ?WORDS(:| |\\.).*$", Pattern.UNICODE_CASE),
            Pattern.compile("Key ?Words(:| |\\.).*$", Pattern.UNICODE_CASE),
            Pattern.compile("(1|I)\\.? Introduction.*$", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE),
            Pattern.compile("Categories and Subject Descriptors.*$", Pattern.UNICODE_CASE),
            Pattern.compile("0 [1-2][0-9]{3}.*$", Pattern.UNICODE_CASE),
            Pattern.compile("Contents.*$", Pattern.UNICODE_CASE),
            Pattern.compile("Index terms\\p{P}.*$", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE)
    };

    /**
     * Matches a leading "summary" (optionally followed by a space and a punctuation character);
     * stripped from the first text block when it is used as the abstract.
     */
    private static final Pattern paragraphAbstractCleaner =
            Pattern.compile("^summary ?\\p{P}?", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /** Lower-case words that are treated as the heading of a reference section. */
    public static Set<String> referenceHeaders = new HashSet<String>(
            Arrays.asList("references", "citations", "bibliography", "reference", "bibliographie"));

    /** Matches a line that starts with a one- or two-digit number plus a dot, or with "[". */
    private static Pattern referenceStartPattern = Pattern.compile("^\\d{1,2}\\.|^\\[");

    /**
     * Splits the text of a document into raw blocks.
     *
     * <p>Lines are walked page by page. Whenever the break between a line and the line before it
     * exceeds {@link #getRawBlockLineBreak(PDFDoc)}, the text gathered so far is emitted as a
     * block (even when nothing has been gathered yet). Whatever is left at the end of a page is
     * emitted as a block as well. Inside a block, lines are separated by the literal marker
     * {@code <lb>}; a trailing marker is removed before the block is stored.
     *
     * @param pDFDoc the document to partition
     * @return the blocks in reading order
     */
    public static List<String> getRaw(PDFDoc pDFDoc) {
        List<String> blocks = new ArrayList<String>();
        StringBuilder pending = new StringBuilder();
        double threshold = PDFDocToPartitionedText.getRawBlockLineBreak(pDFDoc);
        PDFLine previous = null;
        for (PDFPage page : pDFDoc.getPages()) {
            for (PDFLine line : page.getLines()) {
                boolean largeBreak = (double)PDFDocToPartitionedText.breakSize(line, previous) > threshold;
                if (largeBreak) {
                    blocks.add(PDFDocToPartitionedText.withoutTrailingLineBreak(pending.toString()));
                    pending.setLength(0);
                }
                String text = PDFDocToPartitionedText.lineToString(line);
                if (!text.isEmpty()) {
                    pending.append(text).append("<lb>");
                }
                previous = line;
            }
            // A page end always closes the current block, but never produces an empty one.
            if (pending.length() > 0) {
                blocks.add(PDFDocToPartitionedText.withoutTrailingLineBreak(pending.toString()));
                pending.setLength(0);
            }
        }
        return blocks;
    }

    /** Returns {@code text} with one trailing {@code <lb>} marker removed, if present. */
    private static String withoutTrailingLineBreak(String text) {
        return text.endsWith("<lb>") ? text.substring(0, text.length() - 4) : text;
    }

    /**
     * Measures the vertical break between two lines, relative to line height.
     *
     * <p>The result is {@code getY(pDFLine, true) - getY(pDFLine2, false)} divided by the smaller
     * of the two lines' {@code getH} values.
     *
     * @param pDFLine  the current line; may be {@code null}
     * @param pDFLine2 the preceding line; may be {@code null}
     * @return the normalized break, or {@code 0} when either argument is {@code null}
     */
    public static float breakSize(PDFLine pDFLine, PDFLine pDFLine2) {
        if (pDFLine != null && pDFLine2 != null) {
            float previousHeight = PDFToCRFInput.getH(pDFLine2);
            float currentHeight = PDFToCRFInput.getH(pDFLine);
            float distance = PDFToCRFInput.getY(pDFLine, true) - PDFToCRFInput.getY(pDFLine2, false);
            return distance / Math.min(previousHeight, currentHeight);
        }
        return 0.0f;
    }

    /**
     * Collects the strictly positive break sizes between consecutive lines of one page.
     *
     * @param pDFPage the page whose lines are examined
     * @return the positive break sizes, sorted ascending
     */
    private static List<Double> getBreaks(PDFPage pDFPage) {
        List<Double> positiveBreaks = new ArrayList<Double>();
        PDFLine previous = null;
        for (PDFLine line : pDFPage.getLines()) {
            double size = PDFDocToPartitionedText.breakSize(line, previous);
            previous = line;
            if (size > 0.0) {
                positiveBreaks.add(size);
            }
        }
        positiveBreaks.sort(Double::compare);
        return positiveBreaks;
    }

    /**
     * Collects the positive break sizes of every page of a document.
     *
     * @param pDFDoc the document whose pages are examined
     * @return all per-page break sizes merged into one list, sorted ascending
     */
    private static List<Double> getBreaks(PDFDoc pDFDoc) {
        List<Double> allBreaks = new ArrayList<Double>();
        for (PDFPage page : pDFDoc.getPages()) {
            List<Double> pageBreaks = PDFDocToPartitionedText.getBreaks(page);
            allBreaks.addAll(pageBreaks);
        }
        allBreaks.sort(Double::compare);
        return allBreaks;
    }

    /**
     * Computes the line-break threshold used when extracting references.
     *
     * @param pDFDoc the document to analyse
     * @return the sorted break size at index {@code 7 * size / 9} (integer division), or
     *         {@code 1.0} when the document has no positive breaks
     */
    public static double getReferenceLineBreak(PDFDoc pDFDoc) {
        List<Double> sortedBreaks = PDFDocToPartitionedText.getBreaks(pDFDoc);
        if (!sortedBreaks.isEmpty()) {
            return sortedBreaks.get(7 * sortedBreaks.size() / 9);
        }
        return 1.0;
    }

    /**
     * Computes the line-break threshold used when splitting a document into raw blocks.
     *
     * @param pDFDoc the document to analyse
     * @return the sorted break size at index {@code 7 * size / 9} (integer division), or
     *         {@code 1.0} when the document has no positive breaks
     */
    public static double getRawBlockLineBreak(PDFDoc pDFDoc) {
        List<Double> sortedBreaks = PDFDocToPartitionedText.getBreaks(pDFDoc);
        int count = sortedBreaks.size();
        return count == 0 ? 1.0 : sortedBreaks.get(7 * count / 9);
    }

    /**
     * Computes the break threshold used to partition the first page into text blocks.
     *
     * @param pDFPage the page to analyse
     * @return the sorted break size at index {@code 3 * size / 6} (integer division) plus
     *         {@code 0.5}, or {@code 1.0} when the page has no positive breaks
     */
    public static double getFirstPagePartitionBreak(PDFPage pDFPage) {
        List<Double> sortedBreaks = PDFDocToPartitionedText.getBreaks(pDFPage);
        if (!sortedBreaks.isEmpty()) {
            double middleBreak = sortedBreaks.get(3 * sortedBreaks.size() / 6);
            return middleBreak + 0.5;
        }
        return 1.0;
    }

    /**
     * Renders a line as text: its tokens joined by single spaces, with leading and trailing
     * whitespace trimmed from the result.
     *
     * @param pDFLine the line to render
     * @return the trimmed text of the line; empty when the line has no tokens
     */
    private static String lineToString(PDFLine pDFLine) {
        StringBuilder text = new StringBuilder();
        boolean needsSeparator = false;
        for (PDFToken tok : pDFLine.tokens) {
            if (needsSeparator) {
                text.append(' ');
            }
            text.append(tok.token);
            needsSeparator = true;
        }
        return text.toString().trim();
    }

    /** Matches a single carriage return, tab or line feed. */
    private static final Pattern lineControlCharPattern = Pattern.compile("\r|\t|\n");

    /** Matches a run of two or more consecutive spaces. */
    private static final Pattern spaceRunPattern = Pattern.compile(" {2,}");

    /**
     * Normalizes whitespace in a piece of text: every carriage return, tab and line feed becomes
     * a space, the result is trimmed, and every run of spaces is collapsed to a single space.
     *
     * <p>The patterns are compiled once and the collapse is done in a single pass, instead of
     * recompiling a regex and rescanning the whole string until no double space is left.
     *
     * @param string the text to normalize; must not be {@code null}
     * @return the normalized text
     */
    private static String cleanLine(String string) {
        String singleLine = lineControlCharPattern.matcher(string).replaceAll(" ").trim();
        return spaceRunPattern.matcher(singleLine).replaceAll(" ");
    }

    /**
     * Returns the first sizeable text block of the first page.
     *
     * <p>The very first line of the page is always skipped. Remaining lines are accumulated
     * (each one cleaned and prefixed with a space). When the break before a line exceeds
     * {@link #getFirstPagePartitionBreak(PDFPage)}, the accumulated block is returned if it is
     * longer than 400 characters; otherwise it is discarded and a new block starts with that
     * line. A block that is still open when the page ends is never returned.
     *
     * @param pDFDoc the document to read
     * @return the trimmed block, or an empty string when no block qualifies or the document
     *         has no pages
     */
    public static String getFirstTextBlock(PDFDoc pDFDoc) {
        // A document without pages has no first block; previously this threw from pages.get(0).
        if (pDFDoc.pages == null || pDFDoc.pages.isEmpty()) {
            return "";
        }
        PDFPage firstPage = pDFDoc.pages.get(0);
        double threshold = PDFDocToPartitionedText.getFirstPagePartitionBreak(firstPage);
        StringBuilder block = new StringBuilder();
        PDFLine previous = null;
        boolean firstLineSkipped = false;
        for (PDFLine line : firstPage.lines) {
            if (!firstLineSkipped) {
                // The skipped line is deliberately not recorded as "previous".
                firstLineSkipped = true;
                continue;
            }
            if ((double)PDFDocToPartitionedText.breakSize(line, previous) > threshold) {
                if (block.length() > 400) {
                    return block.toString().trim();
                }
                // Too short to be the block we want: drop it and start over with this line.
                block.setLength(0);
            }
            block.append(' ');
            block.append(PDFDocToPartitionedText.cleanLine(PDFDocToPartitionedText.lineToString(line)));
            previous = line;
        }
        return "";
    }

    /**
     * Extracts the abstract from the raw text blocks of a document.
     *
     * <p>Blocks are scanned in order. A block starts the abstract when it
     * <ul>
     *   <li>contains "abstract" (case-insensitive) and is shorter than 10 characters,</li>
     *   <li>contains "a b s t r a c t" (case-insensitive), or</li>
     *   <li>matches {@code inLineAbstractPattern}; in that case the rest of the block after the
     *       matched prefix is part of the abstract.</li>
     * </ul>
     * Once started, every following block is appended until one shorter than 20 characters is
     * met. If nothing was collected, the first text block of the document is used instead, with
     * a leading "summary" removed. Finally the {@code generalAbstractCleaners} are applied and
     * every "- " sequence is deleted.
     *
     * @param list   the raw text blocks of the document
     * @param pDFDoc the document, used for the fallback first text block
     * @return the extracted abstract; may be empty
     */
    public static String getAbstract(List<String> list, PDFDoc pDFDoc) {
        boolean inAbstract = false;
        StringBuilder collected = new StringBuilder();
        for (String block : list) {
            if (inAbstract) {
                // A short block after the abstract has started marks its end.
                if (block.length() < 20) {
                    break;
                }
                collected.append(' ');
                collected.append(block.trim());
            }
            String lowered = block.toLowerCase();
            if (lowered.contains("abstract") && block.length() < 10) {
                inAbstract = true;
            } else if (lowered.contains("a b s t r a c t")) {
                inAbstract = true;
            } else if (RegexWithTimeout.matcher(inLineAbstractPattern, (CharSequence)block).find()) {
                collected.append(RegexWithTimeout.matcher(inLineAbstractPattern, (CharSequence)block).replaceFirst(""));
                inAbstract = true;
            }
        }
        String result = collected.toString().trim();
        if (result.length() == 0) {
            // No abstract heading found: fall back to the first paragraph-like block.
            result = PDFDocToPartitionedText.getFirstTextBlock(pDFDoc);
            result = RegexWithTimeout.matcher(paragraphAbstractCleaner, (CharSequence)result).replaceFirst("");
        }
        for (Pattern cleaner : generalAbstractCleaners) {
            result = RegexWithTimeout.matcher(cleaner, (CharSequence)result).replaceFirst("");
        }
        return result.replaceAll("- ", "");
    }

    /**
     * Lenient test for the first line of a reference list.
     *
     * <p>The line qualifies when its first token is exactly "[1]" or "1.", it has more than one
     * token, and either the second token's {@code getX(token, true)} value exceeds the first
     * token's space width or the break to the previous line exceeds {@code d}.
     *
     * @param pDFLine  the candidate line; must contain at least one token
     * @param pDFLine2 the line before the candidate; may be {@code null}
     * @param d        the break-size threshold
     * @return {@code true} when the line looks like the start of the references
     */
    private static boolean lenientRefStart(PDFLine pDFLine, PDFLine pDFLine2, double d) {
        PDFToken first = pDFLine.tokens.get(0);
        boolean startsNumbering = first.token.equals("[1]") || first.token.equals("1.");
        if (!startsNumbering || pDFLine.tokens.size() <= 1) {
            return false;
        }
        if (PDFToCRFInput.getX(pDFLine.tokens.get(1), true) > first.fontMetrics.spaceWidth) {
            return true;
        }
        return (double)PDFDocToPartitionedText.breakSize(pDFLine, pDFLine2) > d;
    }

    /**
     * Tells whether the horizontal gap between two tokens is wide and sits near the middle of
     * the page.
     *
     * @param pDFToken  the token before the gap
     * @param pDFToken2 the token after the gap
     * @param pDFPage   the page, used for its width
     * @param f         the unit of measure (callers in this class pass a space width)
     * @return {@code true} when the gap is wider than {@code 5 * f} and its centre lies within
     *         {@code 50 * f} of half the page width
     */
    private static boolean gapAcrossMiddle(PDFToken pDFToken, PDFToken pDFToken2, PDFPage pDFPage, float f) {
        double gapWidth = PDFToCRFInput.getXGap(pDFToken, pDFToken2);
        double pageCentre = (double)pDFPage.getPageWidth() / 2.0;
        double gapCentre = (double)(PDFToCRFInput.getX(pDFToken, false) + PDFToCRFInput.getX(pDFToken2, true)) / 2.0;
        boolean wideEnough = gapWidth > (double)(5.0f * f);
        boolean nearCentre = Math.abs(gapCentre - pageCentre) < (double)(50.0f * f);
        return wideEnough && nearCentre;
    }

    /**
     * Attempts to repair lines that run across two columns.
     *
     * <p>Every input line is split wherever {@link #gapAcrossMiddle} reports a wide gap near the
     * middle of the page between two neighbouring tokens. The resulting pieces are grouped per
     * page and ordered with {@link #lineSorter}. The re-ordered result is only used when more
     * than 10 such gaps were found and every gap centre lies within three times the mean space
     * width of the mean gap centre; otherwise the input list is returned untouched.
     *
     * <p>When re-ordering is used, the returned lines are new {@code PDFLine} objects built
     * from tokens only.
     *
     * @param list page/line pairs in their original order
     * @return the re-ordered page/line pairs, or {@code list} itself when re-ordering is rejected
     */
    public static List<Pair<PDFPage, PDFLine>> repairColumns(List<Pair<PDFPage, PDFLine>> list) {
        List<Pair<PDFPage, PDFLine>> reordered = new ArrayList<Pair<PDFPage, PDFLine>>();
        List<PDFLine> linesForPage = new ArrayList<PDFLine>();
        PDFPage lastPage = null;
        List<Double> gapCentres = new ArrayList<Double>();
        double spaceWidthSum = 0.0;
        for (Pair<PDFPage, PDFLine> pageAndLine : list) {
            PDFLine line = pageAndLine.getTwo();
            PDFPage page = pageAndLine.getOne();
            if (page != lastPage && lastPage != null) {
                // Page changed: emit the finished page in column order.
                PDFDocToPartitionedText.appendInColumnOrder(lastPage, linesForPage, reordered);
                linesForPage = new ArrayList<PDFLine>();
            }
            List<PDFToken> segment = new ArrayList<PDFToken>();
            PDFToken previousToken = null;
            for (PDFToken token : line.tokens) {
                if (previousToken != null && PDFDocToPartitionedText.gapAcrossMiddle(previousToken, token, page, previousToken.fontMetrics.spaceWidth)) {
                    // Column gap: close the current segment as a line of its own.
                    gapCentres.add((double)(PDFToCRFInput.getX(previousToken, false) + PDFToCRFInput.getX(token, true)) / 2.0);
                    spaceWidthSum += (double)previousToken.fontMetrics.spaceWidth;
                    linesForPage.add(PDFLine.builder().tokens(new ArrayList<PDFToken>(segment)).build());
                    segment = new ArrayList<PDFToken>();
                }
                segment.add(token);
                previousToken = token;
            }
            if (!segment.isEmpty()) {
                linesForPage.add(PDFLine.builder().tokens(new ArrayList<PDFToken>(segment)).build());
            }
            lastPage = page;
        }

        // Too few gaps to trust the two-column hypothesis.
        if (gapCentres.size() <= 10) {
            return list;
        }
        double meanSpaceWidth = spaceWidthSum / (double)gapCentres.size();
        double gapCentreSum = 0.0;
        for (double centre : gapCentres) {
            gapCentreSum += centre;
        }
        double meanGapCentre = gapCentreSum / (double)gapCentres.size();
        for (double centre : gapCentres) {
            // One outlier is enough to reject the re-ordering.
            if (Math.abs(centre - meanGapCentre) > 3.0 * meanSpaceWidth) {
                return list;
            }
        }

        // The lines of the last page are still pending at this point.
        PDFDocToPartitionedText.appendInColumnOrder(lastPage, linesForPage, reordered);
        log.info("using re-ordered lines: " + reordered.size());
        return reordered;
    }

    /** Sorts {@code lines} in place with {@link #lineSorter} and appends them to {@code out} paired with {@code page}. */
    private static void appendInColumnOrder(PDFPage page, List<PDFLine> lines, List<Pair<PDFPage, PDFLine>> out) {
        lines.sort((first, second) -> Double.compare(PDFDocToPartitionedText.lineSorter(first, page), PDFDocToPartitionedText.lineSorter(second, page)));
        for (PDFLine line : lines) {
            out.add(Tuples.pair(page, line));
        }
    }

    /**
     * Sort key that orders lines column by column: lines for which {@link #firstCol} is true
     * come first, and within each group lines are ordered by {@code getY(line, true)}.
     *
     * @param pDFLine the line to rank
     * @param pDFPage the page the line belongs to
     * @return {@code getY(line, true)}, increased by {@code 1.0E8} for lines outside the first column
     */
    private static double lineSorter(PDFLine pDFLine, PDFPage pDFPage) {
        double columnOffset = PDFDocToPartitionedText.firstCol(pDFLine, pDFPage) ? 0.0 : 1.0E8;
        return columnOffset + (double)PDFToCRFInput.getY(pDFLine, true);
    }

    /**
     * Tells whether a line belongs to the first column.
     *
     * @param pDFLine the line to test
     * @param pDFPage the page the line belongs to
     * @return {@code true} when {@code getX(line, true)} is less than one third of the page width
     */
    private static boolean firstCol(PDFLine pDFLine, PDFPage pDFPage) {
        double oneThirdOfPage = (double)pDFPage.getPageWidth() / 3.0;
        double lineX = (double)PDFToCRFInput.getX(pDFLine, true);
        return lineX < oneThirdOfPage;
    }

    /**
     * Extracts the raw reference strings of a document.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>Locate the reference section. Pass 0 looks for a header line (fewer than 5 tokens,
     *       last token in {@link #referenceHeaders} after trimming, lower-casing and stripping
     *       trailing punctuation). If pass 0 finds nothing, pass 1 additionally accepts a line
     *       satisfying {@link #lenientRefStart} on a page whose index exceeds a quarter of the
     *       page count. Every line after the start is collected together with its page.</li>
     *   <li>Run the collected lines through {@link #repairColumns}.</li>
     *   <li>Split the lines into groups; a new group starts on a page change or when a line's
     *       {@code getY(line, true)} is below the previous line's {@code getY(line, false)}.</li>
     *   <li>Per group, derive two hints (an indentation x-position and a typical large break
     *       size) and use them, together with {@code referenceStartPattern} and a 6-line cap,
     *       to decide where each reference starts.</li>
     * </ol>
     * Lines of one reference are joined with the literal marker {@code <lb>} and passed through
     * {@link #cleanLine}.
     *
     * <p>NOTE(review): this is decompiler output. Several locals are declared with one type and
     * later assigned another (e.g. {@code pDFLine} receives a float-to-int map,
     * {@code pDFLine322} receives a sorted list, {@code n5} is an iterator and later a String),
     * and some names are declared twice in the same scope, so the method will not compile as
     * written.
     *
     * @param pDFDoc the document to read
     * @return the reference strings in document order; empty when no reference section is found
     */
    public static List<String> getRawReferences(PDFDoc pDFDoc) {
        PDFLine pDFLine;
        // pDFLine2: the previously visited line.
        PDFLine pDFLine2 = null;
        // bl: currently inside the reference section; bl2: a reference section has been found.
        boolean bl = false;
        boolean bl2 = false;
        double d = PDFDocToPartitionedText.getReferenceLineBreak(pDFDoc);
        // bl3: lenient mode, enabled only for the second pass.
        boolean bl3 = false;
        List<Pair<PDFPage, PDFLine>> list = new ArrayList<Pair<PDFPage, PDFLine>>();
        // n2: number of pages, counted during pass 0.
        int n2 = 0;
        for (int i = 0; i < 2; ++i) {
            // n3: index of the page currently being scanned.
            int n3 = 0;
            if (i == 1) {
                // The second pass only runs when the first found no references.
                if (bl2) break;
                bl3 = true;
            }
            for (PDFPage pDFPage : pDFDoc.getPages()) {
                // d2 and d3 are never read.
                double d2 = Double.MAX_VALUE;
                double d3 = -1.0;
                for (PDFLine pDFLine322 : pDFPage.getLines()) {
                    if (!bl && pDFLine322 != null && pDFLine322.tokens != null && pDFLine322.tokens.size() > 0) {
                        // Header line: short, and its last token is a known reference heading.
                        // The header itself is not collected.
                        if (pDFLine322.tokens.get((int)(pDFLine322.tokens.size() - 1)).token != null && referenceHeaders.contains(pDFLine322.tokens.get((int)(pDFLine322.tokens.size() - 1)).token.trim().toLowerCase().replaceAll("\\p{Punct}*$", "")) && pDFLine322.tokens.size() < 5) {
                            bl = true;
                            bl2 = true;
                            pDFLine2 = pDFLine322;
                            continue;
                        }
                        // Lenient start: only past the first quarter of the pages. The matching
                        // line is itself the first reference line, so it is collected below.
                        if (bl3 && n3 > n2 / 4 && PDFDocToPartitionedText.lenientRefStart(pDFLine322, pDFLine2, d)) {
                            bl = true;
                            bl2 = true;
                        }
                    }
                    if (bl) {
                        list.add((Pair<PDFPage, PDFLine>)Tuples.pair((Object)pDFPage, (Object)pDFLine322));
                    }
                    pDFLine2 = pDFLine322;
                }
                if (i == 0) {
                    ++n2;
                }
                ++n3;
            }
        }
        list = PDFDocToPartitionedText.repairColumns(list);
        // arrayList: groups of consecutive lines (a list of lists of PDFLine).
        ArrayList arrayList = new ArrayList();
        PDFPage pDFPage = null;
        // d4: getY(line, false) of the previously grouped line.
        double d4 = Double.MAX_VALUE;
        for (Pair<PDFPage, PDFLine> object : list) {
            PDFPage pDFPage2 = (PDFPage)object.getOne();
            pDFLine = (PDFLine)object.getTwo();
            if (pDFLine.tokens.isEmpty()) continue;
            double d5 = PDFToCRFInput.getY(pDFLine, true);
            double f3 = PDFToCRFInput.getY(pDFLine, false);
            // New group on a page change or when the y position jumps back
            // (presumably the start of a new column -- TODO confirm coordinate direction).
            if (pDFPage2 != pDFPage || d5 < d4) {
                ArrayList<PDFLine> f5 = new ArrayList<PDFLine>();
                f5.add(pDFLine);
                arrayList.add(f5);
                d4 = f3;
            } else {
                ((List)arrayList.get(arrayList.size() - 1)).add(pDFLine);
                d4 = f3;
            }
            pDFPage = pDFPage2;
        }
        // arrayList3: the resulting reference strings.
        ArrayList arrayList3 = new ArrayList();
        for (List list2 : arrayList) {
            Object n5;
            float f22;
            PDFLine pDFLine322;
            // Histogram of left x positions: key = running average position, value = line count.
            // Positions closer than one space width to an existing key are merged into it.
            pDFLine = FloatIntMaps.mutable.empty();
            for (PDFLine pDFLine322 : list2) {
                float floatIntPair2 = PDFToCRFInput.getX(pDFLine322, true);
                float floatIntPair3 = pDFLine322.tokens.get((int)0).fontMetrics.spaceWidth;
                float f7 = floatIntPair2;
                for (float f22 : pDFLine.keySet().toArray()) {
                    if (!(Math.abs(f22 - floatIntPair2) < floatIntPair3)) continue;
                    f7 = f22;
                    break;
                }
                int n3 = pDFLine.getIfAbsent(f7, 0);
                pDFLine.remove(f7);
                pDFLine.put((f7 * (float)n3 + floatIntPair2) / (float)(n3 + 1), n3 + 1);
            }
            // f6: x position at which a reference's first line starts; -1000 means unknown.
            float f6 = -1000.0f;
            // The two most frequent x positions (sorted by descending count).
            pDFLine322 = pDFLine.keyValuesView().toSortedListBy((Function & Serializable)floatIntPair -> -floatIntPair.getTwo()).take(2).toImmutable();
            if (pDFLine322.size() > 1) {
                FloatIntPair floatIntPair2 = (FloatIntPair)pDFLine322.get(0);
                FloatIntPair floatIntPair3 = (FloatIntPair)pDFLine322.get(1);
                // i records the first snapped x position; it is not read afterwards.
                float i = -1.0f;
                for (PDFLine pDFLine6 : list2) {
                    float f9 = PDFToCRFInput.getX(pDFLine6, true);
                    f22 = pDFLine6.tokens.get((int)0).fontMetrics.spaceWidth;
                    // Snap the line to one of the two common positions; skip lines matching neither.
                    if (Math.abs(floatIntPair2.getOne() - f9) < f22) {
                        f9 = floatIntPair2.getOne();
                    } else {
                        if (!(Math.abs(floatIntPair3.getOne() - f9) < f22)) continue;
                        f9 = floatIntPair3.getOne();
                    }
                    if (i < 0.0f) {
                        i = f9;
                    }
                    // The first line that looks like a reference start fixes the start position.
                    if (!referenceStartPattern.matcher((CharSequence)(n5 = PDFDocToPartitionedText.lineToString(pDFLine6))).find()) continue;
                    f6 = f9;
                    break;
                }
                // No explicit start found: fall back to the second most frequent position.
                if (f6 < 0.0f) {
                    f6 = floatIntPair3.getOne();
                }
            }
            // Histogram of break sizes between consecutive lines; sizes within 0.1 are merged.
            MutableFloatIntMap f4 = FloatIntMaps.mutable.empty();
            for (int l = 1; l < list2.size(); ++l) {
                float f3;
                PDFLine pDFLine3 = (PDFLine)list2.get(l - 1);
                PDFLine pDFLine4 = (PDFLine)list2.get(l);
                f22 = f3 = PDFDocToPartitionedText.breakSize(pDFLine4, pDFLine3);
                for (float string : f4.keySet().toArray()) {
                    if (!(Math.abs((float)(string - f3)) <= 0.1f)) continue;
                    f22 = string;
                    break;
                }
                int n4 = f4.getIfAbsent(f22, 0);
                f4.remove(f22);
                f4.put((f22 * (float)n4 + f3) / (float)(n4 + 1), n4 + 1);
            }
            // Keep break sizes below 3 that occur more than once and make up at least a fifth
            // of all breaks.
            long l = f4.sum();
            f4 = f4.select((FloatIntPredicate & Serializable)(f, n) -> (long)n >= l / 5L && n > 1 && f < 3.0f);
            // f5: break size that separates references; -1 means "not usable". It is only set
            // for groups of more than 5 lines with at least two common break sizes.
            float f5 = -1.0f;
            if (list2.size() > 5 && f4.size() >= 2) {
                f5 = f4.keySet().max();
            }
            // n6: number of lines in the reference currently being assembled.
            int n6 = 0;
            StringBuilder f22 = new StringBuilder();
            pDFLine2 = null;
            n5 = list2.iterator();
            while (n5.hasNext()) {
                boolean bl4;
                PDFLine pDFLine5 = (PDFLine)n5.next();
                float f7 = PDFToCRFInput.getX(pDFLine5, true);
                String string = PDFDocToPartitionedText.lineToString(pDFLine5);
                float f8 = pDFLine5.tokens.get((int)0).fontMetrics.spaceWidth;
                // A new reference starts when: the current one would reach 6 lines; or (no usable
                // break size) the line sits at the start position; or the text looks like a
                // reference start; or the break before the line is about as large as f5.
                boolean bl5 = bl4 = ++n6 >= 6 || f5 < 0.0f && f6 > 0.0f && Math.abs(f7 - f6) < f8 || referenceStartPattern.matcher(string).find() || pDFLine2 != null && f5 > 0.0f && PDFDocToPartitionedText.breakSize(pDFLine5, pDFLine2) >= f5 - 0.1f;
                if (bl4) {
                    // Flush the finished reference (if any) and start a new one with this line.
                    String string2 = PDFDocToPartitionedText.cleanLine(f22.toString());
                    if (!string2.isEmpty()) {
                        arrayList3.add(string2);
                    }
                    f22.setLength(0);
                    f22.append(string);
                    n6 = 1;
                } else {
                    f22.append("<lb>");
                    f22.append(string);
                }
                pDFLine2 = pDFLine5;
            }
            // Flush the last reference of the group.
            n5 = PDFDocToPartitionedText.cleanLine(f22.toString());
            if (((String)n5).isEmpty()) continue;
            arrayList3.add(n5);
        }
        return arrayList3;
    }
}

