/*
 * Decompiled with CFR 0.152.
 */
package org.allenai.scienceparse;

import com.google.common.collect.UnmodifiableIterator;
import com.gs.collections.api.block.function.Function;
import com.gs.collections.api.block.function.primitive.DoubleToDoubleFunction;
import com.gs.collections.api.block.predicate.Predicate;
import com.gs.collections.api.map.primitive.ObjectDoubleMap;
import com.gs.collections.api.tuple.Pair;
import com.gs.collections.impl.map.mutable.primitive.ObjectDoubleHashMap;
import com.gs.collections.impl.tuple.Tuples;
import com.medallia.word2vec.Searcher;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.allenai.datastore.Datastore;
import org.allenai.ml.sequences.crf.CRFPredicateExtractor;
import org.allenai.scienceparse.PaperToken;
import org.allenai.scienceparse.Parser;
import org.allenai.scienceparse.ParserLMFeatures;
import org.allenai.scienceparse.PrintFeaturizedCRFInput;
import org.allenai.scienceparse.StringUtils;
import org.allenai.scienceparse.WordVectorCache;
import org.allenai.scienceparse.pdfapi.PDFToken;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Converts a sequence of {@link PaperToken}s into the node and edge predicates
 * (named, real-valued features) consumed by a CRF.
 *
 * <p>Node predicates describe a single token: layout (font size, height, line
 * changes, vertical gaps), surface form (case masks, stop words, length,
 * character trigrams), optional language-model frequencies taken from
 * {@link ParserLMFeatures}, and the token's word2vec embedding when the word is
 * known to the model. The first and last element of the input sequence are
 * treated as start/stop markers and only receive {@code <S>} / {@code </S>}.
 *
 * <p>All feature names are part of the contract with whatever consumes the
 * produced maps and must not be changed casually.
 */
public class PDFPredicateExtractor
implements CRFPredicateExtractor<PaperToken, String> {
    private static final Logger log = LoggerFactory.getLogger(PDFPredicateExtractor.class);

    /** Words treated as stop words by {@link #isStopWord(String)}. */
    public static final List<String> stopWords = Arrays.asList("a", "an", "the", "in", "of", "for", "from", "and", "as", "but", "to");

    /** Hash-set view of {@link #stopWords} for constant-time lookup. */
    public static final HashSet<String> stopHash = new HashSet<String>(stopWords);

    /**
     * Feature names for the word-embedding dimensions ({@code %emb000},
     * {@code %emb001}, ...). Names exist for at most 1000 dimensions.
     */
    public static final String[] wordEmbeddingFeatureNames = new String[1000];

    static {
        for (int i = 0; i < wordEmbeddingFeatureNames.length; ++i) {
            PDFPredicateExtractor.wordEmbeddingFeatureNames[i] = String.format("%%emb%03d", i);
        }
    }

    /**
     * Case-mask patterns, compiled once. Each pattern must match the WHOLE token
     * for its mask to apply. Index-aligned with {@link #CASE_MASK_NAMES}.
     */
    private static final Pattern[] CASE_MASK_PATTERNS = {
        Pattern.compile("[A-Z][a-z]*"),
        Pattern.compile("[a-z]+"),
        Pattern.compile("[a-zA-Z]+"),
        Pattern.compile("[0-9]+"),
        Pattern.compile(".*[0-9]+.*"),
        Pattern.compile("[A-Z]\\."),
        Pattern.compile(".*[^\\p{ASCII}]+.*"),
        Pattern.compile("[a-zA-Z]+:"),
        Pattern.compile(".*@.*"),
    };

    /** Feature names emitted for the patterns in {@link #CASE_MASK_PATTERNS}. */
    private static final String[] CASE_MASK_NAMES = {
        "%Xxx", "%xxx", "%letters", "%dig", "%hasNum", "%letDot", "%hasNonAscii", "%capWordColon", "%hasAt",
    };

    /** Optional language-model statistics; {@code null} disables the frequency features. */
    private final ParserLMFeatures lmFeats;

    /** Word2vec lookup used for the embedding features. */
    private final Searcher word2vecSearcher;

    /** Creates an extractor without language-model frequency features. */
    public PDFPredicateExtractor() {
        this(null);
    }

    /**
     * Creates an extractor, resolving {@code Word2VecModel.bin} through the
     * datastore and obtaining a searcher for it from {@link WordVectorCache}.
     *
     * @param parserLMFeatures language-model statistics, or {@code null} to
     *                         skip the frequency features
     * @throws RuntimeException wrapping the {@link IOException} if the model
     *                          cannot be loaded
     */
    public PDFPredicateExtractor(ParserLMFeatures parserLMFeatures) {
        try {
            Path modelPath = Datastore.apply().filePath("org.allenai.scienceparse", "Word2VecModel.bin", 1);
            this.word2vecSearcher = WordVectorCache.searcherForPath(modelPath);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
        this.lmFeats = parserLMFeatures;
    }

    /**
     * Returns the names of all case masks whose pattern matches the entire token.
     *
     * @param string the raw token text
     * @return a new mutable list of mask feature names, in the fixed order of
     *         {@link #CASE_MASK_NAMES}; empty if nothing matches
     */
    public static List<String> getCaseMasks(String string) {
        ArrayList<String> masks = new ArrayList<String>();
        for (int i = 0; i < CASE_MASK_PATTERNS.length; ++i) {
            if (CASE_MASK_PATTERNS[i].matcher(string).matches()) {
                masks.add(CASE_MASK_NAMES[i]);
            }
        }
        return masks;
    }

    /**
     * @param string the token text (compared case-sensitively)
     * @return {@code true} if the token is one of {@link #stopWords}
     */
    public static boolean isStopWord(String string) {
        return stopHash.contains(string);
    }

    /**
     * Returns one of the two vertical bounds of a token.
     *
     * @param paperToken the token
     * @param bl         {@code true} for {@code bounds.get(1)}, {@code false}
     *                   for {@code bounds.get(3)}
     * @return the selected bound
     */
    public static float getY(PaperToken paperToken, boolean bl) {
        return paperToken.getPdfToken().bounds.get(bl ? 1 : 3);
    }

    /**
     * Computes a smoothed log10 frequency for a word.
     *
     * <p>A positive stored value is reduced by 0.6 before 0.1 is added, so an
     * absent word (value 0) maps to {@code log10(0.1)} and a value of 1 maps to
     * {@code log10(0.5)}.
     * NOTE(review): stored values in (0, 0.5] would yield -Infinity or NaN;
     * presumably the maps hold whole-number counts - confirm.
     *
     * @param string              the word to look up
     * @param objectDoubleHashMap word-to-frequency map
     * @return the smoothed log10 frequency
     */
    public static double smoothFreq(String string, ObjectDoubleHashMap<String> objectDoubleHashMap) {
        double freq = objectDoubleHashMap.get(string);
        if (freq > 0.0) {
            freq -= 0.6;
        }
        return Math.log10(freq + 0.1);
    }

    /** Returns the token height, {@code bounds.get(3) - bounds.get(1)}. */
    private static float height(PDFToken pDFToken) {
        return pDFToken.bounds.get(3) - pDFToken.bounds.get(1);
    }

    /**
     * Returns the token width, {@code bounds.get(2) - bounds.get(0)}.
     *
     * <p>The operands used to be swapped, which produced a negated width. This
     * orientation mirrors {@link #height(PDFToken)}; it assumes bounds are laid
     * out as [x0, y0, x1, y1] - TODO confirm against {@code PDFToken}.
     */
    private static float width(PDFToken pDFToken) {
        return pDFToken.bounds.get(2) - pDFToken.bounds.get(0);
    }

    /**
     * Finds the minimum or maximum of a token property over a list.
     *
     * @param list                  the tokens to scan
     * @param tokenPropertySelector extracts the property from a token
     * @param bl                    {@code true} for the maximum, {@code false}
     *                              for the minimum
     * @return the extreme value; for an empty list {@code -Infinity} when asking
     *         for the maximum and {@code +Infinity} when asking for the minimum
     */
    public static float getExtreme(List<PaperToken> list, TokenPropertySelector tokenPropertySelector, boolean bl) {
        // The minimum is found as the maximum of the negated values.
        float sign = bl ? 1.0f : -1.0f;
        float best = Float.NEGATIVE_INFINITY;
        for (PaperToken token : list) {
            float candidate = tokenPropertySelector.getProp(token) * sign;
            if (candidate > best) {
                best = candidate;
            }
        }
        return best * sign;
    }

    /**
     * Linearly rescales {@code f} so that {@code pair.getOne()} maps to 0 and
     * {@code pair.getTwo()} maps to 1.
     *
     * @param f    the value to normalize
     * @param pair (min, max) range
     * @return the normalized value, or 0.5 when the range is narrower than 1e-8
     */
    public static float linearNormalize(float f, Pair<Float, Float> pair) {
        float low = pair.getOne();
        float high = pair.getTwo();
        if ((double)Math.abs(high - low) < 1.0E-8) {
            return 0.5f;
        }
        return (f - low) / (high - low);
    }

    /**
     * @param list                  the tokens to scan
     * @param tokenPropertySelector extracts the property from a token
     * @return the (minimum, maximum) of the property over the list
     */
    public static Pair<Float, Float> getExtrema(List<PaperToken> list, TokenPropertySelector tokenPropertySelector) {
        Float min = Float.valueOf(PDFPredicateExtractor.getExtreme(list, tokenPropertySelector, false));
        Float max = Float.valueOf(PDFPredicateExtractor.getExtreme(list, tokenPropertySelector, true));
        return Tuples.pair(min, max);
    }

    /**
     * Returns the token's font point size, replacing any size above 30 with 11.
     *
     * @param paperToken the token
     * @return the (possibly replaced) font size
     */
    public static float getFixedFont(PaperToken paperToken) {
        float ptSize = paperToken.getPdfToken().fontMetrics.ptSize;
        return ptSize > 30.0f ? 11.0f : ptSize;
    }

    /**
     * @return the natural log of {@code f - f2}, with the difference clamped
     *         from below at 1e-5 so the logarithm is always defined
     */
    public double logYDelt(float f, float f2) {
        return Math.log(Math.max(f - f2, 1.0E-5f));
    }

    /**
     * Builds one feature map per element of the sequence.
     *
     * <p>Element 0 only gets {@code <S>}, the last element only gets
     * {@code </S>}; every element in between gets the full token feature set.
     * Sequences with fewer than two elements are handled without throwing.
     *
     * @param list the token sequence, including start and stop elements
     * @return a list of feature maps, index-aligned with {@code list}
     */
    public List<ObjectDoubleMap<String>> nodePredicates(List<PaperToken> list) {
        int size = list.size();
        ArrayList<ObjectDoubleMap<String>> out = new ArrayList<ObjectDoubleMap<String>>();

        // Normalisation ranges are computed over the real tokens only (start/stop
        // excluded). With fewer than three elements there are none, and taking
        // subList(1, size - 1) on a 0- or 1-element list would throw.
        List<PaperToken> interior = size > 2 ? list.subList(1, size - 1) : new ArrayList<PaperToken>();
        Pair<Float, Float> heightRange = PDFPredicateExtractor.getExtrema(interior, token -> PDFPredicateExtractor.height(token.getPdfToken()));
        Pair<Float, Float> fontRange = PDFPredicateExtractor.getExtrema(interior, token -> PDFPredicateExtractor.getFixedFont(token));

        for (int i = 0; i < size; ++i) {
            ObjectDoubleHashMap<String> features = new ObjectDoubleHashMap<String>();
            if (i == 0) {
                features.put("<S>", 1.0);
            } else if (i == size - 1) {
                features.put("</S>", 1.0);
            } else {
                this.addTokenFeatures(list, i, heightRange, fontRange, features);
            }
            out.add(features);
        }

        if (log.isDebugEnabled()) {
            this.logDebugDump(out);
        }
        return out;
    }

    /** Fills {@code features} with all predicates for the interior token at index {@code i}. */
    private void addTokenFeatures(List<PaperToken> list, int i, Pair<Float, Float> heightRange, Pair<Float, Float> fontRange, ObjectDoubleHashMap<String> features) {
        PaperToken current = list.get(i);

        // Sentinels used when there is no real neighbour on that side.
        float prevFont = -10.0f;
        float nextFont = -10.0f;
        float prevHeight = -10.0f;
        float nextHeight = -10.0f;
        float prevBottom = 0.0f;
        int prevLine = -1;
        int nextLine = -1;
        final float nextTop;

        if (i != 1) {
            PaperToken prev = list.get(i - 1);
            prevLine = prev.getLine();
            prevFont = PDFPredicateExtractor.getFixedFont(prev);
            prevHeight = PDFPredicateExtractor.height(prev.getPdfToken());
            prevBottom = PDFPredicateExtractor.getY(prev, false);
        }
        if (i != list.size() - 2) {
            PaperToken next = list.get(i + 1);
            nextLine = next.getLine();
            nextFont = PDFPredicateExtractor.getFixedFont(next);
            nextHeight = PDFPredicateExtractor.height(next.getPdfToken());
            nextTop = PDFPredicateExtractor.getY(next, true);
        } else {
            // Last real token: pretend the next line starts one token-height below.
            nextTop = PDFPredicateExtractor.getY(current, false) + PDFPredicateExtractor.height(current.getPdfToken());
        }

        float font = PDFPredicateExtractor.getFixedFont(current);
        float tokenHeight = PDFPredicateExtractor.height(current.getPdfToken());
        int line = current.getLine();

        // Font / line / height changes relative to the neighbours.
        if (font != prevFont) {
            features.put("%fcb", 1.0);
        }
        if (font != nextFont) {
            features.put("%fcf", 1.0);
        }
        if (line != prevLine) {
            features.put("%lcb", 1.0);
            features.put("%hGapB", this.logYDelt(PDFPredicateExtractor.getY(current, true), prevBottom));
        }
        if (line != nextLine) {
            features.put("%lcf", 1.0);
            features.put("%hGapF", this.logYDelt(nextTop, PDFPredicateExtractor.getY(current, false)));
        }
        if ((double)Math.abs(Math.abs(nextHeight - tokenHeight) / Math.abs(nextHeight + tokenHeight)) > 0.1) {
            features.put("%hcf", 1.0);
        }
        if ((double)Math.abs(Math.abs(prevHeight - tokenHeight) / Math.abs(prevHeight + tokenHeight)) > 0.1) {
            features.put("%hcb", 1.0);
        }

        // Absolute layout features, normalised over the whole sequence.
        features.put("%font", (double)PDFPredicateExtractor.linearNormalize(font, fontRange));
        features.put("%line", Math.min((double)line, 10.0));
        features.put("%h", (double)PDFPredicateExtractor.linearNormalize(tokenHeight, heightRange));

        // Surface-form features.
        String token = current.getPdfToken().token;
        for (String mask : PDFPredicateExtractor.getCaseMasks(token)) {
            features.put(mask, 1.0);
        }
        if (PDFPredicateExtractor.isStopWord(token)) {
            features.put("%stop", 1.0);
            // NOTE(review): getCaseMasks never emits "%XXX", so only the "%Xxx"
            // test can succeed here. Kept as-is to leave the feature set unchanged.
            if (line != prevLine && (features.containsKey("%XXX") || features.containsKey("%Xxx"))) {
                features.put("%startCapStop", 1.0);
            }
        } else if (features.containsKey("%xxx")) {
            features.put("%uncapns", 1.0);
        }

        double adjLen = Math.min((double)token.length(), 10.0) / 10.0;
        features.put("%adjLen", adjLen);
        features.put("%adjLenSq", (adjLen - 0.5) * (adjLen - 0.5));
        if (line <= 2) {
            features.put("%first3lines", 1.0);
        }

        if (this.lmFeats != null) {
            this.addLanguageModelFeatures(token, features);
        }

        String normalized = StringUtils.normalize(current.getPdfToken().token);
        features.put("%t=" + normalized, 1.0);
        if (normalized.equals("and") || normalized.equals(",")) {
            features.put("%and", 1.0);
        }
        PDFPredicateExtractor.addTrigramFeatures(normalized, features);
        this.addWordEmbeddingFeatures(token, features);
    }

    /** Adds the smoothed title/author/background frequency features; requires {@code lmFeats != null}. */
    private void addLanguageModelFeatures(String token, ObjectDoubleHashMap<String> features) {
        features.put("%tfreq", PDFPredicateExtractor.smoothFreq(token, this.lmFeats.titleBow));
        features.put("%tffreq", PDFPredicateExtractor.smoothFreq(token, this.lmFeats.titleFirstBow));
        features.put("%tlfreq", PDFPredicateExtractor.smoothFreq(token, this.lmFeats.titleLastBow));
        features.put("%afreq", PDFPredicateExtractor.smoothFreq(Parser.fixupAuthors(token), this.lmFeats.authorBow));
        features.put("%affreq", PDFPredicateExtractor.smoothFreq(Parser.fixupAuthors(token), this.lmFeats.authorFirstBow));
        features.put("%alfreq", PDFPredicateExtractor.smoothFreq(Parser.fixupAuthors(token), this.lmFeats.authorLastBow));
        features.put("%bfreq", PDFPredicateExtractor.smoothFreq(token, this.lmFeats.backgroundBow));
        features.put("%bafreq", PDFPredicateExtractor.smoothFreq(Parser.fixupAuthors(token), this.lmFeats.backgroundBow));
    }

    /** Counts every character trigram of {@code normalized + "$"} under {@code %tri=<trigram>}. */
    private static void addTrigramFeatures(String normalized, ObjectDoubleHashMap<String> features) {
        String terminated = normalized + "$";
        for (int start = 0; start <= terminated.length() - 3; ++start) {
            String key = "%tri=" + terminated.substring(start, start + 3);
            features.updateValue(key, 0.0, count -> count + 1.0);
        }
    }

    /** Adds one {@code %embNNN} feature per embedding dimension, if the word is known to word2vec. */
    private void addWordEmbeddingFeatures(String token, ObjectDoubleHashMap<String> features) {
        try {
            int dim = 0;
            for (double value : this.word2vecSearcher.getRawVector(token)) {
                features.put(wordEmbeddingFeatureNames[dim], value);
                ++dim;
            }
        }
        catch (Searcher.UnknownWordException ignored) {
            // Deliberately ignored: a word without an embedding simply gets no
            // embedding features.
        }
    }

    /** Writes the token sequence and its featurized form to the debug log. */
    private void logDebugDump(ArrayList<ObjectDoubleMap<String>> nodeFeatures) {
        String tokens = nodeFeatures.stream()
            .map(featureMap -> featureMap.keysView()
                .select(key -> key.startsWith("%t="))
                .collect(key -> key.substring(3))
                .makeString("-"))
            .collect(Collectors.joining(" "));
        // The hash of the token string serves as a short id tying the log lines together.
        String header = String.format("%x", tokens.hashCode());
        log.debug("{} CRF Input for {}", header, tokens);
        PrintFeaturizedCRFInput.stringsFromFeaturizedSeq(nodeFeatures, header).stream().forEach(line -> log.debug(line));
    }

    /**
     * Builds one feature map per adjacent pair of elements; each map holds only
     * the constant feature {@code B}.
     *
     * @param list the token sequence
     * @return {@code max(list.size() - 1, 0)} feature maps
     */
    public List<ObjectDoubleMap<String>> edgePredicates(List<PaperToken> list) {
        ArrayList<ObjectDoubleMap<String>> out = new ArrayList<ObjectDoubleMap<String>>();
        for (int i = 0; i < list.size() - 1; ++i) {
            ObjectDoubleHashMap<String> edge = new ObjectDoubleHashMap<String>();
            edge.put("B", 1.0);
            out.add(edge);
        }
        return out;
    }

    /** Extracts a single numeric property from a token. */
    private static interface TokenPropertySelector {
        public float getProp(PaperToken var1);
    }
}

