/*
 * Decompiled with CFR 0.152.
 */
package org.corpus_tools.salt.common.tokenizer;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Range;
import com.google.common.collect.TreeRangeMap;
import com.neovisionaries.i18n.LanguageCode;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.corpus_tools.salt.SaltFactory;
import org.corpus_tools.salt.common.SDocumentGraph;
import org.corpus_tools.salt.common.SSpan;
import org.corpus_tools.salt.common.SSpanningRelation;
import org.corpus_tools.salt.common.STextualDS;
import org.corpus_tools.salt.common.STextualRelation;
import org.corpus_tools.salt.common.SToken;
import org.corpus_tools.salt.common.tokenizer.AbbreviationDE;
import org.corpus_tools.salt.common.tokenizer.AbbreviationEN;
import org.corpus_tools.salt.common.tokenizer.AbbreviationFR;
import org.corpus_tools.salt.common.tokenizer.AbbreviationIT;
import org.corpus_tools.salt.core.SAnnotation;
import org.corpus_tools.salt.core.SRelation;
import org.corpus_tools.salt.exceptions.SaltTokenizerException;
import org.corpus_tools.salt.graph.Node;
import org.corpus_tools.salt.util.DataSourceSequence;
import org.knallgrau.utils.textcat.TextCategorizer;

/**
 * Rule-based tokenizer that splits the text of an {@link STextualDS} into
 * {@link SToken} nodes of an {@link SDocumentGraph}.
 *
 * <p>The tokenizer first produces plain string tokens (see
 * {@link #tokenizeToString(String, LanguageCode)}) and then aligns them with the
 * original text to create the token nodes (see
 * {@link #tokenizeToToken(STextualDS, LanguageCode, Integer, Integer)}).
 *
 * <p>Instances carry mutable state (document graph, abbreviation lists and the
 * clitic patterns of the last tokenized language) and perform no synchronization
 * themselves.
 */
public class Tokenizer {
    /** Regex character-class content: punctuation that is cut off from the beginning of a token. */
    protected static final String P_CHAR = "\\[\\{\\(\u00b4`\"\u00bb\u00ab\u201a\u201e\u2020\u2021\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u203a";
    /** Regex character-class content: punctuation that is cut off from the end of a token. */
    protected static final String F_CHAR = "\\]\\}'`\"\\),;:!\\?%\u00bb\u00ab\u201a\u201e\u2026\u2020\u2021\u2030\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u203a";

    /** Three consecutive periods; they get surrounded by blanks before splitting. */
    private static final Pattern ELLIPSIS = Pattern.compile("\\.\\.\\.");
    /** A ';', '!' or '?' directly followed by a non-blank character. */
    private static final Pattern GLUED_PUNCTUATION = Pattern.compile("([;\\!\\?])([^ ])");
    /** Any run of whitespace; collapsed to a single blank. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    /** One leading punctuation character followed by the rest of the token. */
    private static final Pattern LEADING_PUNCTUATION = Pattern.compile("^([" + P_CHAR + "])(.+)");
    /** The token body followed by one trailing punctuation character. */
    private static final Pattern TRAILING_PUNCTUATION = Pattern.compile("^(.+)([" + F_CHAR + "])$");
    /** A trailing period that directly follows a trailing punctuation character. */
    private static final Pattern PERIOD_AFTER_PUNCTUATION = Pattern.compile("^(.+[" + F_CHAR + "])(\\.)$");
    /** Abbreviations made only of single letters each followed by a period, e.g. "A." or "U.S.A.". */
    private static final Pattern LETTER_ABBREVIATION = Pattern.compile("^([A-Za-z\u00c1\u00c2\u00c3\u00c8\u00fd\u00ae\u00d0\u00d7\u00dd\u00de\u00cd\u00f0\u00ce\u00d3\u00d4\u00d5\u00d8\u00d9\u00e3\u00f5\u0161\u203a\u20ac\u00df\u201a\u0192\u201e\u2021\u02c6\u2030\u0160\u2039\u0152\u008d\u017d\u008f\u00f8\u007f\u0178\u0015\u00f7\u00b7\u201d\u201c\u2019]\\.)+$");
    /** Any token that ends with a period. */
    private static final Pattern TRAILING_PERIOD = Pattern.compile("^(.+)(\\.)$");
    /** Tokens whose final period must stay attached: an ellipsis or digits followed by a period. */
    private static final Pattern ELLIPSIS_OR_ORDINAL = Pattern.compile("^(\\.\\.\\.|[0-9]+\\.)$");

    private SDocumentGraph documentGraph = null;
    private Map<LanguageCode, HashSet<String>> abbreviations = null;
    /** Regex alternation of clitics attached to the front of a word; empty when the language has none. */
    private String PClitic = "";
    /** Regex alternation of clitics attached to the end of a word; empty when the language has none. */
    private String FClitic = "";

    /**
     * Sets the document graph that receives the created tokens.
     *
     * @param sDocumentGraph the graph to add tokens to
     */
    public void setsDocumentGraph(SDocumentGraph sDocumentGraph) {
        this.documentGraph = sDocumentGraph;
    }

    /**
     * Returns the document graph that receives the created tokens.
     *
     * @return the graph, or {@code null} if none has been set yet
     */
    public SDocumentGraph getDocumentGraph() {
        return this.documentGraph;
    }

    /**
     * Tokenizes the entire text of the passed data source and detects the language automatically.
     *
     * @param sTextualDSs the textual data source to tokenize
     * @return the created tokens, see {@link #tokenize(STextualDS, LanguageCode, Integer, Integer)}
     */
    public List<SToken> tokenize(STextualDS sTextualDSs) {
        return this.tokenize(sTextualDSs, null);
    }

    /**
     * Tokenizes the entire text of the passed data source.
     *
     * @param sTextualDSs the textual data source to tokenize
     * @param language the language of the text, or {@code null} to detect it
     * @return the created tokens, see {@link #tokenize(STextualDS, LanguageCode, Integer, Integer)}
     */
    public List<SToken> tokenize(STextualDS sTextualDSs, LanguageCode language) {
        return this.tokenize(sTextualDSs, language, null, null);
    }

    /**
     * Tokenizes the text of the passed data source between {@code startPos} and {@code endPos}.
     *
     * <p>If no document graph has been set, the graph owning {@code sTextualDS} is used. For German,
     * English, French and Italian the bundled abbreviation lists are registered on first use.
     *
     * @param sTextualDS the textual data source to tokenize
     * @param language the language of the text, or {@code null} to detect it from the text
     * @param startPos start offset in the text (inclusive), or {@code null} for 0
     * @param endPos end offset in the text (exclusive), or {@code null} for the text length
     * @return the created tokens, or {@code null} if the data source has no text or no token was created
     * @throws SaltTokenizerException if {@code sTextualDS} is {@code null} or no document graph can be determined
     */
    public List<SToken> tokenize(STextualDS sTextualDS, LanguageCode language, Integer startPos, Integer endPos) {
        if (sTextualDS == null) {
            throw new SaltTokenizerException("Cannot tokenize an empty 'SSTextualDS' object.");
        }
        if (this.getDocumentGraph() == null) {
            if (sTextualDS.getGraph() == null) {
                throw new SaltTokenizerException("Cannot add tokens to an empty SDocumentGraph object and can not estimate SDocumentGraph, because STextualDS does not belong to a SDocumentGraph object.");
            }
            this.setsDocumentGraph((SDocumentGraph) sTextualDS.getGraph());
        }
        final String text = sTextualDS.getText();
        if (text == null) {
            return null;
        }
        if (startPos == null) {
            startPos = 0;
        }
        if (endPos == null) {
            endPos = text.length();
        }
        if (language == null) {
            // Detection is deterministic for a given text, so a single attempt is sufficient.
            language = Tokenizer.checkLanguage(text.substring(startPos, endPos));
        }
        if (language != null && this.getAbbreviations(language) == null) {
            if (LanguageCode.de.equals(language)) {
                this.addAbbreviation(LanguageCode.de, AbbreviationDE.createAbbriviations());
            } else if (LanguageCode.en.equals(language)) {
                this.addAbbreviation(LanguageCode.en, AbbreviationEN.createAbbriviations());
            } else if (LanguageCode.fr.equals(language)) {
                this.addAbbreviation(LanguageCode.fr, AbbreviationFR.createAbbriviations());
            } else if (LanguageCode.it.equals(language)) {
                this.addAbbreviation(LanguageCode.it, AbbreviationIT.createAbbriviations());
            }
        }
        this.setClitics(language);
        return this.tokenizeToToken(sTextualDS, language, startPos, endPos);
    }

    /**
     * Guesses the language of the passed text with a {@link TextCategorizer}.
     *
     * @param text the text to analyze
     * @return the detected language, or {@code null} if {@code text} is {@code null} or the
     *         detected category has no mapping in {@link #mapISOLanguageCode(String)}
     */
    public static LanguageCode checkLanguage(String text) {
        if (text == null) {
            return null;
        }
        final TextCategorizer categorizer = new TextCategorizer();
        return Tokenizer.mapISOLanguageCode(categorizer.categorize(text));
    }

    /**
     * Maps a lower-case English language name (e.g. {@code "german"}) to its {@link LanguageCode}.
     *
     * @param language the language name
     * @return the matching code, or {@code null} if the name is {@code null} or not supported
     */
    public static LanguageCode mapISOLanguageCode(String language) {
        if (language == null) {
            return null;
        }
        switch (language) {
            case "german":
                return LanguageCode.de;
            case "english":
                return LanguageCode.en;
            case "french":
                return LanguageCode.fr;
            case "spanish":
                return LanguageCode.es;
            case "italian":
                return LanguageCode.it;
            case "swedish":
                return LanguageCode.sv;
            case "polish":
                return LanguageCode.pl;
            case "dutch":
                return LanguageCode.nl;
            case "norwegian":
                return LanguageCode.no;
            case "finnish":
                return LanguageCode.fi;
            case "albanian":
                return LanguageCode.sq;
            case "slovakian":
                return LanguageCode.sk;
            case "slovenian":
                return LanguageCode.sl;
            case "danish":
                return LanguageCode.da;
            case "hungarian":
                return LanguageCode.hu;
            default:
                return null;
        }
    }

    /**
     * Registers a set of abbreviations for a language. An already registered set for the same
     * language is kept; the call is ignored in that case and when an argument is {@code null}.
     *
     * @param language the language the abbreviations belong to
     * @param abbreviations the abbreviations, each including its trailing period if it has one
     */
    public void addAbbreviation(LanguageCode language, HashSet<String> abbreviations) {
        if (language == null || abbreviations == null) {
            return;
        }
        if (this.abbreviations == null) {
            this.abbreviations = new ConcurrentHashMap<LanguageCode, HashSet<String>>();
        }
        if (!this.abbreviations.containsKey(language)) {
            this.abbreviations.put(language, abbreviations);
        }
    }

    /**
     * Reads abbreviations from a UTF-8 encoded file (one abbreviation per line) and registers
     * them via {@link #addAbbreviation(LanguageCode, HashSet)}.
     *
     * @param language the language the abbreviations belong to
     * @param abbreviationFile the file to read
     * @throws SaltTokenizerException if the file does not exist or cannot be read
     */
    public void addAbbreviation(LanguageCode language, File abbreviationFile) {
        final HashSet<String> readAbbreviations = new HashSet<String>();
        // try-with-resources closes the reader; no explicit close() is needed.
        try (BufferedReader inReader = new BufferedReader(new InputStreamReader((InputStream) new FileInputStream(abbreviationFile), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = inReader.readLine()) != null) {
                readAbbreviations.add(line);
            }
        } catch (FileNotFoundException e) {
            throw new SaltTokenizerException("Cannot tokenize the given text, because the file for abbreviation '" + abbreviationFile.getAbsolutePath() + "' was not found.");
        } catch (IOException e) {
            throw new SaltTokenizerException("Cannot tokenize the given text, because can not read file '" + abbreviationFile.getAbsolutePath() + "'.");
        }
        this.addAbbreviation(language, readAbbreviations);
    }

    /**
     * Returns the abbreviations registered for a language.
     *
     * @param language the language to look up
     * @return the registered set, or {@code null} if {@code language} is {@code null} or nothing is registered
     */
    public HashSet<String> getAbbreviations(LanguageCode language) {
        if (language == null) {
            return null;
        }
        if (this.abbreviations == null) {
            this.abbreviations = new ConcurrentHashMap<LanguageCode, HashSet<String>>();
        }
        return this.abbreviations.get(language);
    }

    /**
     * Selects the clitic patterns for the passed language. Languages without an entry here
     * (and {@code null}) get no clitic handling.
     */
    private void setClitics(LanguageCode language) {
        // Reset first so that clitics of a previously tokenized language do not leak into this run.
        this.PClitic = "";
        this.FClitic = "";
        if (LanguageCode.en.equals(language)) {
            this.FClitic = "('(s|re|ve|d|m|em|ll)|n't)";
        } else if (LanguageCode.fr.equals(language)) {
            this.PClitic = "([dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu')";
            this.FClitic = "(-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-m\u00eames?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l\u00e0)";
        } else if (LanguageCode.it.equals(language)) {
            // These are Italian elisions (dell', nell', all', ...), so they belong to 'it'.
            this.PClitic = "([dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt')";
        }
    }

    /**
     * Tokenizes the text range and creates one {@link SToken} per string token in the document graph.
     *
     * <p>Tokens that already exist in the graph are reconciled afterwards: if an old token starts a
     * range in which exactly one new token begins, that new token is removed again; if several new
     * tokens begin inside an old token, the old token is replaced by a span over the new tokens which
     * takes over its annotations and relations.
     *
     * @param sTextualDS the textual data source to tokenize
     * @param language the language used for abbreviation lookup, may be {@code null}
     * @param startPos start offset in the text (inclusive), must not be {@code null}
     * @param endPos end offset in the text (exclusive), must not be {@code null}
     * @return the created tokens in text order (including any that were removed again during
     *         reconciliation), or {@code null} if none was created
     * @throws SaltTokenizerException if no document graph is set
     */
    public List<SToken> tokenizeToToken(STextualDS sTextualDS, LanguageCode language, Integer startPos, Integer endPos) {
        final SDocumentGraph graph = this.getDocumentGraph();
        if (graph == null) {
            throw new SaltTokenizerException("Cannot add tokens to an empty SDocumentGraph object.");
        }
        final String strInput = sTextualDS.getText().substring(startPos, endPos);
        final List<String> strTokens = this.tokenizeToString(strInput, language);
        if (strTokens.isEmpty()) {
            return null;
        }

        final TreeRangeMap<Integer, SToken> oldTokens = this.indexOldTokens(graph, sTextualDS, startPos, endPos);
        final ArrayListMultimap<SToken, SToken> old2newToken = ArrayListMultimap.create();

        List<SToken> retVal = null;
        int tokenIndex = 0;
        for (int i = 0; i < strInput.length(); ++i) {
            final String strToken = strTokens.get(tokenIndex);
            // Exact, bounds-safe comparison of the token with the text at offset i.
            if (!strInput.startsWith(strToken, i)) {
                continue;
            }
            final int start = i + startPos;
            final int end = start + strToken.length();
            final SToken sTok = graph.createToken(sTextualDS, start, end);
            if (retVal == null) {
                retVal = new ArrayList<SToken>();
            }
            retVal.add(sTok);

            // Remember which old token covers the new one. This has to happen before the loop can
            // terminate, otherwise the last new token would never be reconciled.
            if (oldTokens != null) {
                final SToken oldToken = oldTokens.get(start);
                if (oldToken != null) {
                    old2newToken.put(oldToken, sTok);
                }
            }

            // Jump behind the matched token; the loop increment adds the remaining step.
            i += strToken.length() - 1;
            if (++tokenIndex >= strTokens.size()) {
                break;
            }
        }

        for (SToken oldToken : old2newToken.keySet()) {
            this.replaceOldToken(graph, oldToken, new ArrayList<SToken>(old2newToken.get(oldToken)));
        }
        return retVal;
    }

    /**
     * Builds a lookup from text offsets to the tokens that existed before tokenizing.
     *
     * @return the lookup, or {@code null} if the affected range or the graph holds no tokens yet
     */
    private TreeRangeMap<Integer, SToken> indexOldTokens(SDocumentGraph graph, STextualDS sTextualDS, Integer startPos, Integer endPos) {
        final List<SToken> tokens;
        if (startPos.intValue() != 0 || endPos.intValue() != sTextualDS.getText().length() || graph.getTextualDSs().size() > 1) {
            final DataSourceSequence<Integer> sequence = new DataSourceSequence<Integer>();
            sequence.setDataSource(sTextualDS);
            sequence.setStart(startPos);
            sequence.setEnd(endPos);
            tokens = graph.getTokensBySequence(sequence);
        } else {
            tokens = graph.getTokens();
        }
        if (tokens == null || tokens.size() == 0 || graph.getTextualRelations() == null || graph.getTextualRelations().size() == 0) {
            return null;
        }
        final TreeRangeMap<Integer, SToken> oldTokens = TreeRangeMap.create();
        // NOTE(review): this indexes the textual relations of all texts in the graph, not only
        // those pointing to sTextualDS - confirm whether that is intended for multi-text documents.
        for (STextualRelation rel : graph.getTextualRelations()) {
            // Token ends are exclusive, so a half-open range keeps the first offset behind a
            // token from being attributed to it.
            oldTokens.put(Range.closedOpen(rel.getStart(), rel.getEnd()), rel.getSource());
        }
        return oldTokens;
    }

    /**
     * Reconciles one pre-existing token with the new tokens that start inside of it.
     *
     * <p>With exactly one new token the new token is removed from the graph. Otherwise a span over
     * the new tokens is created, receives the annotations of the old token and takes its place in
     * all relations except textual ones; finally the old token is removed.
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // relations are handled generically, independent of their node types
    private void replaceOldToken(SDocumentGraph graph, SToken oldToken, List<SToken> overlappedTokens) {
        if (overlappedTokens.size() == 1) {
            graph.removeNode(overlappedTokens.get(0));
            return;
        }
        final SSpan span = graph.createSpan(overlappedTokens);
        for (SAnnotation sAnno : oldToken.getAnnotations()) {
            span.addAnnotation(sAnno);
        }

        // Copy the relations first, because they are re-targeted or removed while being processed.
        final List<SRelation> inRels = new ArrayList<SRelation>();
        for (SRelation rel : graph.getInRelations(oldToken.getId())) {
            inRels.add(rel);
        }
        for (SRelation inRel : inRels) {
            if (!(inRel instanceof SSpanningRelation)) {
                inRel.setTarget(span);
                continue;
            }
            if (!(inRel.getSource() instanceof SSpan)) {
                continue;
            }
            // A span that covered the old token now has to cover each of the new tokens.
            final SSpan parentSpan = (SSpan) inRel.getSource();
            graph.removeRelation(inRel);
            for (SToken overlappedToken : overlappedTokens) {
                final SSpanningRelation rel = SaltFactory.createSSpanningRelation();
                rel.setSource(parentSpan);
                rel.setTarget(overlappedToken);
                graph.addRelation(rel);
            }
        }

        final List<SRelation> outRels = new ArrayList<SRelation>();
        for (SRelation outRel : graph.getOutRelations(oldToken.getId())) {
            if (!(outRel instanceof STextualRelation)) {
                outRels.add(outRel);
            }
        }
        for (SRelation outRel : outRels) {
            outRel.setSource(span);
        }
        graph.removeNode(oldToken);
    }

    /**
     * Splits a text into string tokens.
     *
     * <p>The text is split at whitespace; afterwards leading and trailing punctuation, sentence
     * final periods and clitics are separated. Periods stay attached to registered abbreviations,
     * to sequences of single letters with periods, to an ellipsis and to digits followed by a
     * period. Clitics are those selected by the most recent call of one of the {@code tokenize}
     * methods.
     *
     * @param strInput the text to tokenize, must not be {@code null}
     * @param language the language used to look up abbreviations, may be {@code null}
     * @return the tokens in text order; for a blank text the list holds one empty string
     */
    public List<String> tokenizeToString(String strInput, LanguageCode language) {
        String normalized = ELLIPSIS.matcher(strInput).replaceAll(" ... ");
        normalized = GLUED_PUNCTUATION.matcher(normalized).replaceAll("$1 $2");
        normalized = WHITESPACE_RUN.matcher(normalized).replaceAll(" ").trim();
        final List<String> lstTokens = new ArrayList<String>(Arrays.asList(normalized.split(" ")));

        // Everything below is invariant for the whole run, so it is prepared once.
        final HashSet<String> knownAbbreviations = this.getAbbreviations(language);
        final Pattern proclitic = this.PClitic.isEmpty() ? null : Pattern.compile("^(" + this.PClitic + ")(.+)$");
        final Pattern enclitic = this.FClitic.isEmpty() ? null : Pattern.compile("(.+)(" + this.FClitic + ")$");

        for (int i = 0; i < lstTokens.size(); ++i) {
            final String token = lstTokens.get(i);

            Matcher m = LEADING_PUNCTUATION.matcher(token);
            if (m.find()) {
                Tokenizer.splitToken(lstTokens, i, m.group(1), m.group(2));
                // The loop increment moves on to the remainder.
                continue;
            }

            m = TRAILING_PUNCTUATION.matcher(token);
            if (m.find()) {
                Tokenizer.splitToken(lstTokens, i, m.group(1), m.group(2));
                // Visit the shortened token at the same index again.
                --i;
                continue;
            }

            m = PERIOD_AFTER_PUNCTUATION.matcher(token);
            if (m.find()) {
                Tokenizer.splitToken(lstTokens, i, m.group(1), m.group(2));
                --i;
                continue;
            }

            final boolean listedAbbreviation = knownAbbreviations != null && knownAbbreviations.contains(token);
            if (listedAbbreviation || LETTER_ABBREVIATION.matcher(token).find()) {
                continue;
            }

            m = TRAILING_PERIOD.matcher(token);
            if (m.find() && !ELLIPSIS_OR_ORDINAL.matcher(token).find()) {
                Tokenizer.splitToken(lstTokens, i, m.group(1), m.group(2));
                // Skip both parts.
                ++i;
                continue;
            }

            if (proclitic != null) {
                m = proclitic.matcher(token);
                if (m.find()) {
                    // The clitic alternation may contain groups of its own, therefore the
                    // remainder of the word is always the last group of the pattern.
                    Tokenizer.splitToken(lstTokens, i, m.group(1), m.group(m.groupCount()));
                    continue;
                }
            }

            if (enclitic != null) {
                m = enclitic.matcher(token);
                if (m.find()) {
                    Tokenizer.splitToken(lstTokens, i, m.group(1), m.group(2));
                    // Skip both parts.
                    ++i;
                }
            }
        }
        return lstTokens;
    }

    /** Replaces the token at {@code index} by the two parts {@code head} and {@code tail}, in that order. */
    private static void splitToken(List<String> tokens, int index, String head, String tail) {
        tokens.set(index, head);
        tokens.add(index + 1, tail);
    }
}

