package com.ibm.avatar.algebra.util.dict;

import com.ibm.avatar.algebra.function.scalar.GetCol;
import com.ibm.avatar.algebra.util.dict.DictParams;
import com.ibm.avatar.algebra.util.file.FileUtils;
import com.ibm.avatar.algebra.util.lang.LangCode;
import com.ibm.avatar.algebra.util.lang.LanguageSet;
import com.ibm.avatar.algebra.util.string.Escaper;
import com.ibm.avatar.algebra.util.string.StringUtils;
import com.ibm.avatar.algebra.util.tokenize.BaseOffsetsList;
import com.ibm.avatar.algebra.util.tokenize.DerivedOffsetsList;
import com.ibm.avatar.algebra.util.tokenize.Tokenizer;
import com.ibm.avatar.api.exceptions.InvalidDictionaryFileFormatException;
import com.ibm.avatar.logging.Log;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/* loaded from: input_file:com/ibm/avatar/algebra/util/dict/DictFile.class */
public class DictFile {
    /** Flag value 0; presumably denotes case-insensitive matching (not referenced in this file). */
    public static final int CASE_INSENSITIVE = 0;

    /** Flag value 1; presumably denotes case-sensitive matching (not referenced in this file). */
    public static final int CASE_SENSITIVE = 1;

    /** Name of the character encoding in which dictionary files are read. */
    private static final String DICT_ENCODING = "UTF-8";

    /**
     * Values that cleanEntries() strips from the entry list. Populated by the static
     * initializer of this class.
     */
    private static final List<String> EMPTY_STRING_LIST = new ArrayList<>();

    /** Matching parameters of this dictionary; also the source of the dictionary name. */
    private DictParams params;

    /** Dictionary entries, trimmed by cleanEntries(). */
    private List<String> entries;

    /** Per-entry matching parameters; every constructor currently sets this to null. */
    private final List<PerEntryParam> perEntryParams;

    /** Shared escaper used both to read (de-escape) and to dump (escape) dictionary entries. */
    private static Escaper escaper;

    /**
     * Separator placed between consecutive tokens of a canonicalized entry
     * (code point U+2504, decimal 9476).
     */
    public static final char TOKEN_DELIM = '\u2504';

    /**
     * Creates a dictionary from an in-memory list of entries.
     *
     * <p>The given list is stored by reference and then cleaned in place (each entry is
     * trimmed and values listed in EMPTY_STRING_LIST are removed), so it must be mutable.
     *
     * @param list the dictionary entries
     * @param dictParams matching parameters for the dictionary
     * @param list2 per-entry matching parameters; must be {@code null}
     * @throws UnsupportedOperationException if {@code list2} is not {@code null}
     * @throws Exception declared for callers; nothing in this body throws a checked exception
     */
    public DictFile(List<String> list, DictParams dictParams, List<PerEntryParam> list2) throws Exception {
        // Reject the unsupported argument up front, before touching any state.
        if (list2 != null) {
            throw new UnsupportedOperationException("Providing per entry matching parameter is currently not implemented");
        }
        this.entries = list;
        this.params = dictParams;
        this.perEntryParams = null;
        cleanEntries();
    }

    /**
     * Creates a dictionary by reading entries from a stream.
     *
     * <p>The stream is not closed by this constructor.
     *
     * @param inputStream source of the dictionary text, one entry per line
     * @param dictParams matching parameters for the dictionary
     * @throws IOException if reading from the stream fails
     * @throws InvalidDictionaryFileFormatException if a line cannot be de-escaped
     */
    public DictFile(InputStream inputStream, DictParams dictParams) throws IOException, InvalidDictionaryFileFormatException {
        initDict(inputStream, dictParams);
        this.perEntryParams = null;
    }

    /**
     * Creates a dictionary by reading entries from a file.
     *
     * <p>The file is always closed before this constructor returns or throws.
     *
     * @param file the dictionary file to read
     * @param dictParams matching parameters; when {@code null}, fresh parameters are created
     *     and the dictionary is named after the file
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidDictionaryFileFormatException if the file content cannot be parsed
     */
    public DictFile(File file, DictParams dictParams) throws IOException, InvalidDictionaryFileFormatException {
        this.perEntryParams = null;

        DictParams effectiveParams = dictParams;
        if (effectiveParams == null) {
            effectiveParams = new DictParams();
            effectiveParams.setDictName(file.getName());
        }

        // try-with-resources closes the stream on every path; previously it leaked
        // whenever initDict() threw.
        try (FileInputStream in = new FileInputStream(file)) {
            initDict(in, effectiveParams);
        } catch (IllegalArgumentException e) {
            throw new InvalidDictionaryFileFormatException(e, "An error occurred while parsing dictionary entries from the file: '%s'. Specify the dictionary file in the format as described in the Information Center.", file.getCanonicalPath());
        }
    }

    /**
     * Creates a dictionary from a file using default parameters; the dictionary is named
     * after the file.
     *
     * @param file the dictionary file to read
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidDictionaryFileFormatException if the file content cannot be parsed
     */
    public DictFile(File file) throws IOException, InvalidDictionaryFileFormatException {
        this(file, null);
    }

    /**
     * Creates a dictionary from the file at the given location, using default parameters.
     *
     * @param str location of the dictionary file; it is resolved through
     *     FileUtils.createValidatedFile (presumably a path that is validated before use;
     *     confirm against that helper)
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidDictionaryFileFormatException if the file content cannot be parsed
     */
    public DictFile(String str) throws IOException, InvalidDictionaryFileFormatException {
        this(FileUtils.createValidatedFile(str));
    }

    /**
     * Reads dictionary entries from a UTF-8 stream, one entry per line.
     *
     * <p>Each line is de-escaped with comment removal enabled; lines that are empty after
     * de-escaping are skipped. The collected entries are then passed through cleanEntries().
     * The stream is not closed here; it stays owned by the caller.
     *
     * @param inputStream source of the dictionary text
     * @param dictParams parameters to associate with this dictionary
     * @throws IOException if reading from the stream fails
     * @throws InvalidDictionaryFileFormatException if a line cannot be de-escaped
     */
    private void initDict(InputStream inputStream, DictParams dictParams) throws IOException, InvalidDictionaryFileFormatException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(inputStream), StandardCharsets.UTF_8));
        this.params = dictParams;
        this.entries = new ArrayList<>();
        escaper.setRemoveComments(true);

        int lineNumber = 0;
        String rawLine;
        // Loop until readLine() reports end of stream. ready() only says whether a read
        // would block, so using it as the loop condition could silently truncate input.
        while ((rawLine = reader.readLine()) != null) {
            lineNumber++;
            try {
                String entry = escaper.deEscapeStr(rawLine);
                if (entry.length() != 0) {
                    this.entries.add(entry);
                }
            } catch (Exception e) {
                throw new InvalidDictionaryFileFormatException(e, "An error occurred while parsing and de-escaping dictionary entry '%s' in line %d of dictionary: '%s'. Ensure special characters # and \\ are escaped correctly as described in the documentation for dictionary formats.", rawLine, Integer.valueOf(lineNumber), this.params.getDictName());
            }
        }
        cleanEntries();
    }

    /**
     * Normalizes the entry list in place: replaces every entry with its
     * StringUtils.trim() result, then removes all values listed in EMPTY_STRING_LIST.
     */
    private void cleanEntries() {
        final List<String> all = this.entries;
        int idx = 0;
        while (idx < all.size()) {
            String trimmed = StringUtils.trim(all.get(idx));
            all.set(idx, trimmed);
            idx++;
        }
        all.removeAll(EMPTY_STRING_LIST);
    }

    /**
     * Returns the dictionary entries.
     *
     * @return the internal entry list itself (not a copy), so changes made by the caller
     *     are visible to this object
     */
    public List<String> getEntries() {
        return entries;
    }

    /**
     * Returns an iterator over the dictionary entries.
     *
     * @return a new iterator obtained from the internal entry list
     */
    public Iterator<String> getEntriesItr() {
        return entries.iterator();
    }

    /**
     * Returns the name of this dictionary.
     *
     * @return the dictionary name stored in this dictionary's parameters
     */
    public String getName() {
        return params.getDictName();
    }

    /**
     * Returns the matching parameters of this dictionary.
     *
     * @return the parameter object held by this dictionary (not a copy)
     */
    public DictParams getParams() {
        return params;
    }

    /**
     * Writes the dictionary to a text file: a header comment line followed by one escaped
     * entry per line.
     *
     * <p>The file is written as UTF-8, the same encoding used when dictionary files are
     * read, so a dumped dictionary can be loaded back regardless of the platform default
     * charset. The writer is closed on every path, including failures.
     *
     * @param file destination file; overwritten if it already exists
     * @throws IOException if the file cannot be opened or written
     */
    public void dumpToText(File file) throws IOException {
        try (OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8)) {
            out.write("# Dictionary file generated by DictFile.dumpToText()\n");
            for (String entry : this.entries) {
                out.write(String.format("%s\n", escaper.escapeStr(entry)));
            }
        }
    }

    /**
     * Counts how often each character occurs across the canonicalized form of all entries.
     *
     * <p>Entries are canonicalized with canonicalizeEntry(), so the token delimiter
     * character inserted between tokens is counted as well.
     *
     * @param tokenizer tokenizer used to canonicalize each entry
     * @param langCode language passed to the tokenizer
     * @return a new sorted map from character to number of occurrences
     */
    public TreeMap<Character, Integer> computeCharCounts(Tokenizer tokenizer, LangCode langCode) {
        TreeMap<Character, Integer> counts = new TreeMap<>();
        // One offsets buffer is handed to the tokenizer for every entry.
        BaseOffsetsList scratch = new BaseOffsetsList();
        for (Iterator<String> it = getEntriesItr(); it.hasNext(); ) {
            String canon = canonicalizeEntry(tokenizer, scratch, it.next(), langCode);
            final int len = canon.length();
            for (int pos = 0; pos < len; pos++) {
                Character ch = Character.valueOf(canon.charAt(pos));
                Integer seen = counts.get(ch);
                counts.put(ch, seen == null ? 1 : seen.intValue() + 1);
            }
        }
        return counts;
    }

    /**
     * Adds the character counts of one map into another.
     *
     * <p>For every character in {@code treeMap}, its count is added to the count already
     * present in {@code treeMap2}; a character missing from {@code treeMap2} starts at
     * zero. {@code treeMap} is not modified.
     *
     * @param treeMap source counts to add
     * @param treeMap2 destination counts, updated in place
     */
    public static void mergeCharCounts(TreeMap<Character, Integer> treeMap, TreeMap<Character, Integer> treeMap2) {
        for (Map.Entry<Character, Integer> entry : treeMap.entrySet()) {
            Character key = entry.getKey();
            // The running total lives in the destination map. Reading it from the source
            // (as before) doubled the source count and discarded the destination's value.
            Integer existing = treeMap2.get(key);
            int base = (existing == null) ? 0 : existing.intValue();
            treeMap2.put(key, Integer.valueOf(base + entry.getValue().intValue()));
        }
    }

    /**
     * Produces the canonical form of an entry: its tokens, as found by the tokenizer,
     * joined by the token delimiter character.
     *
     * @param tokenizer tokenizer used to split the entry
     * @param baseOffsetsList buffer the tokenizer fills with token offsets; its content is
     *     whatever the tokenizer leaves there after this call
     * @param str the entry text
     * @param langCode language passed to the tokenizer
     * @return the token texts joined by the delimiter; empty if no tokens were found
     */
    public static String canonicalizeEntry(Tokenizer tokenizer, BaseOffsetsList baseOffsetsList, String str, LangCode langCode) {
        tokenizer.tokenizeStr(str, langCode, baseOffsetsList);
        StringBuilder canon = new StringBuilder();
        for (int t = 0; t < baseOffsetsList.size(); t++) {
            if (t > 0) {
                // Same value as TOKEN_DELIM (U+2504); goes between tokens only.
                canon.append((char) 9476);
            }
            canon.append(str.substring(baseOffsetsList.begin(t), baseOffsetsList.end(t)));
        }
        return canon.toString();
    }

    /**
     * Returns the canonical form of every entry for one language.
     *
     * <p>All entries are concatenated into a single string (each followed by two
     * newlines) and tokenized in one pass; the tokens falling inside each entry's span are
     * then joined with the token delimiter. An entry that yields exactly one token is
     * returned unchanged, provided that token covers the whole entry.
     *
     * @param tokenizer tokenizer used to split the entries
     * @param langCode language passed to the tokenizer
     * @return canonical entries, in the same order as the dictionary entries
     * @throws RuntimeException if a single-token entry's token does not span the entry
     *     exactly
     */
    public ArrayList<String> getCanonEntries(Tokenizer tokenizer, LangCode langCode) {
        final String separator = "\n\n";
        final int entryCount = this.entries.size();
        int[] begins = new int[entryCount];
        int[] ends = new int[entryCount];

        // Concatenate the entries, remembering where each one starts and ends.
        StringBuilder joined = new StringBuilder();
        for (int idx = 0; idx < entryCount; idx++) {
            String entry = this.entries.get(idx);
            begins[idx] = joined.length();
            joined.append(entry);
            ends[idx] = joined.length();
            joined.append(separator);
        }
        String text = joined.toString();

        BaseOffsetsList allTokens = new BaseOffsetsList();
        tokenizer.tokenizeStr(text, langCode, allTokens);

        ArrayList<String> canon = new ArrayList<>();
        for (int idx = 0; idx < entryCount; idx++) {
            DerivedOffsetsList toks = new DerivedOffsetsList(allTokens, begins[idx], ends[idx], 0);

            if (toks.size() == 1) {
                boolean coversEntry = toks.begin(0) == begins[idx] && toks.end(0) == ends[idx];
                if (!coversEntry) {
                    String original = this.entries.get(idx);
                    String actualToken = text.substring(toks.begin(0), toks.end(0));
                    String expectedToken = text.substring(begins[idx], ends[idx]);
                    Log.info("Monster string: %s", StringUtils.quoteStr('"', text, true, true));
                    throw new RuntimeException(String.format("While tokenizing dictionary '%s' in language %s: Single-token entry runs from %d to %d, but tokenizer identified a token from %d to %d (original entry: '%s'; token '%s'; expected token '%s')\nThis error usually occurs because there is a difference between the tokenizer's concept of whitespace and that of java.lang.String.trim().", getName(), langCode, Integer.valueOf(begins[idx]), Integer.valueOf(ends[idx]), Integer.valueOf(toks.begin(0)), Integer.valueOf(toks.end(0)), StringUtils.escapeUnicode(original), actualToken, expectedToken));
                }
                canon.add(this.entries.get(idx));
                continue;
            }

            // Zero or several tokens: join them with the delimiter (same value as TOKEN_DELIM).
            StringBuilder buf = new StringBuilder();
            for (int t = 0; t < toks.size(); t++) {
                if (t > 0) {
                    buf.append((char) 9476);
                }
                buf.append((CharSequence) text, toks.begin(t), toks.end(t));
            }
            canon.add(buf.toString());
        }
        return canon;
    }

    /**
     * Compiles this dictionary for every language in its language set.
     *
     * <p>The entries are concatenated into one string (each followed by two newlines) and
     * tokenized once per language. For each entry the tokens inside its span are joined
     * with the token delimiter to form its canonical form, and each canonical form is
     * mapped to the set of languages that produced it. The resulting map is then encoded
     * into a {@link CompiledDictionary}; if there are no entries, an empty compiled
     * dictionary is returned.
     *
     * @param tokenizer tokenizer used to split the entries
     * @param dictMemoization memoization table passed to the LanguageSet factory methods
     * @return the compiled dictionary
     * @throws Exception if the parameters carry no language set, or wrapping any failure
     *     raised while encoding
     * @throws RuntimeException if a single-token entry's token does not span the entry
     *     exactly
     */
    public CompiledDictionary compile(Tokenizer tokenizer, DictMemoization dictMemoization) throws Exception {
        // Fail fast: nothing below is useful without a language set.
        if (null == this.params.getLangStr()) {
            throw new Exception(String.format("Parameters for dictionary '%s' do not contain a language set", this.params.getDictName()));
        }

        // Concatenate the entries, remembering where each one starts and ends.
        final int entryCount = this.entries.size();
        int[] begins = new int[entryCount];
        int[] ends = new int[entryCount];
        StringBuilder joined = new StringBuilder();
        for (int idx = 0; idx < entryCount; idx++) {
            String trimmed = this.entries.get(idx).trim();
            begins[idx] = joined.length();
            joined.append(trimmed);
            ends[idx] = joined.length();
            joined.append("\n\n");
        }
        String text = joined.toString();

        LanguageSet languages = LanguageSet.create(this.params.getLangStr(), dictMemoization);

        // Canonical entry -> languages in which the entry tokenizes to that form.
        // Insertion-ordered so the encoded output follows entry order.
        Map<String, LanguageSet> canonToLangs = new LinkedHashMap<>();

        Iterator<LangCode> langIt = languages.iterator();
        while (langIt.hasNext()) {
            LangCode lang = langIt.next();
            BaseOffsetsList allTokens = new BaseOffsetsList();
            tokenizer.tokenizeStr(text, lang, allTokens);

            for (int idx = 0; idx < entryCount; idx++) {
                DerivedOffsetsList toks = new DerivedOffsetsList(allTokens, begins[idx], ends[idx], 0);
                String canon;
                if (toks.size() == 1) {
                    if (toks.begin(0) != begins[idx] || toks.end(0) != ends[idx]) {
                        throw new RuntimeException(String.format("While canonicalizing dictionary '%s' in language %s: Single-token entry runs from %d to %d, but tokenizer identified a token from %d to %d (original entry: '%s'; token '%s'; expected token '%s')\nThis error usually occurs because there is a difference between the tokenizer's concept of whitespace and that of java.lang.String.trim().", getName(), lang, Integer.valueOf(begins[idx]), Integer.valueOf(ends[idx]), Integer.valueOf(toks.begin(0)), Integer.valueOf(toks.end(0)), StringUtils.escapeUnicode(this.entries.get(idx)), text.substring(toks.begin(0), toks.end(0)), text.substring(begins[idx], ends[idx])));
                    }
                    canon = this.entries.get(idx);
                } else {
                    // Zero or several tokens: join them with the delimiter (same value as TOKEN_DELIM).
                    StringBuilder buf = new StringBuilder();
                    for (int t = 0; t < toks.size(); t++) {
                        if (t > 0) {
                            buf.append((char) 9476);
                        }
                        buf.append((CharSequence) text, toks.begin(t), toks.end(t));
                    }
                    canon = buf.toString();
                }

                LanguageSet soFar = canonToLangs.get(canon);
                canonToLangs.put(canon, null == soFar ? LanguageSet.create(lang, dictMemoization) : LanguageSet.create(soFar, lang, dictMemoization));
            }
        }

        try {
            return canonToLangs.size() > 0 ? encodeDictionary(canonToLangs, languages, tokenizer.getName()) : CompiledDictionary.createEmptyCompiledDictionary(this.params.getDictName(), tokenizer.getName());
        } catch (Exception e) {
            throw new Exception(String.format("Error while compiling dictionary %s", getName()), e);
        }
    }

    /**
     * Encodes canonical entries into integer rows and builds the compiled dictionary.
     *
     * <p>Each entry becomes an {@code int[]} laid out as
     * {@code [languageSetCode, caseFlag, tokenId...]}: the language-set code comes from
     * {@link EncodeDecodeLanguageSet}, the case flag is 1 unless the dictionary's default
     * case is insensitive (then 0), and each token id is the token's index in a table
     * built in first-seen order.
     *
     * @param map canonical entry (tokens joined by the token delimiter) to the languages
     *     it applies to
     * @param languageSet all languages of the dictionary
     * @param str tokenizer name, passed through to the compiled dictionary
     * @return the compiled dictionary
     * @throws Exception declared for callers; propagated from the helpers used here
     */
    private CompiledDictionary encodeDictionary(Map<String, LanguageSet> map, LanguageSet languageSet, String str) throws Exception {
        ArrayList<int[]> encodedEntries = new ArrayList<>(map.size());
        // Token text -> id; ids are handed out in first-seen order.
        LinkedHashMap<String, Integer> tokenIds = new LinkedHashMap<>();
        EncodeDecodeLanguageSet langEncoder = new EncodeDecodeLanguageSet(languageSet);
        // Caches the encoded form of each distinct language set.
        HashMap<LanguageSet, Integer> langSetCodes = new HashMap<>();

        // The case flag is the same for every entry, so compute it once.
        final int caseFlag = (this.params.getDefaultCase() != DictParams.CaseSensitivityType.insensitive) ? 1 : 0;

        for (Map.Entry<String, LanguageSet> entry : map.entrySet()) {
            LanguageSet entryLangs = entry.getValue();
            // (char) 9476 is the same value as TOKEN_DELIM.
            String[] tokens = StringUtils.split(entry.getKey(), (char) 9476);
            int[] row = new int[tokens.length + 2];

            Integer langSetCode = langSetCodes.get(entryLangs);
            if (null == langSetCode) {
                langSetCode = Integer.valueOf(langEncoder.encode(entryLangs));
                langSetCodes.put(entryLangs, langSetCode);
            }

            row[0] = langSetCode.intValue();
            row[1] = caseFlag;
            int pos = 2;
            for (String token : tokens) {
                Integer tokenId = tokenIds.get(token);
                if (null == tokenId) {
                    tokenId = Integer.valueOf(tokenIds.size());
                    tokenIds.put(token, tokenId);
                }
                row[pos++] = tokenId.intValue();
            }
            encodedEntries.add(row);
        }

        return new CompiledDictionary(this.params.getDictName(), transposeTokenTable(tokenIds), languageSet, encodedEntries, str, Boolean.valueOf(this.params.supportLemmaMatch()));
    }

    /**
     * Inverts a token table, turning token-to-id pairs into id-to-token pairs.
     *
     * @param map token text to token id
     * @return a new map from token id to token text; if several tokens share an id, the
     *     one iterated last wins
     */
    private Map<Integer, String> transposeTokenTable(Map<String, Integer> map) {
        Map<Integer, String> byId = new HashMap<>();
        for (Map.Entry<String, Integer> pair : map.entrySet()) {
            byId.put(pair.getValue(), pair.getKey());
        }
        return byId;
    }

    static {
        EMPTY_STRING_LIST.add(GetCol.USAGE);
        escaper = new Escaper(new char[]{'#'}, new char[]{'#'});
    }
}
