/*
 * Decompiled with CFR 0.152.
 */
package org.lionsoul.jcseg.tokenizer;

import java.io.IOException;
import java.io.Reader;
import org.lionsoul.jcseg.tokenizer.ComplexSeg;
import org.lionsoul.jcseg.tokenizer.Word;
import org.lionsoul.jcseg.tokenizer.core.ADictionary;
import org.lionsoul.jcseg.tokenizer.core.IChunk;
import org.lionsoul.jcseg.tokenizer.core.IWord;
import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
import org.lionsoul.jcseg.util.EntityFormat;
import org.lionsoul.jcseg.util.IStringBuffer;
import org.lionsoul.jcseg.util.NumericUtil;
import org.lionsoul.jcseg.util.StringUtil;

/**
 * Segmenter variant built on {@link ComplexSeg} that attaches entity labels
 * (numbers, fractions, names, e-mail, URL, IP, mobile number, percentage, ...)
 * to the words it produces.
 *
 * <p>NOTE(review): this source was recovered by a decompiler, so the numeric
 * literals passed to {@code dic.match/get}, to the {@code Word} constructor and
 * tested against {@code ctrlMask} are inlined constants. Their meanings noted in
 * the comments below are inferred from how the values are used here and should
 * be confirmed against the constant definitions in the tokenizer core.
 */
public class NLPSeg
extends ComplexSeg {
    /**
     * Creates a segmenter reading from {@code input}.
     *
     * <p>Side effect: the supplied {@code config} object is modified in place:
     * {@code APPEND_CJK_PINYIN} and {@code APPEND_CJK_SYN} are switched off and
     * {@code MAX_LATIN_LENGTH} is set to 128.
     *
     * @param input  the text source; may be {@code null} (see the two-argument constructor)
     * @param config the task configuration (mutated as described above)
     * @param dic    the dictionary used for all lexicon lookups
     * @throws IOException propagated from the super constructor
     */
    public NLPSeg(Reader input, JcsegTaskConfig config, ADictionary dic) throws IOException {
        super(input, config, dic);
        config.APPEND_CJK_PINYIN = false;
        config.APPEND_CJK_SYN = false;
        config.MAX_LATIN_LENGTH = 128;
    }

    /**
     * Creates a segmenter without an initial input reader.
     *
     * @param config the task configuration (mutated, see the three-argument constructor)
     * @param dic    the dictionary used for all lexicon lookups
     * @throws IOException propagated from the delegated constructor
     */
    public NLPSeg(JcsegTaskConfig config, ADictionary dic) throws IOException {
        this(null, config, dic);
    }

    /**
     * Segments the CJK sentence that starts with {@code c}, pushes every resulting
     * word into {@code wordPool} and returns the first pooled word.
     *
     * <p>For each position the method first tries to read a Chinese numeric
     * (optionally followed by a unit word); otherwise it takes the first word of
     * the best chunk, optionally merges it into a Chinese name, optionally drops
     * it as a stop word, and optionally replaces it with a mixed word.
     *
     * @param c   the first character of the sentence
     * @param pos the position of {@code c}; word positions are {@code pos} plus
     *            the offset inside the sentence
     * @return the head of {@code wordPool}, or {@code null} when the pool is empty
     * @throws IOException propagated from the underlying reads
     */
    @Override
    protected IWord getNextCJKWord(int c, int pos) throws IOException {
        char[] chars = this.nextCJKSentence(c);
        int cjkidx = 0;
        IWord w = null;
        while (cjkidx < chars.length) {
            w = null;
            int numVal = NumericUtil.isCNNumeric(chars[cjkidx]);
            if (numVal > -1) {
                // The current char is a Chinese numeric: consume the whole numeric run.
                IWord unitWord = null;
                int wordLen = -1;
                String num = this.nextCNNumeric(chars, cjkidx);
                if ((this.ctrlMask & 2) != 0) {
                    // Mask bit 2 is set: the numeric is handled as a fraction
                    // (presumably set by nextCNNumeric -- confirm).
                    if (this.config.CNFRA_TO_ARABIC) {
                        // "<denominator>分之<numerator>" is emitted as "numerator/denominator".
                        String[] split = num.split("\u5206\u4e4b");
                        w = new Word(NumericUtil.cnNumericToArabic(split[1], true) + "/" + NumericUtil.cnNumericToArabic(split[0], true), 9, "numeric.fraction");
                        w.setPartSpeech(IWord.NUMERIC_POSPEECH);
                    } else {
                        w = new Word(num, 9, "numeric.cn.fraction");
                        w.setPartSpeech(IWord.NUMERIC_POSPEECH);
                    }
                } else {
                    String temp = null;
                    IStringBuffer sb = new IStringBuffer();
                    if (numVal <= 10) {
                        // Look for the longest unit word (lexicon 1) directly after the numeric.
                        int j = num.length();
                        for (int i = 0; cjkidx + j < chars.length && i < this.config.MAX_UNIT_LENGTH; ++i) {
                            sb.append(chars[cjkidx + j]);
                            temp = sb.toString();
                            if (this.dic.match(1, temp)) {
                                unitWord = this.dic.get(1, temp);
                            }
                            ++j;
                        }
                    }
                    if (unitWord == null) {
                        // No unit: try the longest lexicon-0 word that starts with the numeric.
                        IWord wd = null;
                        sb.clear().append(num);
                        for (int j = num.length(); cjkidx + j < chars.length && j < this.config.MAX_LENGTH; ++j) {
                            sb.append(chars[cjkidx + j]);
                            temp = sb.toString();
                            if (!this.dic.match(0, temp)) continue;
                            wd = this.dic.get(0, temp);
                        }
                        if (wd != null) {
                            w = wd.clone();
                            wordLen = w.getLength();
                        } else if (this.config.CNNUM_TO_ARABIC) {
                            String arabic = NumericUtil.cnNumericToArabic(num, true) + "";
                            w = new Word(arabic, 9, "numeric.interger");
                            w.setPartSpeech(IWord.NUMERIC_POSPEECH);
                        } else {
                            w = new Word(num, 9, "numeric.cn");
                            w.setPartSpeech(IWord.NUMERIC_POSPEECH);
                        }
                    } else if (this.config.CNNUM_TO_ARABIC) {
                        // Numeric followed by a unit: the entity is "<numeric entity>#<unit entity>".
                        String arabic = NumericUtil.cnNumericToArabic(num, true) + "";
                        String entity = "numeric.interger#" + unitWord.getEntity();
                        w = new Word(arabic, 9, entity);
                        w.setPartSpeech(IWord.NUMERIC_POSPEECH);
                    } else {
                        String entity = "numeric.cn#" + unitWord.getEntity();
                        w = new Word(num, 1, entity);
                        w.setPartSpeech(IWord.NUMERIC_POSPEECH);
                    }
                }
                this.wordPool.add(w);
                w.setPosition(pos + cjkidx);
                // Advance by the source length, not by the (possibly converted) word value.
                cjkidx += wordLen > 0 ? wordLen : num.length();
                if (unitWord == null) continue;
                IWord wd = unitWord.clone();
                wd.setPosition(pos + cjkidx);
                this.wordPool.add(wd);
                cjkidx += wd.getLength();
                continue;
            }
            IChunk chunk = this.getBestCJKChunk(chars, cjkidx);
            w = chunk.getWords()[0];
            // T stays -1 unless a name is recognised below (3 -> "name.cn", 4 -> "name.nickname").
            int T = -1;
            if (this.config.I_CN_NAME && w.getLength() <= 2 && chunk.getWords().length > 1) {
                StringBuilder sb = new StringBuilder();
                sb.append(w.getValue());
                String str = null;
                if (this.dic.match(2, w.getValue()) && (str = this.findCHName(chars, 0, chunk)) != null) {
                    T = 3;
                    sb.append(str);
                } else if (this.dic.match(6, w.getValue()) && chunk.getWords()[1].getLength() <= 2 && this.dic.match(2, chunk.getWords()[1].getValue())) {
                    T = 4;
                    sb.append(chunk.getWords()[1].getValue());
                }
                if (T != -1) {
                    w = new Word(sb.toString(), T);
                    w.setEntity(T == 4 ? "name.nickname" : "name.cn");
                    w.setPartSpeech(IWord.NAME_POSPEECH);
                }
            }
            if (this.config.CLEAR_STOPWORD && this.dic.match(7, w.getValue())) {
                // Stop word (lexicon 7): skip it without pooling.
                cjkidx += w.getLength();
                continue;
            }
            IWord ce = null;
            if ((this.ctrlMask & 1) != 0 && chars.length - cjkidx <= this.dic.mixPrefixLength) {
                ce = this.getNextMixedWord(chars, cjkidx);
            }
            if (ce == null) {
                if (T == -1) {
                    // Dictionary-owned word: clone before mutating its position.
                    w = w.clone();
                }
            } else {
                w = ce.clone();
            }
            w.setPosition(pos + cjkidx);
            this.wordPool.add(w);
            cjkidx += w.getLength();
            if (T != -1) continue;
            this.appendWordFeatures(w);
        }
        if (this.wordPool.size() == 0) {
            return null;
        }
        return (IWord)this.wordPool.remove();
    }

    /**
     * Reads the Latin token (letters, digits and kept punctuation) that starts
     * with {@code c} and classifies it.
     *
     * <p>Characters are accumulated in {@code isb}, lower-cased, with full-width
     * forms folded to their half-width equivalents. Reading stops at whitespace,
     * at a punctuation mark that is not kept, at a character of unknown type, at
     * end of input, or once the buffer exceeds {@code config.MAX_LATIN_LENGTH}.
     * The token is then returned as an e-mail, mobile number, IP address, URL,
     * percentage, integer, decimal, dictionary word, mixed word or plain Latin
     * word. A unit word found directly after a number is added to
     * {@code wordPool}; characters read ahead but not used are pushed back.
     *
     * @param c   the first character of the token
     * @param pos the position of {@code c}, used to position a trailing unit word
     * @return the recognised word (never {@code null})
     * @throws IOException propagated from the underlying reads
     */
    @Override
    protected IWord nextLatinWord(int c, int pos) throws IOException {
        int ch;
        this.isb.clear();
        // Fold full-width forms (above U+FF00) to half-width by subtracting 0xFEE0.
        if (c > 65280) {
            c -= 65248;
        }
        if (c >= 'A' && c <= 'Z') {
            c += 32;
        }
        this.isb.append((char)c);
        boolean _check = false;   // the token was terminated by a CJK character
        boolean _wspace = false;  // the token was terminated by whitespace
        int atcount = 0;          // number of '@' seen
        int ptcount = 0;          // number of '.' seen
        int _ctype = 0;
        int tcount = 1;           // number of character-type runs in the token
        int _TYPE = StringUtil.getEnCharType(c);
        while ((ch = this.readNext()) != -1) {
            if (ch > 65280) {
                ch -= 65248;
            }
            if ((_ctype = StringUtil.getEnCharType(ch)) == 3) {
                // Type 3: whitespace ends the token and is consumed.
                _wspace = true;
                break;
            }
            if (_ctype == 2) {
                // Type 2: punctuation.
                if (ch == '@') {
                    ++atcount;
                } else if (ch == '.') {
                    ++ptcount;
                } else if (ch == ':') {
                    // Keep "://" together so that URLs survive as one token.
                    int nchr1 = this.readNext();
                    int nchr2 = this.readNext();
                    if (nchr1 == '/' && nchr2 == '/') {
                        this.isb.append((char)ch).append((char)nchr1).append((char)nchr2);
                        // Sentinel: the characters were already appended, skip the append below.
                        ch = -1;
                    } else {
                        this.pushBack(nchr2);
                        this.pushBack(nchr1);
                        this.pushBack(ch);
                        break;
                    }
                } else if (!StringUtil.isENKeepPunctuaton((char)ch)) {
                    this.pushBack(ch);
                    break;
                }
            }
            if (_ctype == -1) {
                // Unknown type: not part of a Latin token. Remember whether it is CJK
                // so that a mixed Latin+CJK dictionary word can be tried afterwards.
                this.pushBack(ch);
                if (StringUtil.isCJKChar(ch)) {
                    _check = true;
                }
                break;
            }
            if (ch >= 'A' && ch <= 'Z') {
                ch += 32;
            }
            if (ch > 0) {
                this.isb.append((char)ch);
            }
            if (_ctype != _TYPE) {
                ++tcount;
                _TYPE = _ctype;
            }
            // Enforce the configured upper bound on the token length.
            if (this.isb.length() > this.config.MAX_LATIN_LENGTH) {
                break;
            }
        }

        // Trailing dots belong to the surrounding text, not to the token.
        int oLen = this.isb.length();
        for (int i = oLen - 1; i > 0 && this.isb.charAt(i) == '.'; --i) {
            this.pushBack(this.isb.charAt(i));
            this.isb.deleteCharAt(i);
            _check = false;
        }
        if (oLen > this.isb.length() && !StringUtil.isEnPunctuation(this.isb.last())) {
            --tcount;
        }

        IWord wd = null;
        String str = this.isb.toString();

        // Entity recognition on the complete token.
        if (atcount == 1 && EntityFormat.isMailAddress(str)) {
            wd = new Word(str, 5, "email");
            wd.setPartSpeech(IWord.EN_POSPEECH);
            return wd;
        }
        if (tcount == 1 && StringUtil.isEnNumeric(this.isb.first()) && EntityFormat.isMobileNumber(str)) {
            wd = new Word(str, 5, "mobile");
            wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
            return wd;
        }
        if (tcount == 7 && StringUtil.isEnNumeric(this.isb.first()) && EntityFormat.isIpAddress(str)) {
            wd = new Word(str, 5, "ip");
            wd.setPartSpeech(IWord.EN_POSPEECH);
            return wd;
        }
        if (ptcount > 0 && EntityFormat.isUrlAddress(str, this.dic)) {
            wd = new Word(str, 5, "url");
            wd.setPartSpeech(IWord.EN_POSPEECH);
            return wd;
        }

        // Case 1: the token ended at end of input or at whitespace.
        if (ch == -1 || _wspace) {
            boolean isPercentage = false;
            // Strip trailing punctuation one character at a time, stopping at a
            // percentage, at a non-punctuation character or at a dictionary hit.
            for (int i = this.isb.length() - 1; i > 0; --i) {
                if (this.isb.charAt(i) == '%') {
                    if (i > 0 && StringUtil.isEnNumeric(this.isb.charAt(i - 1))) {
                        isPercentage = true;
                        break;
                    }
                } else if (!StringUtil.isEnPunctuation(this.isb.charAt(i))) break;
                if (this.dic.match(0, str)) {
                    wd = this.dic.get(0, str).clone();
                    break;
                }
                this.pushBack(this.isb.charAt(i));
                this.isb.deleteCharAt(i);
                str = this.isb.toString();
            }
            if (wd == null) {
                if (isPercentage) {
                    wd = new Word(str, 5, "numeric.percentage");
                    wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
                } else if (tcount == 1 && StringUtil.isDigit(str)) {
                    wd = new Word(str, 5, "numeric.interger");
                    wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
                } else if (tcount == 3 && StringUtil.isDecimal(str)) {
                    wd = new Word(str, 5, "numeric.decimal");
                    wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
                } else if (this.dic.match(0, str)) {
                    wd = this.dic.get(0, str).clone();
                } else {
                    wd = new Word(str, 5);
                    wd.setPartSpeech(IWord.EN_POSPEECH);
                }
            }
            if (wd.getPartSpeech() == null) {
                wd.setPartSpeech(IWord.EN_POSPEECH);
            }
            return wd;
        }

        // Case 2: the token ended at something other than a CJK character.
        if (!_check) {
            boolean isDigit = StringUtil.isDigit(str);
            if (isDigit || StringUtil.isDecimal(str)) {
                String entity = isDigit ? "numeric.interger" : "numeric.decimal";
                wd = new Word(str, 5, entity);
                wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
                // A single following character may be a unit word (lexicon 1).
                ch = this.readNext();
                String unit = (char)ch + "";
                if (this.dic.match(1, unit)) {
                    IWord unitWord = this.dic.get(1, unit).clone();
                    unitWord.setPosition(pos + str.length());
                    this.wordPool.add(unitWord);
                } else {
                    this.pushBack(ch);
                }
            }
            if (wd == null) {
                wd = this.dic.match(0, str) ? this.dic.get(0, str).clone() : new Word(str, 5);
            }
            if (wd.getPartSpeech() == null) {
                wd.setPartSpeech(IWord.EN_POSPEECH);
            }
            return wd;
        }

        // Case 3: the token is directly followed by a CJK character.
        int length = this.isb.length();
        if (length > 1 && this.isb.charAt(length - 1) == '%' && StringUtil.isEnNumeric(this.isb.charAt(length - 2))) {
            wd = new Word(str, 5, "numeric.percentage");
            wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
            return wd;
        }

        // Try to extend the token with following characters into a mixed dictionary
        // word; mc is the number of read-ahead characters covered by the longest match.
        IStringBuffer ibuffer = new IStringBuffer(str);
        String tstr = null;
        int mc = 0;
        int j;
        this.ialist.clear();
        for (j = 0; j < this.dic.mixSuffixLength && (ch = this.readNext()) != -1; ++j) {
            if (StringUtil.isWhitespace(ch)) {
                this.pushBack(ch);
                break;
            }
            ibuffer.append((char)ch);
            this.ialist.add(ch);
            tstr = ibuffer.toString();
            if (!this.dic.match(0, tstr)) continue;
            wd = this.dic.get(0, tstr);
            mc = j + 1;
        }
        ibuffer.clear();
        // Return the read-ahead characters that are not part of the match, last one first.
        for (int i = j - 1; i >= mc; --i) {
            this.pushBack(this.ialist.get(i));
        }
        if (wd != null) {
            if ((wd = wd.clone()).getPartSpeech() == null) {
                wd.setPartSpeech(IWord.MIX_POSPEECH);
            }
            return wd;
        }

        // No mixed word (so mc is still 0): a number may be followed by a unit word.
        boolean isDigit = StringUtil.isDigit(str);
        if (isDigit || StringUtil.isDecimal(str)) {
            String entity = isDigit ? "numeric.interger" : "numeric.decimal";
            wd = new Word(str, 5, entity);
            wd.setPartSpeech(IWord.NUMERIC_POSPEECH);
            this.ialist.clear();
            IWord unitWord = null;
            IStringBuffer sb = new IStringBuffer();
            for (j = 0; j < this.config.MAX_UNIT_LENGTH && (ch = this.readNext()) != -1; ++j) {
                if (StringUtil.isWhitespace(ch)) {
                    this.pushBack(ch);
                    break;
                }
                sb.append((char)ch);
                this.ialist.add(ch);
                // The lookup key is the unit candidate read so far, not the number in isb.
                tstr = sb.toString();
                if (!this.dic.match(1, tstr)) continue;
                unitWord = this.dic.get(1, tstr);
                mc = j + 1;
            }
            if (unitWord != null) {
                unitWord = unitWord.clone();
                unitWord.setPosition(pos + str.length());
                this.wordPool.add(unitWord);
            }
            for (int i = j - 1; i >= mc; --i) {
                this.pushBack(this.ialist.get(i));
            }
        }
        if (wd == null) {
            wd = this.dic.match(0, str) ? this.dic.get(0, str).clone() : new Word(str, 5);
            if (wd.getPartSpeech() == null) {
                wd.setPartSpeech(IWord.EN_POSPEECH);
            }
        }
        return wd;
    }
}

