package com.ibm.avatar.algebra.util.tokenize;

import com.ibm.avatar.algebra.datamodel.Span;
import com.ibm.avatar.algebra.datamodel.SpanText;
import com.ibm.avatar.algebra.datamodel.Text;
import com.ibm.avatar.algebra.util.lang.LangCode;
import com.ibm.avatar.algebra.util.string.StringUtils;
import com.ibm.avatar.logging.Log;
import java.util.TreeSet;

/* loaded from: input_file:com/ibm/avatar/algebra/util/tokenize/Tokenizer.class */
/**
 * Base class for tokenizers. Concrete subclasses supply the actual tokenization in
 * {@link #tokenizeStr(CharSequence, LangCode, BaseOffsetsList)}; this class layers token caching
 * on {@link Text} and {@link Span} objects on top of it and offers helpers that build
 * {@link DerivedOffsetsList} views over a text's cached tokens.
 */
public abstract class Tokenizer {
    /**
     * Value written to a span's begin/end token index when the corresponding span edge is not
     * reported as lying on a token boundary by the derived offsets list.
     */
    private static final int OFF_TOKEN_BOUNDARY = -2;

    /** Compile-time switch for the trace output in {@link #tokenizeBackwards}. */
    private static final boolean DEBUG = false;

    /**
     * Tokenizes a {@link Span} or a {@link Text}, dispatching on the runtime type of the argument.
     *
     * @param spanText the object to tokenize
     * @return the tokens of the argument, as produced by the matching overload
     * @throws RuntimeException if the argument is neither a {@code Span} nor a {@code Text}
     *         (this includes {@code null}, for which both {@code instanceof} checks are false)
     */
    public final OffsetsList tokenize(SpanText spanText) {
        if (spanText instanceof Span) {
            return tokenize((Span) spanText);
        }
        if (spanText instanceof Text) {
            return tokenize((Text) spanText);
        }
        throw new RuntimeException("This code should not be reached");
    }

    /**
     * Returns the tokens of a text object, computing and caching them on first use.
     *
     * @param text the text to tokenize
     * @return the cached token list if the text already has one, otherwise a newly built list
     *         that has also been stored on the text
     */
    public final OffsetsList tokenize(Text text) {
        return baseTokens(text);
    }

    /**
     * Returns the tokens of a span, computing and caching them on first use.
     *
     * <p>On a cache miss the span's document text is tokenized first, a
     * {@link DerivedOffsetsList} covering {@code [span.getBegin(), span.getEnd()]} is created over
     * those tokens, and the span's begin/end token indexes are updated: each is set to the
     * derived list's first/last index when the respective edge lies on a token boundary, and to
     * {@code -2} otherwise.
     *
     * @param span the span to tokenize
     * @return the span's cached tokens, or the newly created derived list
     */
    public final OffsetsList tokenize(Span span) {
        OffsetsList cached = span.getCachedTokens();
        if (null != cached) {
            return cached;
        }

        BaseOffsetsList docTokens = baseTokens(span.getDocTextObj());

        // NOTE(review): the fourth argument is passed through unchanged from the original code;
        // presumably it is the offset the derived list's positions are relative to -- confirm
        // against DerivedOffsetsList.
        DerivedOffsetsList spanTokens =
                new DerivedOffsetsList(docTokens, span.getBegin(), span.getEnd(), span.getBegin());

        span.setBeginTok(
                spanTokens.beginsOnTokenBoundary() ? spanTokens.getFirstIx() : OFF_TOKEN_BOUNDARY);
        span.setEndTok(
                spanTokens.endsOnTokenBoundary() ? spanTokens.getLastIx() : OFF_TOKEN_BOUNDARY);

        span.setCachedTokens(spanTokens);
        return spanTokens;
    }

    /**
     * Initializes {@code derivedOffsetsList} as a forward-looking view over the tokens of
     * {@code text}, starting at character offset {@code i}.
     *
     * <p>The view's end offset is the end of the token whose index is
     * {@code nextBeginIx(i) + i2}, clamped to the last token of the text and to the text length.
     * If the text has no tokens at all, the view is empty ({@code [i, i]}).
     *
     * @param text the text whose tokens are used (tokenized on demand)
     * @param i character offset at which the view begins
     * @param i2 number of token positions to advance past the token index returned by
     *        {@code nextBeginIx(i)} when locating the view's end
     * @param derivedOffsetsList the list to (re)initialize; receives the result
     */
    public final void tokenize(Text text, int i, int i2, DerivedOffsetsList derivedOffsetsList) {
        BaseOffsetsList baseTokens = baseTokens(text);

        int endOffset;
        if (0 == baseTokens.size()) {
            // Nothing to cover: produce an empty range anchored at the start offset.
            endOffset = i;
        } else {
            int lastTokenIx = Math.min(baseTokens.nextBeginIx(i) + i2, baseTokens.size() - 1);
            endOffset = Math.min(baseTokens.end(lastTokenIx), text.getLength());
        }

        derivedOffsetsList.init(baseTokens, i, endOffset, 0);
    }

    /**
     * Initializes {@code derivedOffsetsList} as a backward-looking view over the tokens of
     * {@code text}, ending at character offset {@code i}.
     *
     * <p>The view begins at the start of the token that lies {@code i2 - 1} positions before the
     * index returned by {@code prevBeginIx(i)}, clamped to the first token; the resulting begin
     * offset is additionally clamped to be non-negative.
     *
     * @param text the text whose tokens are used (tokenized on demand)
     * @param i character offset at which the view ends
     * @param i2 number of token positions to reach back from {@code prevBeginIx(i)}, inclusive
     * @param derivedOffsetsList the list to (re)initialize; receives the result
     * @throws ArrayIndexOutOfBoundsException if {@code i} is greater than the length of the text
     */
    public void tokenizeBackwards(Text text, int i, int i2, DerivedOffsetsList derivedOffsetsList) {
        if (i > text.getLength()) {
            throw new ArrayIndexOutOfBoundsException(
                    String.format("Span index %d out of range (max %d)", i, text.getLength()));
        }

        BaseOffsetsList baseTokens = baseTokens(text);

        int lastTokenIx = baseTokens.prevBeginIx(i);
        int firstTokenIx = Math.max(0, (lastTokenIx - i2) + 1);
        int beginOffset = Math.max(baseTokens.begin(firstTokenIx), 0);

        if (DEBUG) {
            Log.debug("tokenizeBackwards(%d): Creating DerivedOffsetsList from %d (token %d) to %d (token %d)", Integer.valueOf(i2), Integer.valueOf(beginOffset), Integer.valueOf(firstTokenIx), Integer.valueOf(i), Integer.valueOf(lastTokenIx));
            Log.debug("tokenizeBackwards(): Target text is: '%s'", StringUtils.escapeForPrinting(text.getText().subSequence(beginOffset, i)));
        }

        derivedOffsetsList.init(baseTokens, beginOffset, i, 0);
    }

    /**
     * Performs the actual tokenization. {@link #tokenize(Text)} calls this with the text's
     * string content, its language, and a newly created, empty list, and afterwards caches that
     * list as the text's tokens; implementations are therefore expected to add the tokens they
     * find to {@code baseOffsetsList}.
     *
     * @param charSequence the characters to tokenize
     * @param langCode the language of the text
     * @param baseOffsetsList output list that receives the tokens
     */
    public abstract void tokenizeStr(CharSequence charSequence, LangCode langCode, BaseOffsetsList baseOffsetsList);

    /**
     * @return presumably {@code true} if this tokenizer can produce part-of-speech information
     *         (inferred from the method name; see the concrete implementations)
     */
    public abstract boolean supportsPOSTagging();

    /**
     * Converts a textual part-of-speech specification into a sorted set of integer codes.
     * The format of the specification and the meaning of the codes are defined by the
     * implementation.
     *
     * @param str the part-of-speech specification
     * @param langCode the language the specification applies to
     * @return the integer codes selected by the specification
     */
    public abstract TreeSet<Integer> decodePOSSpec(String str, LangCode langCode);

    /**
     * Converts an integer part-of-speech code to its textual form; presumably the counterpart
     * of {@link #decodePOSSpec(String, LangCode)} (confirm against the implementations).
     *
     * @param i the part-of-speech code
     * @param langCode the language the code applies to
     * @return the textual form of the code
     */
    public abstract CharSequence posCodeToString(int i, LangCode langCode);

    /**
     * @return presumably {@code true} if this tokenizer can produce lemmas for tokens
     *         (inferred from the method name; see the concrete implementations)
     */
    public abstract boolean supportLemmatization();

    /**
     * @return the name of this tokenizer
     */
    public abstract String getName();

    /**
     * Typed core of {@link #tokenize(Text)}: returns the text's cached base token list, or
     * builds one via {@link #tokenizeStr}, stores it on the text, and returns it. Keeping the
     * {@code BaseOffsetsList} type here lets the other methods use the result without
     * re-reading the cache and downcasting.
     */
    private BaseOffsetsList baseTokens(Text text) {
        BaseOffsetsList cached = (BaseOffsetsList) text.getCachedTokens();
        if (null != cached) {
            return cached;
        }
        String content = text.getText();
        BaseOffsetsList fresh = new BaseOffsetsList();
        tokenizeStr(content, text.getLanguage(), fresh);
        text.setCachedTokens(fresh);
        return fresh;
    }
}
