package com.ibm.avatar.algebra.scan;

import com.ibm.avatar.algebra.base.MemoizationTable;
import com.ibm.avatar.algebra.base.MultiInputOperator;
import com.ibm.avatar.algebra.datamodel.AbstractTupleSchema;
import com.ibm.avatar.algebra.datamodel.FieldGetter;
import com.ibm.avatar.algebra.datamodel.FieldSetter;
import com.ibm.avatar.algebra.datamodel.FieldType;
import com.ibm.avatar.algebra.datamodel.Pair;
import com.ibm.avatar.algebra.datamodel.Text;
import com.ibm.avatar.algebra.datamodel.Tuple;
import com.ibm.avatar.algebra.datamodel.TupleList;
import com.ibm.avatar.algebra.datamodel.TupleSchema;
import com.ibm.avatar.algebra.oldscan.CsvFileScan;
import com.ibm.avatar.algebra.oldscan.DBDumpFileScan;
import com.ibm.avatar.algebra.oldscan.DirDocScan;
import com.ibm.avatar.algebra.oldscan.JsonFileScan;
import com.ibm.avatar.algebra.oldscan.TextFileScan;
import com.ibm.avatar.algebra.oldscan.ZipFileScan;
import com.ibm.avatar.algebra.util.lang.LangCode;
import com.ibm.avatar.api.Constants;
import com.ibm.avatar.api.exceptions.TextAnalyticsException;
import com.ibm.avatar.api.exceptions.UnrecognizedFileFormatException;
import com.ibm.avatar.aql.tam.ModuleUtils;
import java.io.CharConversionException;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;

/* loaded from: input_file:com/ibm/avatar/algebra/scan/DocScanInternal.class */
/**
 * Abstract base for operators that produce document tuples, one per evaluation.
 *
 * <p>Subclasses supply the actual reading logic through {@link #startScan},
 * {@link #getNextDoc} and {@link #reallyCheckEndOfInput}. This class takes care of
 * starting the scan lazily (exactly once), optionally overriding the language of
 * the document text, and picking a concrete scan implementation for a file via
 * {@link #makeFileScan(File, TupleSchema, Map, char)}.
 */
public abstract class DocScanInternal extends MultiInputOperator {
    /** Setter for the text column of the document schema; assigned in {@link #createOutputSchema()}. */
    protected FieldSetter<String> docTextAcc;

    /** Getters for the text column(s) of the document schema; filled in {@link #createOutputSchema()}. */
    protected ArrayList<FieldGetter<Text>> docTextGetters;

    /** Schema of the document tuples this scan returns. */
    protected AbstractTupleSchema docSchema;

    /** Language applied to every text field of each document, or null for no override. */
    private LangCode docLangOverride;

    /** True once {@link #startScan} has been invoked through this class. */
    private boolean scanStarted;

    /** When true, {@link #getDocText} drops the carriage return of every CR-LF pair. */
    protected boolean stripCR;

    /** Shared empty input array returned by {@link #prepareInputs}. */
    private static final TupleList[] EMPTY_LIST = new TupleList[0];

    /** Schemas accepted by {@link #makeFileScan} for inputs that are neither JSON nor CSV. */
    private static final AbstractTupleSchema[] defaultSupportedSchemas = {createOneColumnSchema(), createLabeledSchema()};

    /**
     * Builds the two-column document schema ("text" plus the label column), both of text type.
     *
     * @return a new schema, passed through {@code ModuleUtils.createSortedDocSchema}
     */
    public static final TupleSchema createLabeledSchema() {
        String[] colNames = {"text", Constants.LABEL_COL_NAME};
        FieldType[] colTypes = {FieldType.TEXT_TYPE, FieldType.TEXT_TYPE};
        return ModuleUtils.createSortedDocSchema(new TupleSchema(colNames, colTypes));
    }

    /**
     * Builds the one-column document schema holding a single "text" column of text type.
     *
     * @return a new schema, passed through {@code ModuleUtils.createSortedDocSchema}
     */
    public static final TupleSchema createOneColumnSchema() {
        String[] colNames = {"text"};
        FieldType[] colTypes = {FieldType.TEXT_TYPE};
        return ModuleUtils.createSortedDocSchema(new TupleSchema(colNames, colTypes));
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates a scan that uses the one-column schema, has no language override,
     * does not strip carriage returns, and is named "Document".
     */
    public DocScanInternal() {
        this.docTextGetters = new ArrayList<>();
        this.docSchema = createOneColumnSchema();
        this.docLangOverride = null;
        this.scanStarted = false;
        this.stripCR = false;
        setViewName("Document");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates a scan with the defaults of {@link #DocScanInternal()} but a caller-supplied schema.
     *
     * @param abstractTupleSchema schema of the document tuples to produce
     */
    public DocScanInternal(AbstractTupleSchema abstractTupleSchema) {
        this();
        this.docSchema = abstractTupleSchema;
    }

    /**
     * Starts the scan if necessary, then delegates to {@link #reallyCheckEndOfInput}
     * as long as the memoization table still reports more input.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @throws Exception whatever the subclass hooks throw
     */
    @Override // com.ibm.avatar.algebra.base.Operator
    public void checkEndOfInput(MemoizationTable memoizationTable) throws Exception {
        ensureScanStarted(memoizationTable);
        if (memoizationTable.haveMoreInput()) {
            reallyCheckEndOfInput(memoizationTable);
        }
    }

    /**
     * Subclass hook that performs the actual end-of-input check.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @throws Exception on any failure in the underlying source
     */
    protected abstract void reallyCheckEndOfInput(MemoizationTable memoizationTable) throws Exception;

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Returns the document schema as the output schema and, as a side effect,
     * records the setter and a getter for the text column.
     *
     * @return the schema stored in {@link #docSchema}
     */
    @Override // com.ibm.avatar.algebra.base.Operator
    public AbstractTupleSchema createOutputSchema() {
        AbstractTupleSchema schema = this.docSchema;
        String textCol = getTextColName();
        this.docTextAcc = schema.textSetter(textCol);
        this.docTextGetters.add(schema.textAcc(textCol));
        return schema;
    }

    /**
     * @return the name of the column that holds the document text ("text")
     */
    public String getTextColName() {
        return "text";
    }

    /**
     * Fetches the next document, applies the language override (if one is set),
     * re-checks end of input and emits the document as the result tuple.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @param tupleListArr input tuple lists; not read by this method
     * @throws IOException if the document text is not valid in its expected encoding;
     *         the original {@link CharConversionException} is attached as the cause
     * @throws Exception if the scan yields null although input has not ended, or on
     *         any failure in the subclass hooks
     */
    @Override // com.ibm.avatar.algebra.base.MultiInputOperator
    public void reallyEvaluate(MemoizationTable memoizationTable, TupleList[] tupleListArr) throws Exception {
        try {
            Tuple nextDoc = getNextDoc(memoizationTable);
            // Only touch the document when there is one: a null document must reach the
            // explicit null handling below instead of being handed to the field getters.
            if (null != nextDoc && null != this.docLangOverride) {
                applyLanguageOverride(nextDoc);
            }
            reallyCheckEndOfInput(memoizationTable);
            if (null != nextDoc) {
                addResultTup(nextDoc, memoizationTable);
            } else if (!memoizationTable.endOfInput()) {
                throw new Exception("DocScan received null instead of a document");
            }
        } catch (CharConversionException e) {
            // Keep the low-level exception as the cause so the failing position is not lost.
            throw new IOException("Input document text contains a byte sequence that does not conform to the UTF-8 standard.  Please convert the document to UTF-8 encoding.  The text editors vim and emacs can perform this conversion.", e);
        }
    }

    /** Sets the override language on every non-null text field of the given document. */
    private void applyLanguageOverride(Tuple doc) throws Exception {
        ArrayList<FieldGetter<Text>> getters = textGetters();
        if (null == getters) {
            return;
        }
        for (FieldGetter<Text> getter : getters) {
            Text text = getter.getVal(doc);
            if (null != text) {
                text.overrideLanguage(this.docLangOverride);
            }
        }
    }

    /**
     * @return true once the scan has been marked as started
     */
    protected boolean getScanStarted() {
        return this.scanStarted;
    }

    /**
     * Marks the scan as started.
     *
     * @throws IllegalStateException if the scan was already marked as started
     */
    public void setScanStarted() {
        if (this.scanStarted) {
            throw new IllegalStateException("Started scan twice");
        }
        this.scanStarted = true;
    }

    /**
     * @return whether {@link #getDocText} strips carriage returns from CR-LF pairs
     */
    protected boolean getStripCR() {
        return this.stripCR;
    }

    /**
     * @param z true to make {@link #getDocText} strip carriage returns from CR-LF pairs
     */
    public void setStripCR(boolean z) {
        this.stripCR = z;
    }

    /**
     * Starts the scan if necessary and returns an empty array of inputs.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @return a shared zero-length array
     * @throws Exception if starting the scan fails
     */
    @Override // com.ibm.avatar.algebra.base.MultiInputOperator
    protected TupleList[] prepareInputs(MemoizationTable memoizationTable) throws Exception {
        ensureScanStarted(memoizationTable);
        return EMPTY_LIST;
    }

    /** Runs {@link #startScan} and marks the scan as started, unless that already happened. */
    private void ensureScanStarted(MemoizationTable memoizationTable) throws Exception {
        if (!this.scanStarted) {
            startScan(memoizationTable);
            setScanStarted();
        }
    }

    /**
     * Subclass hook invoked once before the first document is requested.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @throws Exception on any failure opening the underlying source
     */
    protected abstract void startScan(MemoizationTable memoizationTable) throws Exception;

    /**
     * Subclass hook that produces the next document tuple.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @return the next document; {@link #reallyEvaluate} accepts null only when the
     *         memoization table reports end of input
     * @throws Exception on any failure reading the underlying source
     */
    protected abstract Tuple getNextDoc(MemoizationTable memoizationTable) throws Exception;

    /**
     * Returns the text-column getters, first calling {@code getOutputSchema()} —
     * presumably so that {@link #createOutputSchema()} has had a chance to register them.
     *
     * @return the live list stored in {@link #docTextGetters}
     */
    public final ArrayList<FieldGetter<Text>> textGetters() {
        getOutputSchema();
        return this.docTextGetters;
    }

    /**
     * Convenience accessor: evaluates the operator and returns the first result tuple.
     *
     * @param memoizationTable per-run state shared with the scan implementation
     * @return the element at index 0 of the list returned by {@code getNext}
     * @throws Exception on any evaluation failure
     */
    public Tuple getNextDocTup(MemoizationTable memoizationTable) throws Exception {
        return getNext(memoizationTable).getElemAtIndex(0);
    }

    /**
     * Creates a scan for the given file with no explicit schema, no external views
     * and ',' as the CSV separator.
     *
     * @param file file or directory to scan
     * @return a scan implementation chosen from the file name
     * @throws Exception see {@link #makeFileScan(File, TupleSchema, Map, char)}
     */
    public static DocScanInternal makeFileScan(File file) throws Exception {
        return makeFileScan(file, null, null, ',');
    }

    /**
     * Chooses and creates the scan implementation that matches the given file.
     *
     * <p>Selection order: regular {@code .json} file, regular {@code .csv} file,
     * directory, tar archive ({@code .tar.gz}, {@code .tgz}, {@code .tar}),
     * {@code .del}, {@code .zip}, then text-like files ({@code .txt}, {@code .html},
     * {@code .htm}, {@code .xhtml}, {@code .xml}). Matching is on the file name and
     * is case-sensitive.
     *
     * @param file file or directory to scan
     * @param tupleSchema document schema, or null to use the implementation's own default;
     *        mandatory for JSON, and restricted to the default schemas for anything
     *        that is neither JSON nor CSV
     * @param map passed through to the JSON scan only
     * @param c separator character passed through to the CSV scan only
     * @return the newly created scan
     * @throws TextAnalyticsException if a JSON file is given without a schema, or the
     *         schema is not supported for the input type
     * @throws UnrecognizedFileFormatException if no implementation matches the file name
     * @throws Exception on failures while reflectively creating the tar scan
     */
    public static DocScanInternal makeFileScan(File file, TupleSchema tupleSchema, Map<Pair<String, String>, TupleSchema> map, char c) throws Exception {
        String name = file.getName();
        boolean isJson = name.matches(".*\\.json");
        boolean isCsv = name.matches(".*\\.csv");
        boolean isText = name.matches(".*(\\.txt|\\.html|\\.htm|\\.xhtml|\\.xml)");
        boolean haveSchema = null != tupleSchema;

        if (!haveSchema) {
            if (isJson) {
                throw new TextAnalyticsException("Error while creating document reader for specified file '%s'. Document schema is required to create a JSON file reader. Specify a valid document schema.", file);
            }
        } else if (!isJson && !isCsv) {
            validateSchemaAsDefault(tupleSchema);
        }

        boolean isDirectory = file.isDirectory();
        if (!isDirectory && isJson) {
            return new JsonFileScan(file, tupleSchema, map);
        }
        if (!isDirectory && isCsv) {
            return new CsvFileScan(file, tupleSchema, c);
        }
        if (isDirectory) {
            return haveSchema ? new DirDocScan(file, tupleSchema) : new DirDocScan(file);
        }
        if (name.matches(".*(\\.tar\\.gz|\\.tgz|\\.tar)")) {
            return makeTarFileScan(file, tupleSchema);
        }
        if (name.matches(".*\\.del")) {
            return haveSchema ? new DBDumpFileScan(file, tupleSchema) : new DBDumpFileScan(file);
        }
        if (name.matches(".*\\.zip")) {
            return haveSchema ? new ZipFileScan(file, tupleSchema) : new ZipFileScan(file);
        }
        if (isText) {
            return haveSchema ? new TextFileScan(file, tupleSchema) : new TextFileScan(file);
        }
        throw new UnrecognizedFileFormatException(file);
    }

    /**
     * Reflectively instantiates the tar scan, picking the public constructor by its
     * parameter count (one argument without a schema, two with a schema).
     */
    private static DocScanInternal makeTarFileScan(File file, TupleSchema tupleSchema) throws Exception {
        final String className = "com.ibm.avatar.algebra.oldscan.TarFileScan";
        int wantedArity = null != tupleSchema ? 2 : 1;
        Constructor<?> chosen = null;
        for (Constructor<?> candidate : Class.forName(className).getConstructors()) {
            // As before, the last constructor with the matching arity wins.
            if (wantedArity == candidate.getParameterTypes().length) {
                chosen = candidate;
            }
        }
        if (null == chosen) {
            // Fail with a meaningful message instead of dereferencing a null constructor.
            throw new NoSuchMethodException(className + " has no public constructor taking " + wantedArity + " argument(s)");
        }
        if (null != tupleSchema) {
            return (DocScanInternal) chosen.newInstance(file, tupleSchema);
        }
        return (DocScanInternal) chosen.newInstance(file);
    }

    /**
     * @return the output schema, cast to {@link TupleSchema}
     */
    public TupleSchema getDocSchema() {
        return (TupleSchema) getOutputSchema();
    }

    /**
     * Sets the language that {@link #reallyEvaluate} applies to every document's text fields.
     *
     * @param langCode language to force, or null to leave the documents untouched
     */
    public void overrideLang(LangCode langCode) {
        this.docLangOverride = langCode;
    }

    /**
     * Verifies that the schema equals one of the default supported schemas.
     *
     * @param abstractTupleSchema schema requested by the caller
     * @throws TextAnalyticsException if the schema matches none of the defaults
     */
    private static void validateSchemaAsDefault(AbstractTupleSchema abstractTupleSchema) throws TextAnalyticsException {
        for (AbstractTupleSchema supported : defaultSupportedSchemas) {
            if (abstractTupleSchema.equals(supported)) {
                return;
            }
        }
        throw new TextAnalyticsException("Specified document schema '%s' is not supported for this input collection. The supported document schemas for non-JSON/non-CSV files are: %s.", abstractTupleSchema, Arrays.toString(defaultSupportedSchemas));
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Reads the reader to exhaustion and returns its content as a string.
     *
     * <p>When {@link #stripCR} is set, every carriage return that is immediately
     * followed by a line feed is dropped, including a pair that straddles two read
     * buffers. Carriage returns not followed by a line feed are kept.
     *
     * @param inputStreamReader source of the document text; not closed by this method
     * @return the (optionally CR-stripped) text
     * @throws IOException if reading fails
     */
    public String getDocText(InputStreamReader inputStreamReader) throws IOException {
        char[] buf = new char[10000];
        StringBuilder sb = new StringBuilder();
        int nread;

        if (!this.stripCR) {
            while (-1 != (nread = inputStreamReader.read(buf))) {
                sb.append(buf, 0, nread);
            }
            return sb.toString();
        }

        // True when the previous buffer ended in a CR whose fate depends on the
        // first character of the next buffer.
        boolean pendingCR = false;
        while ((nread = inputStreamReader.read(buf)) > 0) {
            if (pendingCR) {
                if (buf[0] != '\n') {
                    // The deferred CR was not part of a CR-LF pair, so it is kept.
                    sb.append('\r');
                }
                pendingCR = false;
            }
            for (int i = 0; i < nread; i++) {
                char ch = buf[i];
                if (ch == '\r') {
                    if (i == nread - 1) {
                        // Cannot see the following character yet; decide on the next read.
                        pendingCR = true;
                        continue;
                    }
                    if (buf[i + 1] == '\n') {
                        // CR of a CR-LF pair: drop it.
                        continue;
                    }
                }
                sb.append(ch);
            }
        }
        if (pendingCR) {
            // Input ended on a lone CR; keep it.
            sb.append('\r');
        }
        return sb.toString();
    }
}
