package com.ibm.avatar.algebra.extract;

import com.ibm.avatar.algebra.base.MemoizationTable;
import com.ibm.avatar.algebra.base.MultiOutputOperator;
import com.ibm.avatar.algebra.base.Operator;
import com.ibm.avatar.algebra.base.OutputBuffer;
import com.ibm.avatar.algebra.datamodel.AbstractTupleSchema;
import com.ibm.avatar.algebra.datamodel.FieldSetter;
import com.ibm.avatar.algebra.datamodel.FieldType;
import com.ibm.avatar.algebra.datamodel.Span;
import com.ibm.avatar.algebra.datamodel.SpanGetter;
import com.ibm.avatar.algebra.datamodel.TLIter;
import com.ibm.avatar.algebra.datamodel.Text;
import com.ibm.avatar.algebra.datamodel.TextSetter;
import com.ibm.avatar.algebra.datamodel.Tuple;
import com.ibm.avatar.algebra.datamodel.TupleList;
import com.ibm.avatar.algebra.datamodel.TupleSchema;
import com.ibm.avatar.algebra.util.html.HTMLDetagger;
import com.ibm.avatar.algebra.util.string.StringUtils;
import com.ibm.avatar.algebra.util.tokenize.OffsetsList;
import com.ibm.avatar.logging.Log;
import com.ibm.avatar.logging.MsgType;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;

/* loaded from: input_file:com/ibm/avatar/algebra/extract/Detag.class */
public class Detag extends MultiOutputOperator {
    public static final boolean DUMP_ON_PARSE_FAILURE = true;
    public static final String MATCH_PERF_COUNTER_NAME = "HTML Detagging";
    public static final String OUTPUT_COL_NAME = "match";
    public static final String DOC_COL_NAME = "text";
    public static boolean USE_HTMLPARSER = true;
    private final String col;
    private SpanGetter inputAcc;
    private final ArrayList<FieldSetter<Span>> tagOutputAcc;
    private final ArrayList<ArrayList<TextSetter>> attrOutputAcc;
    private TextSetter outputTextAcc;
    private final String[] tags;
    private final String[] tagTypes;
    private final String[][] attrs;
    private final String[][] attrLabels;
    private final String detaggedDocType;
    private final boolean checkForHTML;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/ibm/avatar/algebra/extract/Detag$parserWrapper.class */
    /**
     * Output buffer that carries the {@link HTMLDetagger} instance which
     * {@code detagger(MemoizationTable)} caches in the memoization table.
     */
    public class parserWrapper extends OutputBuffer {
        /** The cached detagger; stays null until assigned by the enclosing class, and is reset to null by {@link #close()}. */
        HTMLDetagger detagger;

        // Instances are only created by the enclosing Detag operator.
        private parserWrapper() {
        }

        /** Drops the reference to the wrapped detagger. */
        @Override // com.ibm.avatar.algebra.base.OutputBuffer
        public void close() {
            this.detagger = null;
        }
    }

    /**
     * Returns the HTML detagger associated with this operator in the given
     * memoization table, creating and caching it on first use.
     *
     * <p>When {@link #USE_HTMLPARSER} is false no detagger is instantiated: a
     * wrapper holding {@code null} is cached and {@code null} is returned.
     *
     * @param memoizationTable table in which the per-operator detagger is cached
     * @return the cached (or newly created) detagger; may be null, see above
     * @throws RuntimeException if the detagger class cannot be instantiated, or
     *         if a class it depends on is missing at runtime
     *         ({@link NoClassDefFoundError}); the original failure is attached
     *         as the cause in both cases
     */
    private HTMLDetagger detagger(MemoizationTable memoizationTable) {
        try {
            OutputBuffer cached = memoizationTable.getOutputBuf(this);
            if (cached != null) {
                return ((parserWrapper) cached).detagger;
            }

            parserWrapper wrapper = new parserWrapper();
            if (USE_HTMLPARSER) {
                try {
                    // The implementation is looked up by name and built through its first
                    // public constructor, which receives the tag names and attribute names.
                    wrapper.detagger = (HTMLDetagger) Class.forName("com.ibm.avatar.algebra.util.html.HTMLParserDetagger").getConstructors()[0].newInstance(this.tags, this.attrs);
                } catch (Exception e) {
                    throw new RuntimeException("Error instantiating HTML detagger", e);
                }
            }
            memoizationTable.cacheOutputBuf(this, wrapper);
            return wrapper.detagger;
        } catch (NoClassDefFoundError e2) {
            // NoClassDefFoundError is an Error, so the inner catch (Exception) does not see it.
            // Chain it as the cause so the stack trace travels with the exception instead of
            // being printed to stderr and then discarded.
            throw new RuntimeException("Sorry, detagging support not installed; please install the appropriate plugin.", e2);
        }
    }

    public Detag(String[] strArr, String[] strArr2, String[][] strArr3, String[][] strArr4, String str, String str2, boolean z, Operator operator) {
        super(operator, strArr.length + 1);
        this.tagOutputAcc = new ArrayList<>();
        this.attrOutputAcc = new ArrayList<>();
        this.tags = strArr;
        this.tagTypes = strArr2;
        this.attrs = strArr3;
        this.detaggedDocType = str;
        this.attrLabels = strArr4;
        this.col = str2;
        this.checkForHTML = z;
        for (int i = 0; i < strArr4.length; i++) {
            this.attrOutputAcc.add(null);
            this.tagOutputAcc.add(null);
        }
    }

    /**
     * Reports a failure of the HTML detagger on one input tuple.
     *
     * <p>Logs a warning, then writes the tuple, its OID, the stack trace of the
     * failure and the document text to a fresh temporary file ("detag*.txt")
     * and logs the location of that file.
     *
     * @param th    the failure raised while parsing
     * @param tuple the input tuple whose document was being parsed
     * @throws RuntimeException if the dump file cannot be created or opened
     *         (the {@link IOException} is the cause and {@code th} is attached
     *         as a suppressed exception), or, after a successful dump, if
     *         {@code th} is a {@link NoClassDefFoundError} (wrapped as the cause)
     */
    private void handleParseException(Throwable th, Tuple tuple) {
        Log.log(MsgType.AQLRuntimeWarning, "While parsing HTML for tuple with OID %s, caught exception: %s", tuple.getOid(), th);
        try {
            File dumpFile = File.createTempFile("detag", ".txt");
            // try-with-resources: the writer is closed even if formatting the tuple
            // or fetching the document text throws part-way through the dump.
            try (PrintWriter out = new PrintWriter(new FileWriter(dumpFile))) {
                out.print("This file contains a record of a tuple that crashed the HTML detagger.\n\n");
                out.printf("The input tuple was: %s\n", tuple);
                out.printf("The OID of the input tuple was: %s\n", tuple.getOid());
                out.print("Parsing generated the following exception:\n");
                th.printStackTrace(out);
                out.print("--------\n");
                out.printf("Document text was as follows:\n\n%s\n", this.inputAcc.getVal(tuple).getText());
            }
            Log.log(MsgType.AQLRuntimeWarning, "Dumped document information to file %s", dumpFile);
        } catch (IOException e) {
            // Keep both failures visible: the I/O problem as the cause and the
            // original parse failure as a suppressed exception.
            RuntimeException wrapped = new RuntimeException("IOException in error handling code", e);
            wrapped.addSuppressed(th);
            throw wrapped;
        }
        // A missing class is not a per-document problem, so it is escalated
        // to the caller instead of being treated as a recoverable parse failure.
        if (th instanceof NoClassDefFoundError) {
            throw new RuntimeException(th);
        }
    }

    /**
     * Builds the schema of output {@code i} and initializes the accessors that
     * read the input and write to that output.
     *
     * <p>Output 0 is a single TEXT column named "text". Output {@code i >= 1}
     * belongs to tag {@code i - 1}: one TEXT column per attribute label of that
     * tag, followed by a SPAN column named "match".
     *
     * @param i index of the output whose schema is requested
     * @return the newly created schema for output {@code i}
     */
    @Override // com.ibm.avatar.algebra.base.Tee
    protected AbstractTupleSchema createOutputSchema(int i) {
        AbstractTupleSchema inputSchema = this.child.getOutputSchema();

        if (i == 0) {
            // Detagged-document output; also the point where the input accessor is set up.
            TupleSchema docSchema = new TupleSchema("text", FieldType.TEXT_TYPE);
            docSchema.setName(this.detaggedDocType);
            docSchema.getFieldTypeByName("text").setSourceDocType(inputSchema.getFieldTypeByName(this.col));
            this.inputAcc = inputSchema.asSpanAcc(this.col);
            this.outputTextAcc = docSchema.textSetter("text");
            return docSchema;
        }

        // Per-tag output: attribute columns first, the matched span last.
        int tagIx = i - 1;
        String[] labels = this.attrLabels[tagIx];
        int attrCount = labels.length;

        String[] colNames = new String[attrCount + 1];
        FieldType[] colTypes = new FieldType[attrCount + 1];
        System.arraycopy(labels, 0, colNames, 0, attrCount);
        for (int c = 0; c < attrCount; c++) {
            colTypes[c] = FieldType.TEXT_TYPE;
        }
        colNames[attrCount] = "match";
        colTypes[attrCount] = FieldType.SPAN_TYPE;

        TupleSchema tagSchema = new TupleSchema(colNames, colTypes);
        tagSchema.setName(this.tagTypes[tagIx]);
        this.tagOutputAcc.set(tagIx, tagSchema.spanSetter("match"));

        ArrayList<TextSetter> attrSetters = new ArrayList<>();
        for (String label : labels) {
            attrSetters.add(tagSchema.textSetter(label));
        }
        this.attrOutputAcc.set(tagIx, attrSetters);
        return tagSchema;
    }

    /**
     * Heuristically decides whether a document is something other than
     * HTML/XML, looking only at its first 100 characters (lower-cased).
     *
     * <p>The document is treated as markup, and {@code false} is returned, when
     * that prefix
     * <ul>
     *   <li>contains {@code "<html"}, {@code "xml"}, {@code "<!doctype"} or
     *       {@code "<![cdata["}, or</li>
     *   <li>contains at least two {@code '<'} and at least two {@code '>'}
     *       characters.</li>
     * </ul>
     * Otherwise {@code true} is returned.
     *
     * @param str document text; must not be null
     * @return true if the start of the document does not look like HTML/XML
     * @throws NullPointerException if {@code str} is null
     */
    public static boolean isNonHTML(String str) {
        String prefix = str.substring(0, Math.min(100, str.length())).toLowerCase();

        // Well-known markers. The prefix is already lower-cased, so a single
        // lower-case needle per marker is sufficient.
        if (prefix.contains("<html") || prefix.contains("xml") || prefix.contains("<!doctype") || prefix.contains("<![cdata[")) {
            return false;
        }

        // No marker found: fall back to counting angle brackets.
        int openCount = 0;
        int closeCount = 0;
        for (int pos = 0; pos < prefix.length(); pos++) {
            char ch = prefix.charAt(pos);
            if (ch == '<') {
                openCount++;
            } else if (ch == '>') {
                closeCount++;
            }
            if (openCount >= 2 && closeCount >= 2) {
                return false;
            }
        }
        return true;
    }

    /**
     * Detags every input tuple and fills all outputs.
     *
     * <p>For each input tuple whose document span is non-null, one tuple
     * holding the document text goes to output 0. Then, for every tag output
     * {@code i >= 1}, one tuple per tag occurrence reported by the detagger is
     * emitted, holding the tag's span over the output-0 text and its attribute
     * values. Tuples with a null document span produce no output.
     *
     * <p>If {@code checkForHTML} is set and {@link #isNonHTML(String)} accepts
     * the document, the text is passed through unchanged and the detagger is
     * not run, so only the tags left after {@code clear()} are read back.
     * If the detagger throws, the failure is handed to
     * {@code handleParseException} and the detagger is cleared before its
     * results are read.
     *
     * @param memoizationTable table holding the cached detagger
     * @param tupleList        input tuples
     * @param tupleListArr     output tuple lists, indexed by output number
     * @throws Exception not thrown directly by this method; declared by the overridden signature
     */
    @Override // com.ibm.avatar.algebra.base.MultiOutputOperator
    protected void advanceAllInternal(MemoizationTable memoizationTable, TupleList tupleList, TupleList[] tupleListArr) throws Exception {
        HTMLDetagger detagger = detagger(memoizationTable);
        TLIter it = tupleList.iterator();
        while (it.hasNext()) {
            Tuple inputTup = it.next();
            Span docSpan = this.inputAcc.getVal(inputTup);
            if (docSpan == null) {
                continue;
            }

            String text = docSpan.getText();
            detagger.clear();

            Tuple docTup;
            Text outputText;
            if (this.checkForHTML && isNonHTML(text)) {
                // Pass-through: the original text is emitted as-is, so no
                // offsets table is needed (the previous code built an identity
                // table here and then never used it).
                docTup = createOutputTup(0);
                outputText = this.outputTextAcc.setVal(docTup, text, docSpan.getLanguage());
            } else {
                try {
                    detagger.detagStr(text);
                } catch (Throwable th) {
                    handleParseException(th, inputTup);
                    // Discard whatever partial state the failed parse left behind.
                    detagger.clear();
                }
                String detaggedText = detagger.getDetaggedText();
                int[] offsetsTable = detagger.getOffsetsTable();
                docTup = createOutputTup(0);
                outputText = this.outputTextAcc.setVal(docTup, detaggedText, docSpan.getDocTextObj(), offsetsTable);
            }
            tupleListArr[0].add(docTup);

            // Outputs 1..n correspond to tags 0..n-1.
            for (int outIx = 1; outIx < this.outputs.length; outIx++) {
                int tagIx = outIx - 1;
                OffsetsList tagOffsets = detagger.getTagOffsets(tagIx);
                ArrayList<String[]> tagAttrs = detagger.getTagAttrs(tagIx);
                for (int occ = 0; occ < tagOffsets.size(); occ++) {
                    Span tagSpan = Span.makeBaseSpan(outputText, tagOffsets.begin(occ), tagOffsets.end(occ));
                    Tuple tagTup = createOutputTup(outIx);
                    this.tagOutputAcc.get(tagIx).setVal(tagTup, tagSpan);
                    String[] attrVals = tagAttrs.get(occ);
                    for (int a = 0; a < this.attrs[tagIx].length; a++) {
                        this.attrOutputAcc.get(tagIx).get(a).setVal(tagTup, Text.convert(attrVals[a]));
                    }
                    tupleListArr[outIx].add(tagTup);
                }
            }
        }
    }
}
