package com.ibm.avatar.algebra.util.html;

import com.ibm.avatar.algebra.extract.Detag;
import com.ibm.avatar.algebra.util.tokenize.BaseOffsetsList;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Vector;
import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.Translate;

/* loaded from: input_file:com/ibm/avatar/algebra/util/html/HTMLParserDetagger.class */
public class HTMLParserDetagger extends HTMLDetagger {
    /** The NUL character; the methods in this class compare against the literal 0 instead. */
    private static final char NULL_CHAR = 0;
    /** Debug switch; fixed to {@code false}. */
    public static final boolean debug = false;
    /** Entity length limit; the entity scan in {@code handleText} uses the literal 16. */
    private static final int MAX_ENTITY_LEN = 16;
    /** Fixed to {@code false}; no code in this class reads it. */
    private static final boolean EXCEPTION_ON_UNKNOWN_ENTITY = false;
    /** htmlparser lexer; reset and pointed at a fresh {@code Page} by every {@code reallyDetag} run. */
    private final Lexer lexer;
    /** Index of the next slot to fill in {@code offsetsBuf}, i.e. number of detagged characters emitted so far. */
    private int detagOffset;
    /** Last character seen by {@code handleText} (raw HTML character, or ' ' after an inserted space); 0 before any text. */
    private char lastDetagChar;
    /** Set after every tag by {@code ensureWhiteSpace}; tells {@code handleText} to separate text runs with a space. */
    private boolean mustHaveWhitespace;
    /** HTML offset recorded in {@code offsetsBuf} for a space inserted because of {@link #mustHaveWhitespace}. */
    private int whitespaceHTMLOffset;
    /** True between an opening and a closing SCRIPT/STYLE tag; text nodes are dropped while it is set. */
    private boolean amSkippingText;
    /** True after a CDATA opening has been seen and until a "]]>" terminator is found in a text node. */
    private boolean cdataMode;
    /** Length of the tag-name array given to the constructor. */
    private final int numTags;
    /** Prefix of an XML declaration; {@code handleText} checks for the same literal. */
    private final String XML_HEADER_MARKUP = "<?xml";

    /**
     * Creates a detagger backed by a fresh htmlparser {@link Lexer}.
     *
     * @param strArr  passed to the superclass constructor; its length is kept as {@code numTags}
     *                (presumably the names of the tags to annotate)
     * @param strArr2 passed to the superclass constructor (presumably the attribute names to
     *                capture for each tag)
     */
    public HTMLParserDetagger(String[] strArr, String[][] strArr2) {
        super(strArr, strArr2);
        this.lexer = new Lexer();
        this.lastDetagChar = (char) 0;
        this.mustHaveWhitespace = false;
        this.whitespaceHTMLOffset = -1;
        this.amSkippingText = false;
        this.cdataMode = false;
        // XML_HEADER_MARKUP is a final field with an initializer at its declaration, so it
        // must not be assigned again here (doing so does not compile).
        this.numTags = strArr.length;
    }

    /**
     * Reads the whole stream as UTF-8, strips HTML/XML markup from it and returns the
     * resulting text encoded as UTF-8.
     *
     * <p>The stream is always closed before this method returns or throws.
     *
     * @param inputStream source of the document; consumed completely and closed
     * @param z           when {@code true}, content that {@code Detag.isNonHTML} classifies as
     *                    non-HTML is copied through without detagging
     * @return a buffer holding the detagged (or passed-through) text as UTF-8 bytes
     * @throws IOException if reading fails, or propagated from detagging
     */
    @Deprecated
    public static ByteArrayOutputStream detagFile(InputStream inputStream, boolean z) throws IOException {
        Charset utf8 = Charset.forName("UTF-8");
        HTMLParserDetagger detagger = new HTMLParserDetagger(new String[0], new String[0][0]);
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, utf8));
        String content;
        try {
            // Decode through the reader rather than per byte chunk: a multi-byte UTF-8
            // sequence may straddle a chunk boundary and would be corrupted otherwise.
            StringBuilder collected = new StringBuilder();
            char[] chunk = new char[8192];
            int read;
            while ((read = reader.read(chunk)) >= 0) {
                collected.append(chunk, 0, read);
            }
            content = collected.toString();
        } catch (Throwable th) {
            String message = String.format("While parsing HTML/XML, caught exception: %s", th);
            detagger.clear();
            throw new IOException(message, th);
        } finally {
            // Close only once all input has been consumed (or reading has failed).
            reader.close();
            inputStream.close();
        }
        if (z && Detag.isNonHTML(content)) {
            result.write(content.getBytes(utf8));
        } else {
            detagger.detagStr(content);
            result.write(detagger.getDetaggedText().getBytes(utf8));
        }
        return result;
    }

    /**
     * Walks the lexer's node stream for the current {@code html} and dispatches every node:
     * text goes to {@code handleText}, tags to {@code handleTagBegin}/{@code handleTagEnd},
     * remarks (comments) are ignored. When the stream is exhausted the still-open tags are
     * drained at the final detagged offset.
     *
     * @throws Exception on lexer failures; an {@link IOException} for an unrecognised node kind
     */
    @Override // com.ibm.avatar.algebra.util.html.HTMLDetagger
    protected void reallyDetag() throws Exception {
        // Per-run state.
        this.detagOffset = 0;
        this.lastDetagChar = (char) 0;
        Page page = new Page(this.html);
        this.lexer.reset();
        this.lexer.setPage(page);
        this.amSkippingText = false;

        for (Node node = this.lexer.nextNode(false); node != null; node = this.lexer.nextNode(false)) {
            if (node instanceof Text) {
                handleText((Text) node);
                continue;
            }
            if (!(node instanceof Tag)) {
                if (node instanceof Remark) {
                    continue;
                }
                throw new IOException("Don't know how to handle node: " + node.toString());
            }

            Tag tag = (Tag) node;
            // Self-closing XML tags and <BR> open and close in one step.
            boolean opensAndCloses = tag.isEmptyXmlTag() || tag.getTagName().equalsIgnoreCase("BR");
            if (opensAndCloses) {
                handleTagBegin(tag);
                handleTagEnd(tag);
            } else if (tag.isEndTag()) {
                handleTagEnd(tag);
            } else {
                handleTagBegin(tag);
            }
        }

        drainTagStack(this.detagOffset);
    }

    /**
     * Copies the characters of a text node into the detagged buffer {@code sb}, decoding
     * character entities on the way, and records in {@code offsetsBuf} the HTML offset each
     * emitted character came from.
     *
     * <p>Nothing is emitted for XML declarations, for text seen while {@code amSkippingText}
     * is set, for empty nodes, or when {@code handleCdataText} reports it consumed the node.
     *
     * @param text text node whose start/end positions index into {@code html}
     */
    private void handleText(Text text) {
        int startPosition;
        int endPosition;
        // XML declaration or text inside SCRIPT/STYLE: drop it.
        if (text.getText().startsWith("<?xml") || this.amSkippingText) {
            return;
        }
        // In CDATA mode the CDATA handler gets first refusal; also skip zero-length nodes.
        if ((this.cdataMode && handleCdataText(text)) || (startPosition = text.getStartPosition()) == (endPosition = text.getEndPosition())) {
            return;
        }
        // A tag was seen since the previous text: insert one separating space unless either
        // side already is whitespace or nothing has been emitted yet (lastDetagChar == 0).
        if (this.mustHaveWhitespace) {
            this.mustHaveWhitespace = false;
            if (false == Character.isWhitespace(this.lastDetagChar) && this.lastDetagChar != 0 && false == Character.isWhitespace(this.html.charAt(startPosition))) {
                this.sb.append(' ');
                // The synthetic space maps back to the position of the tag that caused it.
                this.offsetsBuf[this.detagOffset] = this.whitespaceHTMLOffset;
                this.detagOffset++;
                this.lastDetagChar = ' ';
                fixTagStack();
            }
        }
        int i = endPosition - startPosition;
        int i2 = 0;
        while (i2 < i) {
            char charAt = this.html.charAt(startPosition + i2);
            // z becomes true once an entity has been decoded and emitted for this position.
            boolean z = false;
            // NOTE(review): the second half of this condition is always true here, because
            // charAt was just read from that very index.
            if ('&' == charAt && startPosition + i2 < this.html.length()) {
                // Scan forward from the '&' for a terminating ';'. c is the character at the
                // scan position, charAt2/c2 a one-character lookahead (0 near the end of html).
                int i3 = i2;
                char c = 0;
                char charAt2 = (startPosition + i3) + 1 < this.html.length() - 1 ? this.html.charAt(startPosition + i3 + 1) : (char) 0;
                while (true) {
                    char c2 = charAt2;
                    // Stop after 16 characters, at ';', before another '&', at a space
                    // character, or close to the end of the document.
                    if (i3 >= i2 + 16 || c == ';' || c2 == '&' || false != Character.isSpaceChar(c) || startPosition + i3 >= this.html.length() - 1) {
                        break;
                    }
                    i3++;
                    c = this.html.charAt(startPosition + i3);
                    charAt2 = (startPosition + i3) + 1 < this.html.length() - 1 ? this.html.charAt(startPosition + i3 + 1) : (char) 0;
                }
                if (';' == c) {
                    // substring is the full entity text, '&' through ';' inclusive.
                    String substring = this.html.substring(startPosition + i2, startPosition + i3 + 1);
                    // Non-breaking space variants become a plain space, &apos; is handled
                    // explicitly, everything else is delegated to htmlparser's Translate.
                    String decode = ("&nbsp;".equals(substring) || "&#160;".equals(substring) || "&#0160;".equals(substring)) ? " " : "&apos;".equals(substring) ? "'" : Translate.decode(substring);
                    // Fallback for short numeric references that came back with unchanged length.
                    if (decode.length() == substring.length() && substring.length() > 3 && substring.length() < 7 && '#' == substring.charAt(1)) {
                        char c3 = 0;
                        if ('0' == substring.charAt(2)) {
                            // '\b' has the value 8: digits after the leading '0' are read as octal.
                            for (int i4 = 3; i4 < substring.length() - 1; i4++) {
                                c3 = (char) (((char) (c3 * '\b')) + (substring.charAt(i4) - '0'));
                            }
                        } else {
                            // '\n' has the value 10: digits are read as decimal.
                            // NOTE(review): the loop starts at index 3, so the digit at index 2
                            // is not included in the value - confirm this is intended.
                            for (int i5 = 3; i5 < substring.length() - 1; i5++) {
                                c3 = (char) (((char) (c3 * '\n')) + (substring.charAt(i5) - '0'));
                            }
                        }
                        decode = new String(new char[]{c3});
                    }
                    if (1 == decode.length()) {
                        // Emit the decoded character, mapped to the offset of the '&', and
                        // jump to the ';' (the i2++ below moves past it).
                        this.sb.append(decode.charAt(0));
                        this.offsetsBuf[this.detagOffset] = startPosition + i2;
                        this.detagOffset++;
                        i2 = i3;
                        z = true;
                    } else if (decode.length() != substring.length()) {
                        // Intentionally empty: a multi-character decoding is not emitted here;
                        // the '&' falls through and is copied literally below.
                    }
                }
            }
            if (false == z) {
                // Plain character: U+00A0 (160) is normalised to an ordinary space.
                if (charAt == 160) {
                    this.sb.append(' ');
                } else {
                    this.sb.append(charAt);
                }
                this.offsetsBuf[this.detagOffset] = startPosition + i2;
                this.detagOffset++;
            }
            i2++;
        }
        // Remembers the last raw HTML character of the node (for a trailing entity this is
        // its ';', not the decoded character).
        this.lastDetagChar = this.html.charAt(endPosition - 1);
    }

    /**
     * Handles a text node seen while in CDATA mode. If the node contains the CDATA
     * terminator "]]>", the part before it and the part after it are re-submitted to
     * {@code handleText} as separate nodes (the terminator itself is dropped) and CDATA
     * mode is switched off in between.
     *
     * @param text text node encountered in CDATA mode
     * @return {@code true} if the terminator was found and the node has been fully handled
     *         here; {@code false} if the caller should process the node as ordinary text
     */
    private boolean handleCdataText(Text text) {
        final String raw = text.getText();
        final int terminatorAt = raw.indexOf("]]>");
        if (-1 == terminatorAt) {
            return false;
        }

        // Content preceding the terminator is still CDATA content.
        if (terminatorAt > 0) {
            String before = raw.substring(0, terminatorAt);
            TextNode beforeNode = new TextNode(before);
            beforeNode.setStartPosition(text.getStartPosition());
            beforeNode.setEndPosition(beforeNode.getStartPosition() + before.length());
            handleText(beforeNode);
        }

        this.cdataMode = false;

        // Content following the terminator is ordinary text again.
        // NOTE(review): with "- 1" a remainder of exactly one character is not emitted;
        // confirm whether that is intended.
        final int resumeAt = terminatorAt + "]]>".length();
        if (resumeAt < raw.length() - 1) {
            String after = raw.substring(resumeAt);
            TextNode afterNode = new TextNode(after);
            afterNode.setStartPosition(text.getStartPosition() + resumeAt);
            afterNode.setEndPosition(afterNode.getStartPosition() + after.length());
            handleText(afterNode);
        }
        return true;
    }

    /**
     * Called right after a synthetic space has been emitted: lets every tracked tag shift
     * entries recorded at the space's position ({@code detagOffset - 1}) to the position
     * after it ({@code detagOffset}), and copies the HTML offset forward when a popped
     * empty-tag entry was updated.
     */
    private void fixTagStack() {
        final int spaceAt = this.detagOffset - 1;
        final int afterSpace = this.detagOffset;
        for (int tagIx = 0; tagIx < this.numTags; tagIx++) {
            this.tagBegins[tagIx].correctOffset(spaceAt, afterSpace);
            BaseOffsetsList recorded = (BaseOffsetsList) getTagOffsets(tagIx);
            boolean touchedEmptyEntry = recorded.updatePoppedEmptyTagEntries(spaceAt, afterSpace);
            if (touchedEmptyEntry) {
                this.offsetsBuf[afterSpace] = this.offsetsBuf[spaceAt];
            }
        }
    }

    /**
     * Processes an opening (or self-closing) tag.
     *
     * <ul>
     *   <li>SCRIPT/STYLE switch on text skipping.</li>
     *   <li>A CDATA opening switches on CDATA mode and feeds the text carried by the tag
     *       to {@code handleText}; nothing else is done for it.</li>
     *   <li>Otherwise, when tag annotation is enabled, every configured tag whose name
     *       matches is pushed together with the values of its configured attributes.</li>
     * </ul>
     * Finally a whitespace separator is requested at the tag's start position.
     *
     * @param tag the tag node produced by the lexer
     */
    private void handleTagBegin(Tag tag) {
        String tagName = tag.getTagName();

        // True only for the SCRIPT/STYLE tag that turns skipping on, so that this tag itself
        // can still be annotated below even though skipping is now active.
        boolean startedSkipping = false;
        if ("SCRIPT".equalsIgnoreCase(tagName) || "STYLE".equalsIgnoreCase(tagName)) {
            startedSkipping = !this.amSkippingText;
            this.amSkippingText = true;
        }

        if (!this.amSkippingText && tagName.startsWith("![CDATA[")) {
            String cdataBody = tag.getText().substring("![CDATA[".length());
            if (cdataBody.endsWith("]]")) {
                int endPosition = tag.getEndPosition();
                // The tag text ends at "]]"; restore the '>' so that the terminator "]]>"
                // can be recognised by the CDATA text handling.
                if (endPosition < this.html.length() && '>' == this.html.charAt(endPosition - 1)) {
                    cdataBody = cdataBody + ">";
                }
            }
            this.cdataMode = true;
            TextNode cdataNode = new TextNode(cdataBody);
            // +1 skips the '<' in front of "![CDATA[".
            cdataNode.setStartPosition(tag.getStartPosition() + "![CDATA[".length() + 1);
            cdataNode.setEndPosition(cdataNode.getStartPosition() + cdataBody.length());
            handleText(cdataNode);
            return;
        }

        if ((startedSkipping || !this.amSkippingText) && getAnnotateTags()) {
            for (int tagIx = 0; tagIx < this.tags.length; tagIx++) {
                if (!this.tags[tagIx].equalsIgnoreCase(tagName)) {
                    continue;
                }
                String[] wanted = this.attrs[tagIx];
                String[] values = null;
                if (wanted.length > 0) {
                    values = new String[wanted.length];
                    for (int attrIx = 0; attrIx < wanted.length; attrIx++) {
                        String attrName = wanted[attrIx];
                        String attrValue = null;
                        if (attrName.equalsIgnoreCase(tagName)) {
                            // The attribute is named like the tag itself. Search the raw
                            // attribute list from index 1 (presumably because element 0 of
                            // getAttributesEx() is the tag name - verify against htmlparser).
                            Vector<?> rawAttributes = tag.getAttributesEx();
                            if (1 == rawAttributes.size()) {
                                // No real attributes: stop collecting; remaining values stay null.
                                break;
                            }
                            for (int rawIx = 1; rawIx < rawAttributes.size(); rawIx++) {
                                Object candidate = rawAttributes.get(rawIx);
                                if (candidate instanceof Attribute) {
                                    Attribute attribute = (Attribute) candidate;
                                    if (attrName.equalsIgnoreCase(attribute.getName())) {
                                        attrValue = attribute.getValue();
                                        break;
                                    }
                                }
                            }
                        } else {
                            attrValue = tag.getAttribute(attrName);
                        }
                        values[attrIx] = attrValue;
                    }
                }
                pushTag(tagIx, this.detagOffset, tag.getStartPosition(), values);
            }
        }
        ensureWhiteSpace(tag.getStartPosition());
    }

    /**
     * Processes a closing (or self-closing) tag: a closing SCRIPT/STYLE ends text skipping,
     * matching open tags are popped when tag annotation is enabled, and a whitespace
     * separator is requested at the tag's last character.
     *
     * @param tag the tag node produced by the lexer
     */
    private void handleTagEnd(Tag tag) {
        final String closingName = tag.getTagName();

        boolean isSkippableElement = "SCRIPT".equalsIgnoreCase(closingName) || "STYLE".equalsIgnoreCase(closingName);
        if (this.amSkippingText && isSkippableElement) {
            this.amSkippingText = false;
        }

        if (!this.amSkippingText && getAnnotateTags()) {
            for (int tagIx = 0; tagIx < this.tags.length; tagIx++) {
                boolean closesOpenTag = this.tags[tagIx].equalsIgnoreCase(closingName) && peekTag(tagIx);
                if (!closesOpenTag) {
                    continue;
                }
                // Appears to detect an element without content (self-closing, or the end tag
                // directly following its begin tag); such an element gets an explicit HTML
                // offset for the current detagged position before it is popped.
                boolean looksEmpty = tag.isEmptyXmlTag()
                        || this.tagBegins[tagIx].peekHtmlOffset() + tag.getText().length() == tag.getStartPosition() - 1;
                if (looksEmpty) {
                    this.offsetsBuf[this.detagOffset] = tag.getStartPosition();
                }
                popTag(tagIx, this.detagOffset);
            }
        }

        ensureWhiteSpace(tag.getEndPosition() - 1);
    }

    /**
     * Requests that the next emitted text be separated from the previous one by whitespace.
     *
     * @param i HTML offset to associate with the separating space, should one be inserted
     */
    private void ensureWhiteSpace(int i) {
        this.whitespaceHTMLOffset = i;
        this.mustHaveWhitespace = true;
    }
}
