/*
 * Decompiled with CFR 0.152.
 */
package org.apache.stanbol.enhancer.engines.htmlextractor.impl;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.CharsetRecognizer;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionComponent;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

/**
 * Runs the HTML extraction components held by an {@link HtmlExtractionRegistry}
 * over an HTML input stream.
 *
 * <p>The input is parsed into a DOM by an {@link HtmlParser}; every active
 * extractor listed by the registry is then invoked with that DOM and the
 * caller-supplied result graph.</p>
 */
public class HtmlExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(HtmlExtractor.class);

    /** Configuration name handed to the registry by the no-argument constructor. */
    public static String DEFAULT_CONFIGURATION = "htmlextractors.xml";

    private HtmlParser htmlParser;

    /**
     * Registry supplying the extraction components. Stays {@code null} when the
     * no-argument constructor fails to initialize it; {@link #extract} is then a no-op.
     */
    public HtmlExtractionRegistry registry = null;

    /**
     * Creates an extractor configured from {@link #DEFAULT_CONFIGURATION}.
     *
     * <p>Initialization failures are logged rather than thrown, so the instance is
     * still constructed but {@link #registry} remains {@code null}.</p>
     */
    public HtmlExtractor() {
        try {
            this.htmlParser = new HtmlParser();
            this.registry = new HtmlExtractionRegistry(DEFAULT_CONFIGURATION);
        }
        catch (InitializationException e) {
            // Deliberately best-effort: keep the instance usable as a no-op, but
            // log the full exception so the root cause is not lost.
            LOG.error("Registry Initialization Error: " + e.getMessage(), e);
        }
    }

    /**
     * Creates an extractor from an already configured registry and parser.
     *
     * @param registry registry providing the extraction components
     * @param parser   parser used to turn the input into a DOM
     */
    public HtmlExtractor(HtmlExtractionRegistry registry, HtmlParser parser) {
        this.registry = registry;
        this.htmlParser = parser;
    }

    /**
     * Creates an extractor whose registry is built from the given configuration.
     *
     * @param configFileName configuration name passed to the registry constructor
     * @throws InitializationException if the registry cannot be initialized
     */
    public HtmlExtractor(String configFileName) throws InitializationException {
        this.htmlParser = new HtmlParser();
        this.registry = new HtmlExtractionRegistry(configFileName);
    }

    /**
     * Parses {@code input} and passes the resulting DOM to every active extractor
     * of the registry together with {@code result}.
     *
     * <p>Returns immediately without touching {@code input} or {@code result} when
     * no registry is available. The stream is not closed by this method; the caller
     * remains responsible for it.</p>
     *
     * @param id       identifier handed through to each extractor
     * @param input    HTML content; wrapped in a {@link BufferedInputStream} for charset
     *                 detection when it does not support mark/reset
     * @param charset  encoding of {@code input}, or {@code null} to detect it via
     *                 {@link CharsetRecognizer} with {@code "UTF-8"} passed as third argument
     * @param mimeType currently unused by this method
     * @param result   graph handed to each extractor; growth of its size is logged at debug level
     * @throws ExtractorException if charset detection fails with an I/O error
     */
    public void extract(String id, InputStream input, Charset charset, String mimeType, MGraph result) throws ExtractorException {
        if (this.registry == null) {
            return;
        }
        String encoding;
        if (charset == null) {
            // The detector is given a mark-capable stream so the same stream can
            // still be handed to the parser afterwards.
            if (!input.markSupported()) {
                input = new BufferedInputStream(input);
            }
            try {
                encoding = CharsetRecognizer.detect(input, "html", "UTF-8");
            }
            catch (IOException e) {
                LOG.error("Charset detection problem: " + e.getMessage(), e);
                ExtractorException failure = new ExtractorException("Charset detection problem: " + e.getMessage());
                // Attach the I/O error as cause; only the message constructor is
                // known to exist from this file, hence initCause.
                failure.initCause(e);
                throw failure;
            }
        } else {
            encoding = charset.name();
        }
        Document doc = this.htmlParser.getDOM(input, encoding);
        HashMap<String, HtmlExtractionComponent> extractors = this.registry.getRegistry();
        long modelSize = result.size();
        for (String s : this.registry.getActiveExtractors()) {
            LOG.debug("Extractor: {}", s);
            HtmlExtractionComponent extractor = extractors.get(s);
            if (extractor == null) {
                // Active name without a registered component: nothing to run.
                continue;
            }
            extractor.extract(id, doc, null, result);
            long tmpSize = result.size();
            if (tmpSize > modelSize) {
                LOG.debug("{} Statements added: {}", tmpSize - modelSize, s);
                modelSize = tmpSize;
            }
        }
    }

    /**
     * Command-line entry point: runs the default extractor over each file named in
     * {@code args}, treating every file as UTF-8 encoded {@code text/html}.
     *
     * @param args paths of the HTML files to process
     * @throws Exception if a file cannot be opened or closed, or extraction fails
     */
    public static void main(String[] args) throws Exception {
        HtmlExtractor inst = new HtmlExtractor();
        Charset charset = Charset.forName("UTF-8");
        String mimeType = "text/html";
        for (String arg : args) {
            File file = new File(arg);
            UriRef uri = new UriRef(file.toURI().toString());
            MGraph container = new SimpleMGraph();
            InputStream input = new FileInputStream(file);
            try {
                inst.extract(uri.getUnicodeString(), input, charset, mimeType, container);
            }
            finally {
                // Release the file handle even when extraction throws.
                input.close();
            }
            System.out.println("Model for " + arg);
            System.out.println();
        }
    }
}

