/*
 * Decompiled with CFR 0.152.
 */
package org.apache.stanbol.enhancer.engines.htmlextractor.impl;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Determines the character encoding of a byte stream.
 *
 * <p>For the formats {@code xml} and {@code html} the encoding declared inside
 * the document itself (XML declaration or {@code <meta>} element) is looked up
 * first. If no format is given, or no declaration is found, the decision is
 * delegated to ICU's {@link CharsetDetector}.
 *
 * <p>All methods are static and keep no state between calls.
 */
public class CharsetRecognizer {
    private static final Logger LOG = LoggerFactory.getLogger(CharsetRecognizer.class);

    /** Read-ahead limit passed to {@link InputStream#mark(int)} while sniffing. */
    private static final int MARK_LIMIT = 4096;

    /** Number of leading bytes inspected for an XML declaration. */
    private static final int XML_PREFIX_BYTES = 80;

    /** Number of leading bytes inspected for an HTML {@code <meta>} charset. */
    private static final int HTML_PREFIX_BYTES = 2048;

    /**
     * Matches {@code encoding="..."} or {@code encoding='...'} (optional
     * whitespace around {@code =}); group 2 is the encoding name.
     */
    private static final Pattern XML_ENCODING =
            Pattern.compile("encoding\\s*=\\s*([\"'])(\\w[-\\w]+)\\1");

    /**
     * Matches a {@code charset=} assignment anywhere inside a single
     * {@code <meta ...>} tag, case-insensitively. This covers both
     * {@code <meta http-equiv=... content="text/html; charset=X">} (with or
     * without a trailing {@code /}) and {@code <meta charset="X">}; group 1 is
     * the charset name.
     */
    private static final Pattern HTML_META_CHARSET =
            Pattern.compile("<meta\\b[^>]*?\\bcharset\\s*=\\s*[\"']?(\\w[-\\w]+)",
                    Pattern.CASE_INSENSITIVE);

    /**
     * Returns the requested capture group of the first match of
     * {@code pattern} in {@code str}, or {@code null} if there is no match.
     */
    private static String checkPattern(String str, Pattern pattern, int group) {
        Matcher m = pattern.matcher(str);
        return m.find() ? m.group(group) : null;
    }

    /**
     * Reads up to {@code limit} leading bytes of {@code in}, decodes them as
     * US-ASCII and rewinds the stream to where it was.
     *
     * <p>Reading is repeated until {@code limit} bytes were obtained or the
     * stream ended, because a single {@code read} may return fewer bytes than
     * are available. An empty stream yields an empty string.
     */
    private static String readPrefix(InputStream in, int limit) throws IOException {
        byte[] buffer = new byte[limit];
        int total = 0;
        in.mark(MARK_LIMIT);
        while (total < limit) {
            int n = in.read(buffer, total, limit - total);
            if (n < 0) {
                break; // end of stream
            }
            total += n;
        }
        in.reset();
        return new String(buffer, 0, total, StandardCharsets.US_ASCII);
    }

    /**
     * Looks for an encoding declared inside the document.
     *
     * @param format {@code "xml"} or {@code "html"} (case-insensitive); any
     *               other value is ignored
     * @param in     stream supporting mark/reset; its position is unchanged
     *               on return
     * @return the declared encoding in upper case; {@code "UTF-8"} for XML
     *         without a declaration; {@code null} otherwise
     * @throws IOException if reading or resetting the stream fails
     */
    private static String checkFormat(String format, InputStream in) throws IOException {
        String declared = null;
        String fallback = null;
        if ("xml".equalsIgnoreCase(format)) {
            fallback = "UTF-8";
            declared = checkPattern(readPrefix(in, XML_PREFIX_BYTES), XML_ENCODING, 2);
        } else if ("html".equalsIgnoreCase(format)) {
            declared = checkPattern(readPrefix(in, HTML_PREFIX_BYTES), HTML_META_CHARSET, 1);
        }
        if (declared == null) {
            return fallback;
        }
        // Locale.ROOT keeps the result independent of the default locale
        // (e.g. Turkish casing would turn "iso" into "İSO").
        declared = declared.toUpperCase(Locale.ROOT);
        LOG.debug("{} encoding: {}", format.toUpperCase(Locale.ROOT), declared);
        return declared;
    }

    /**
     * Detects the encoding of {@code in} without any format or encoding hint.
     *
     * @see #detect(InputStream, String, String)
     */
    public static String detect(InputStream in) throws IOException {
        return CharsetRecognizer.detect(in, null, null);
    }

    /**
     * Detects the encoding of the given stream.
     *
     * @param in       stream to inspect; must support mark/reset
     * @param format   optional format hint, {@code "xml"} or {@code "html"};
     *                 may be {@code null}
     * @param encoding optional declared encoding handed to the ICU detector as
     *                 a hint; may be {@code null}
     * @return the encoding name, or {@code null} if the ICU detector reports
     *         no match
     * @throws IOException if the stream does not support mark/reset or cannot
     *                     be read
     */
    public static String detect(InputStream in, String format, String encoding) throws IOException {
        if (!in.markSupported()) {
            throw new IOException("Mark not supported by input stream");
        }
        if (format != null) {
            String declared = checkFormat(format, in);
            if (declared != null) {
                return declared;
            }
        }
        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        if (found == null) {
            // The detector has no candidate; report that instead of failing
            // with a NullPointerException.
            LOG.debug("Encoding: no match");
            return null;
        }
        String result = found.getName();
        LOG.debug("Encoding: {}", result);
        return result;
    }

    /**
     * Command line entry point:
     * {@code [-f format] [-e encoding] file...}. Prints the detected encoding
     * of every file; files that cannot be read are logged and skipped.
     */
    public static void main(String[] args) {
        String format = null;
        String encoding = null;
        int argv = 0;
        while (argv < args.length && args[argv].startsWith("-")) {
            String option = args[argv].substring(1);
            boolean formatOption = option.startsWith("f");
            if (!formatOption && !option.startsWith("e")) {
                System.err.println("illegal option: " + option);
                System.exit(1);
            }
            if (argv + 1 >= args.length) {
                System.err.println("missing value for option: " + option);
                System.exit(1);
            }
            if (formatOption) {
                format = args[argv + 1];
            } else {
                encoding = args[argv + 1];
            }
            argv += 2; // skip the option and its value
        }
        for (int i = argv; i < args.length; ++i) {
            try (InputStream fstream = new BufferedInputStream(new FileInputStream(args[i]))) {
                String found = CharsetRecognizer.detect(fstream, format, encoding);
                System.out.println("Encoding: " + found + ": " + args[i]);
            } catch (IOException e) {
                LOG.error("Cannot detect encoding of " + args[i], e);
            }
        }
    }
}

