/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.dumpcheck;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.regex.Pattern;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.dev.dumpcheck.WikipediaSentenceExtractor;
import org.languagetool.dev.dumpcheck.WikipediaSentenceSource;

/**
 * Command-line tool that scans the sentences of a Wikipedia XML dump for runs of two or more
 * consecutive capitalized Greek words ("specific case expressions"), counts how often each run
 * occurs, and writes the most frequent ones to a text file, one expression per line.
 *
 * <p>The letter matching is Greek-specific (see {@link #GREEK_LETTERS}) and {@link #main} always
 * uses the language with short code {@code "el"}.
 */
class WikipediaSpecificCaseExpressionExtractor {
    /** Maximum number of expressions written to the output file. */
    private static final int NUMBER_OF_EXPRESSIONS = 100;

    /**
     * Body of the regex character class shared by all patterns below: lowercase {@code α-ω},
     * uppercase {@code Α-Ω}, and the accented / diaeresis lowercase vowels. Accented capitals
     * (e.g. U+0386) are not part of this set.
     */
    private static final String GREEK_LETTERS =
            "\u03b1-\u03c9\u0391-\u03a9\u03af\u03ca\u0390\u03cc\u03ac\u03ad\u03cd\u03cb\u03b0\u03ae\u03ce";

    /** Matches a token that contains at least one Greek letter or whitespace character. */
    private static final Pattern CONTAINS_LETTER = Pattern.compile(".*[" + GREEK_LETTERS + "\\s].*");

    /** Matches the leading and trailing runs of characters that are not Greek letters, {@code '} or whitespace. */
    private static final Pattern NON_LETTER_EDGES =
            Pattern.compile("^[^" + GREEK_LETTERS + "'\\s]+|[^" + GREEK_LETTERS + "'\\s]+$");

    /** Matches a sentence whose first character is a Greek letter or whitespace. */
    private static final Pattern SENTENCE_START = Pattern.compile("^[" + GREEK_LETTERS + "\\s].*$");

    /**
     * Occurrence count per detected expression. Declared as {@code HashMap} (not {@code Map})
     * because {@link #sortByValue(HashMap)} takes that concrete type.
     */
    private final HashMap<String, Integer> specificCaseExpressionsCounter = new HashMap<>();

    WikipediaSpecificCaseExpressionExtractor() {
    }

    /**
     * Reads every sentence of the dump, counts the expressions found, and writes the
     * {@value #NUMBER_OF_EXPRESSIONS} most frequent ones to {@code outputFile} as UTF-8.
     *
     * @param language    language handed to {@code WikipediaSentenceSource}
     * @param xmlDumpPath path of the dump; must end in {@code .xml} or {@code .bz2}
     * @param outputFile  path of the file to (over)write
     * @throws IllegalArgumentException if {@code xmlDumpPath} has neither of the two extensions
     * @throws IOException              if reading the dump or writing the output fails
     * @throws CompressorException      if the compressed stream cannot be opened
     */
    private void extractSpecificCaseExpressions(Language language, String xmlDumpPath, String outputFile) throws IOException, CompressorException {
        // The output is Greek text, so the charset is fixed to UTF-8 rather than left to the platform default.
        try (FileInputStream fis = new FileInputStream(xmlDumpPath);
             BufferedInputStream bis = new BufferedInputStream(fis);
             Writer writer = new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8)) {
            InputStream input;
            if (xmlDumpPath.endsWith(".bz2")) {
                input = new CompressorStreamFactory().createCompressorInputStream(bis);
            } else if (xmlDumpPath.endsWith(".xml")) {
                input = bis;
            } else {
                throw new IllegalArgumentException("Unknown file name, expected '.xml' or '.bz2': " + xmlDumpPath);
            }
            WikipediaSentenceSource source = new WikipediaSentenceSource(input, language);
            while (source.hasNext()) {
                String sentence = source.next().getText();
                if (!skipSentence(sentence)) {
                    detectSpecificCaseExpressions(sentence);
                }
            }
            int written = 0;
            for (String expression : sortByValue(specificCaseExpressionsCounter).keySet()) {
                if (written == NUMBER_OF_EXPRESSIONS) {
                    break;
                }
                writer.write(expression);
                writer.write('\n');
                written++;
            }
        }
    }

    /**
     * Walks the space-separated tokens of {@code sentence} and counts every run of two or more
     * capitalized words. The first token is never inspected (the loop starts at index 1).
     * Tokens without any Greek letter are ignored and do not interrupt a run.
     *
     * @param sentence sentence text to scan
     */
    private void detectSpecificCaseExpressions(String sentence) {
        Queue<String> pendingWords = new ArrayDeque<>();
        String[] words = sentence.split(" ");
        boolean expressionFound = false;
        boolean expressionFinished = false;
        for (int i = 1; i < words.length; i++) {
            String word = words[i];
            if (!containsLetter(word)) {
                continue;
            }
            if (expressionFound) {
                if (!Character.isLetter(word.charAt(0))) {
                    // Leading punctuation (e.g. an opening bracket) closes the run before this word.
                    incrementSpecificCaseExpressionsCounter(pendingWords);
                    expressionFound = false;
                    expressionFinished = false;
                } else if (!Character.isLetter(word.charAt(word.length() - 1))) {
                    // Trailing punctuation closes the run after this word.
                    expressionFinished = true;
                }
            }
            // Never empty: containsLetter() guarantees a character that the stripping keeps.
            word = NON_LETTER_EDGES.matcher(word).replaceAll("");
            if (Character.isUpperCase(word.charAt(0))) {
                pendingWords.add(word);
                if (pendingWords.size() > 1) {
                    expressionFound = true;
                }
            } else if (expressionFound) {
                expressionFinished = true;
            } else if (!pendingWords.isEmpty()) {
                // A single capitalized word followed by a lowercase one is not an expression.
                pendingWords.remove();
            }
            if (expressionFound && expressionFinished) {
                incrementSpecificCaseExpressionsCounter(pendingWords);
                expressionFound = false;
                expressionFinished = false;
            }
        }
        if (expressionFound) {
            // The sentence ended while a run was still open; count it instead of dropping it.
            incrementSpecificCaseExpressionsCounter(pendingWords);
        }
    }

    /**
     * Returns a copy of {@code hm} whose iteration order is by descending value. Entries with
     * equal values keep the relative order in which {@code hm} iterates them.
     *
     * @param hm map to sort; not modified
     * @return a {@code LinkedHashMap} holding the same entries in sorted order
     */
    public static HashMap<String, Integer> sortByValue(HashMap<String, Integer> hm) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(hm.entrySet());
        entries.sort((left, right) -> right.getValue().compareTo(left.getValue()));
        LinkedHashMap<String, Integer> sorted = new LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : entries) {
            sorted.put(entry.getKey(), entry.getValue());
        }
        return sorted;
    }

    /**
     * Joins the queued words with single spaces (no trailing space), empties the queue, and
     * adds one to the count of the resulting expression.
     *
     * @param queueContainingExpression words of the expression in order; empty afterwards
     */
    private void incrementSpecificCaseExpressionsCounter(Queue<String> queueContainingExpression) {
        String expression = String.join(" ", queueContainingExpression);
        queueContainingExpression.clear();
        specificCaseExpressionsCounter.merge(expression, 1, Integer::sum);
    }

    /**
     * @param word token to test
     * @return {@code true} if the token contains at least one Greek letter or whitespace character
     */
    private static boolean containsLetter(String word) {
        return CONTAINS_LETTER.matcher(word).matches();
    }

    /**
     * Decides whether a sentence is ignored entirely.
     *
     * @param sentence sentence text
     * @return {@code true} if the sentence is blank, if its first non-blank character is
     *         lowercase, or if the untrimmed text does not match {@link #SENTENCE_START}
     *         (note that {@code .} in that pattern does not match line terminators)
     */
    private boolean skipSentence(String sentence) {
        String trimmed = sentence.trim();
        return trimmed.isEmpty()
                || Character.isLowerCase(trimmed.charAt(0))
                || !SENTENCE_START.matcher(sentence).matches();
    }

    /**
     * Entry point. Expects exactly two arguments: the Wikipedia XML dump ({@code .xml} or
     * {@code .bz2}) and the output file. The language is fixed to {@code "el"}.
     *
     * @param args {@code <wikipediaXmlDump> <output>}
     * @throws IOException         if reading the dump or writing the output fails
     * @throws CompressorException if the compressed stream cannot be opened
     */
    public static void main(String[] args) throws IOException, CompressorException {
        if (args.length != 2) {
            System.out.println("Usage: " + WikipediaSpecificCaseExpressionExtractor.class.getSimpleName() + " <wikipediaXmlDump> <output>");
            System.exit(1);
        }
        WikipediaSpecificCaseExpressionExtractor extractor = new WikipediaSpecificCaseExpressionExtractor();
        extractor.extractSpecificCaseExpressions(Languages.getLanguageForShortCode("el"), args[0], args[1]);
    }
}

