/*
 * Decompiled with CFR 0.152.
 */
package tratz.jwikt;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line tool that scans a Wiktionary "pages-articles" XML dump line by line and
 * writes the retained lines of every page's {@code ==English==} section to a text file.
 *
 * <p>For each English section the output contains a {@code TITLE:<page title>} line followed
 * by the trimmed lines of every subsection whose heading is not in {@link #IGNORED_SECTIONS},
 * minus layout/link markup (see {@link #isSkippedMarkup(String)}). Bold/italic quote markup
 * is removed and {@code &amp;}/{@code &quot;} are unescaped. Pages whose title carries one of
 * the namespace prefixes in {@link #NON_ARTICLE_TITLE} produce no output.
 *
 * <p>After processing, the sorted set of retained subsection headings and its size are
 * printed to standard error.
 */
public class WiktionaryReader {

    /**
     * Subsection headings (already trimmed) whose content is not copied to the output.
     * Entries are compared with the trimmed heading text, so padded variants are pointless.
     */
    private static final Set<String> IGNORED_SECTIONS = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(
                    "Literature", "Etymology", "Etymology 1", "Etymology 2", "Etymology 3",
                    "Etymology 4", "Etymology 5", "Etymology 6", "Etymology 7",
                    "Pronunciation", "Pronunciation 1", "Pronunciation 2", "References",
                    "Trivia", "External links", "Anagrams", "Translations", "Notes",
                    "Usage notes", "Quotations", "Hyphenation", "Shorthand", "Descendants",
                    "Inflection", "Homophones", "Dictionary notes", "Homographs", "Homonyms",
                    "Alternative spellings", "Variant spellings", "Alternate forms",
                    "Alternative forms", "Alternative abbreviations", "Particle",
                    "Interjection", "Contraction", "Cardinal number", "Ordinal number",
                    "Letter", "Proverb", "Prefix", "Suffix", "Infix", "History",
                    "Combining form", "Affix", "Interfix", "Numeral", "Number",
                    "Postposition", "Abbreviations", "Symbol", "Phrase", "Idiom", "Meronyms",
                    "Personal noun", "Similar terms", "Taxonomic names", "Adverb phrase",
                    "Possessive adjective", "Standard", "Variant", "America", "Canada",
                    "Cyrillic spelling", "Phrases", "Translingual", "Related", "Collocations",
                    "Verb form", "Derived phrase", "Use", "Usage", "Variation", "Compounds",
                    "Conjugation", "Declension", "Relation", "Misspelling", "Lujvo", "Gregg",
                    "Examples", "Circumfix", "Derived Meanings", "Combining Form",
                    "Preposition phrase", "Prepositional phrase")));

    /** Trimmed lines that are dropped when they match exactly. */
    private static final Set<String> SKIPPED_LINES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(
                    "{{top4}}", "{{mid4}}", "{{top3}}", "{{mid3}}", "{{top2}}", "{{mid2}}",
                    "{{top}}", "{{mid}}", "{{bottom}}", "{{der-top}}", "{{der-mid}}",
                    "{{der-bottom}}", "{{wikipedia}}", "----")));

    /** Trimmed lines starting with any of these prefixes are dropped. */
    private static final String[] SKIPPED_PREFIXES = {
        "{{rel-", "{{trans-", "[[Image:", "{{wikipedia|", "{{rfe", "{{rank|",
        "[[Category:English plurals"
    };

    /**
     * Whole-line patterns for dropped lines (interwiki links, audio templates, ...).
     * NOTE(review): the pedialite and Wikisource alternatives have no trailing wildcard, so
     * they only match a line consisting of exactly that template opener. Kept as-is.
     */
    private static final Pattern SKIPPED_LINE_PATTERN = Pattern.compile(
            "\\[\\[([a-z]{2,3}|simple|zh-min-nan):.*"
            + "|\\* ?\\{\\{audio.*"
            + "|\\* ?\\{\\{pedialite"
            + "|\\* ?\\{\\{Wikisource"
            + "|\\[\\[Category:Dictionary notes\\]\\]");

    /** Level-2 heading, e.g. {@code ==English==}; group 1 is the language name. */
    private static final Pattern LANGUAGE_HEADING = Pattern.compile("==([^\\]=]*)==\\s*");

    /** Level-3-or-deeper heading; group 1 is the (untrimmed) subsection name. */
    private static final Pattern SECTION_HEADING = Pattern.compile("====*([^\\]=]*)=*===\\s*");

    /** A {@code <title>} element occupying the whole line; group 1 is the page title. */
    private static final Pattern PAGE_TITLE = Pattern.compile("\\s*<title>(.*)</title>\\s*");

    /** Title lines of pages in a listed namespace (optionally its talk namespace). */
    private static final Pattern NON_ARTICLE_TITLE = Pattern.compile("\\s*<title>((Wikisaurus|Image|WS|WT|Concordance|Special|User|File|MediaWiki|Rhymes|Transwiki|Citations|MediaWiki|Index|Template|Category|Help|Appendix|Wiktionary)([\\s_]talk)?):(.*)</title>\\s*");

    // Per-line clean-up patterns, compiled once instead of on every input line.
    private static final Pattern INLINE_COMMENT = Pattern.compile("<\\!--.*-->");
    private static final Pattern SELF_CLOSING_REF = Pattern.compile("<ref [^>]*/>");
    private static final Pattern PAIRED_REF = Pattern.compile("<ref [^>]*>.*</ref>");
    private static final Pattern TEXT_OPEN_TAG = Pattern.compile("\\s*<text[^>]*>");
    private static final Pattern REVISION_COMMENT =
            Pattern.compile(".*<comment>.*</comment>\\s*");

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of the pages-articles XML dump to read,
     *             {@code args[1]} the path of the text file to write; both are handled as UTF-8
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: WiktionaryReader <pages-articles.xml> <output-file>");
        }
        String wiktionaryPagesArticlesXml = args[0];
        String outputFile = args[1];

        Set<String> allTerms = new HashSet<>();
        // Explicit UTF-8 keeps the result independent of the JVM's default charset; both
        // streams are closed even when processing fails half-way.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                     new FileInputStream(wiktionaryPagesArticlesXml), StandardCharsets.UTF_8));
             PrintWriter writer = new PrintWriter(new OutputStreamWriter(
                     new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            extractEnglishSections(reader, writer, allTerms);
            // PrintWriter never throws on write failure, so ask it explicitly.
            if (writer.checkError()) {
                throw new IOException("Failed to write " + outputFile);
            }
        }

        List<String> sortedTerms = new ArrayList<>(allTerms);
        Collections.sort(sortedTerms);
        System.err.println("Print:");
        for (String term : sortedTerms) {
            System.err.println(term);
        }
        System.err.println(sortedTerms.size());
    }

    /**
     * Streams the dump from {@code reader}, writing the retained English-section lines to
     * {@code writer} and collecting every non-ignored subsection heading in {@code allTerms}.
     */
    private static void extractEnglishSections(
            BufferedReader reader, PrintWriter writer, Set<String> allTerms) throws IOException {
        Matcher languageMatcher = LANGUAGE_HEADING.matcher("");
        Matcher typeMatcher = SECTION_HEADING.matcher("");
        Matcher titleMatcher = PAGE_TITLE.matcher("");
        Matcher appendixTitleMatcher = NON_ARTICLE_TITLE.matcher("");

        StringBuilder buf = new StringBuilder(); // output accumulated for the current page
        String title = null;
        boolean printFromPage = false;
        boolean inEnglishSection = false;
        boolean inInterestingSubsection = false;
        boolean inComment = false;
        boolean bad = false;          // current page carries an "English plurals" category
        int sinceLastMatch = 1000;    // debug trace counter, see the "managee" check below

        String line;
        while ((line = reader.readLine()) != null) {
            line = normalizeLine(line);

            if (titleMatcher.reset(line).matches()) {
                // A new page starts: flush what the previous page produced and reset state.
                printFromPage = !appendixTitleMatcher.reset(line).matches();
                inEnglishSection = false;
                title = titleMatcher.group(1);
                if (title.contains("managee")) { // debug trace for one specific entry
                    sinceLastMatch = 0;
                    System.err.println(line);
                }
                // NOTE(review): this flush ignores `bad`, while the flush after the loop
                // honours it, so only the last page of the dump can be suppressed. Confirm
                // which of the two is intended before changing either.
                writer.print(buf.toString());
                bad = false;
                buf.setLength(0);
                inComment = false;
            }
            if (sinceLastMatch < 40) {
                System.err.println(line);
            }

            // Multi-line HTML comments. NOTE(review): a line that opens a comment is skipped
            // entirely, including any text before "<!--", whereas text after a closing
            // "-->" is kept. Existing behaviour, preserved here.
            boolean hasOpen = line.contains("<!--");
            boolean hasClose = line.contains("-->");
            if (hasOpen && !hasClose) {
                inComment = true;
            } else if (hasClose && !hasOpen) {
                line = line.trim();
                inComment = false;
                line = line.endsWith("-->") ? "" : line.substring(line.indexOf("-->") + 3);
            }
            if (inComment) {
                continue;
            }
            ++sinceLastMatch;

            if (printFromPage) {
                if (line.trim().startsWith("[[Category:English plurals")) {
                    bad = true;
                }
                if (languageMatcher.reset(line).matches()) {
                    inEnglishSection = languageMatcher.group(1).equals("English");
                    if (inEnglishSection) {
                        buf.append("TITLE:" + title.replace("&amp;", "&")).append("\n");
                        inInterestingSubsection = true;
                    }
                    continue;
                }
            }

            // NOTE(review): the line holding "</text>" itself is never emitted.
            if (line.contains("</text>")) {
                inEnglishSection = false;
                inInterestingSubsection = false;
            }
            if (!inEnglishSection) {
                continue;
            }

            if (typeMatcher.reset(line).matches()) {
                String subsection = typeMatcher.group(1).trim();
                boolean ignored = IGNORED_SECTIONS.contains(subsection);
                if (!ignored) {
                    allTerms.add(subsection);
                }
                inInterestingSubsection = !ignored;
            }
            if (!inInterestingSubsection) {
                continue;
            }

            line = line.trim();
            if (isSkippedMarkup(line)) {
                continue;
            }
            buf.append(line.replace("'''", "")
                           .replace("''", "")
                           .replace("&amp;", "&")
                           .replace("&quot;", "\""))
               .append("\n");
        }

        if (!bad) {
            writer.print(buf.toString());
        }
    }

    /**
     * Applies the per-line clean-up: name normalisation, entity unescaping and removal of
     * single-line comments, {@code <ref>} elements, the opening {@code <text>} tag and
     * {@code <comment>} elements.
     */
    private static String normalizeLine(String rawLine) {
        String line = rawLine.replace("WikiSaurus", "Wikisaurus");
        // NOTE(review): the replacement drops the ':' after the namespace; kept unchanged
        // because it affects the emitted text. Confirm whether "[[Wikisaurus:" was intended.
        line = line.replace("[[wikisaurus:", "[[Wikisaurus");
        line = line.replace("&mdash;", "\u2014");
        if (line.trim().startsWith("#: All drinks are free")) { // debug trace
            System.err.println("Got it.");
            System.err.println(line);
        }
        line = line.replace("&lt;", "<").replace("&gt;", ">");
        line = INLINE_COMMENT.matcher(line).replaceAll("");
        line = SELF_CLOSING_REF.matcher(line).replaceAll("");
        line = PAIRED_REF.matcher(line).replaceAll("");
        line = TEXT_OPEN_TAG.matcher(line).replaceAll("");
        return REVISION_COMMENT.matcher(line).replaceAll("");
    }

    /** Returns whether a trimmed line is layout/link markup that is left out of the output. */
    private static boolean isSkippedMarkup(String line) {
        if (SKIPPED_LINES.contains(line)) {
            return true;
        }
        for (String prefix : SKIPPED_PREFIXES) {
            if (line.startsWith(prefix)) {
                return true;
            }
        }
        return SKIPPED_LINE_PATTERN.matcher(line).matches();
    }
}

