/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.util;

import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.FeatureCountPipe;
import cc.mallet.pipe.FixedVocabTokenizer;
import cc.mallet.pipe.NGramPreprocessor;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.SimpleTokenizer;
import cc.mallet.pipe.StringList2FeatureSequence;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.AlphabetFactory;
import cc.mallet.types.Instance;
import cc.mallet.util.CommandOption;
import cc.mallet.util.DBInstanceStore;
import cc.mallet.util.MalletLogger;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

public class DBBulkLoader {
    protected static Logger logger = MalletLogger.getLogger(DBBulkLoader.class.getName());
    static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings(DBBulkLoader.class, "input", "FILE [FILE ...]", true, null, "The file containing data, one instance per line", null);
    static CommandOption.String outputDatabase = new CommandOption.String(DBBulkLoader.class, "output", "STRING", true, "mallet-db", "Write the instance list to this database", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(DBBulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.File vocabularyFile = new CommandOption.File(DBBulkLoader.class, "vocabulary", "FILE", true, null, "Read newline-separated words from this file.", null);
    static CommandOption.SpacedStrings replacementFiles = new CommandOption.SpacedStrings(DBBulkLoader.class, "replacement-files", "FILE [FILE ...]", true, null, "files containing string replacements, one per line:\n\t 'A B [tab] C' replaces A B with C,\n\t 'A B' replaces A B with A_B", null);
    static CommandOption.SpacedStrings deletionFiles = new CommandOption.SpacedStrings(DBBulkLoader.class, "deletion-files", "FILE [FILE ...]", true, null, "files containing strings to delete after replacements but before tokenization (ie multiword stop terms)", null);
    static CommandOption.File stoplistFile = new CommandOption.File(DBBulkLoader.class, "stoplist", "FILE", true, null, "Read newline-separated words from this file and remove them from text.", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(DBBulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, true, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.Integer pruneCount = new CommandOption.Integer(DBBulkLoader.class, "prune-count", "N", false, 0, "Reduce features to those that occur more than N times.", null);

    public static void generateStoplist(SimpleTokenizer prunedTokenizer, NGramPreprocessor preprocessor) throws IOException {
        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        Alphabet alphabet = new Alphabet();
        SimpleTokenizer st = prunedTokenizer.deepClone();
        StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
        FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);
        pipes.add(preprocessor);
        pipes.add(st);
        pipes.add(sl2fs);
        pipes.add(featureCounter);
        SerialPipes serialPipe = new SerialPipes(pipes);
        String[] stringArray = DBBulkLoader.inputFiles.value;
        int n = DBBulkLoader.inputFiles.value.length;
        int n2 = 0;
        while (n2 < n) {
            String filename = stringArray[n2];
            logger.info("pruning from " + filename);
            CsvIterator reader = new CsvIterator((Reader)new FileReader(filename), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1);
            Iterator<Instance> iterator = ((Pipe)serialPipe).newIteratorFrom(reader);
            int count = 0;
            while (iterator.hasNext()) {
                if (++count % 100000 == 0) {
                    System.out.println(count);
                }
                iterator.next();
            }
            ++n2;
        }
        featureCounter.addPrunedWordsToStoplist(prunedTokenizer, DBBulkLoader.pruneCount.value);
    }

    public static void writeInstanceList(ArrayList<Pipe> pipes) throws Exception {
        SerialPipes serialPipe = new SerialPipes(pipes);
        DBInstanceStore saver = new DBInstanceStore(DBBulkLoader.outputDatabase.value);
        String[] stringArray = DBBulkLoader.inputFiles.value;
        int n = DBBulkLoader.inputFiles.value.length;
        int n2 = 0;
        while (n2 < n) {
            String filename = stringArray[n2];
            logger.info("importing from " + filename);
            CsvIterator reader = new CsvIterator((Reader)new FileReader(filename), "(.*?)\\t(.*?)\\t(.*)", 3, 2, 1);
            saver.saveInstances(((Pipe)serialPipe).newIteratorFrom(reader));
            ++n2;
        }
        saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet());
        saver.cleanup();
    }

    public static void main(String[] args) throws Exception {
        String filename;
        int n;
        int n2;
        String[] stringArray;
        logger.info("starting");
        CommandOption.setSummary(DBBulkLoader.class, "Efficient tool for importing large amounts of text and saving to an embedded Java database");
        CommandOption.process(DBBulkLoader.class, args);
        NGramPreprocessor preprocessor = new NGramPreprocessor();
        if (DBBulkLoader.replacementFiles.value != null) {
            stringArray = DBBulkLoader.replacementFiles.value;
            n2 = DBBulkLoader.replacementFiles.value.length;
            n = 0;
            while (n < n2) {
                filename = stringArray[n];
                System.out.println("including replacements from " + filename);
                preprocessor.loadReplacements(filename);
                ++n;
            }
        }
        if (DBBulkLoader.deletionFiles.value != null) {
            stringArray = DBBulkLoader.deletionFiles.value;
            n2 = DBBulkLoader.deletionFiles.value.length;
            n = 0;
            while (n < n2) {
                filename = stringArray[n];
                System.out.println("including deletions from " + filename);
                preprocessor.loadDeletions(filename);
                ++n;
            }
        }
        if (DBBulkLoader.vocabularyFile.value != null) {
            Alphabet alphabet = AlphabetFactory.loadFromFile(DBBulkLoader.vocabularyFile.value);
            alphabet.stopGrowth();
            logger.info("loaded alphabet of size " + alphabet.size());
            ArrayList<Pipe> pipes = new ArrayList<Pipe>();
            pipes.add(preprocessor);
            pipes.add(new FixedVocabTokenizer(alphabet));
            DBBulkLoader.writeInstanceList(pipes);
        } else {
            SimpleTokenizer tokenizer = new SimpleTokenizer(DBBulkLoader.stoplistFile.value);
            if (DBBulkLoader.pruneCount.value > 0) {
                DBBulkLoader.generateStoplist(tokenizer, preprocessor);
            }
            ArrayList<Pipe> pipes = new ArrayList<Pipe>();
            Alphabet alphabet = new Alphabet();
            CharSequenceLowercase csl = new CharSequenceLowercase();
            StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
            pipes.add(preprocessor);
            pipes.add(tokenizer);
            pipes.add(sl2fs);
            DBBulkLoader.writeInstanceList(pipes);
        }
    }
}

