/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.classify.tui;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.FeatureSequence2AugmentableFeatureVector;
import cc.mallet.pipe.NGramPreprocessor;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintInputAndTarget;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TargetStringToFeatures;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequence2FeatureSequenceWithBigrams;
import cc.mallet.pipe.TokenSequenceRemoveNonAlpha;
import cc.mallet.pipe.TokenSequenceRemoveStopPatterns;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * Command-line tool that reads instances from a delimited text source (one
 * instance per line), passes them through a processing pipe and serializes
 * the resulting {@link InstanceList}.
 *
 * <p>Each input line is split by the {@code --line-regex} pattern into name,
 * label and data groups. The pipe is either built from the command-line
 * options or taken from a previously saved instance list
 * ({@code --use-pipe-from}). Input and output may each be given as {@code -}
 * to use standard input / standard output.
 */
public class Csv2Vectors {
    private static final Logger logger = MalletLogger.getLogger(Csv2Vectors.class.getName());
    public static String defaultLineRegex = "^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$";
    public static String defaultTokenRegex = "\\p{L}[\\p{L}\\p{P}]+\\p{L}";
    static CommandOption.File inputFile = new CommandOption.File(Csv2Vectors.class, "input", "FILE", true, null, "The file containing data to be classified, one instance per line", null);
    static CommandOption.File outputFile = new CommandOption.File(Csv2Vectors.class, "output", "FILE", true, new File("text.vectors"), "Write the instance list to this file; Using - indicates stdout.", null);
    static CommandOption.String lineRegex = new CommandOption.String(Csv2Vectors.class, "line-regex", "REGEX", true, defaultLineRegex, "Regular expression containing regex-groups for label, name and data.", null);
    static CommandOption.Integer labelOption = new CommandOption.Integer(Csv2Vectors.class, "label", "INTEGER", true, 2, "The index of the group containing the label string.\n   Use 0 to indicate that the label field is not used.", null);
    static CommandOption.Integer nameOption = new CommandOption.Integer(Csv2Vectors.class, "name", "INTEGER", true, 1, "The index of the group containing the instance name.\n   Use 0 to indicate that the name field is not used.", null);
    static CommandOption.Integer dataOption = new CommandOption.Integer(Csv2Vectors.class, "data", "INTEGER", true, 3, "The index of the group containing the data.", null);
    static CommandOption.File usePipeFromVectorsFile = new CommandOption.File(Csv2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"), "Use the pipe and alphabets from a previously created vectors file.\n   Allows the creation, for example, of a test set of vectors that are\n   compatible with a previously created set of training vectors", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(Csv2Vectors.class, "keep-sequence", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.Boolean keepSequenceBigrams = new CommandOption.Boolean(Csv2Vectors.class, "keep-sequence-bigrams", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequenceWithBigrams rather than a FeatureVector.", null);
    static CommandOption.Boolean targetAsFeatures = new CommandOption.Boolean(Csv2Vectors.class, "label-as-features", "[TRUE|FALSE]", false, false, "If true, parse the 'label' field as space-delimited features.\n     Use feature=[number] to specify values for non-binary features.", null);
    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(Csv2Vectors.class, "remove-stopwords", "[TRUE|FALSE]", false, false, "If true, remove a default list of common English \"stop words\" from the text.", null);
    static CommandOption.SpacedStrings replacementFiles = new CommandOption.SpacedStrings(Csv2Vectors.class, "replacement-files", "FILE [FILE ...]", true, null, "files containing string replacements, one per line:\n    'A B [tab] C' replaces A B with C,\n    'A B' replaces A B with A_B", null);
    static CommandOption.SpacedStrings deletionFiles = new CommandOption.SpacedStrings(Csv2Vectors.class, "deletion-files", "FILE [FILE ...]", true, null, "files containing strings to delete after replacements but before tokenization (ie multiword stop terms)", null);
    static CommandOption.File stoplistFile = new CommandOption.File(Csv2Vectors.class, "stoplist-file", "FILE", true, null, "Instead of the default list, read stop words from a file, one per line. Implies --remove-stopwords", null);
    static CommandOption.File extraStopwordsFile = new CommandOption.File(Csv2Vectors.class, "extra-stopwords", "FILE", true, null, "Read whitespace-separated words from this file, and add them to either \n   the default English stoplist or the list specified by --stoplist-file.", null);
    static CommandOption.File stopPatternFile = new CommandOption.File(Csv2Vectors.class, "stop-pattern-file", "FILE", true, null, "Read regular expressions from a file, one per line. Tokens matching these regexps will be removed.", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(Csv2Vectors.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.String encoding = new CommandOption.String(Csv2Vectors.class, "encoding", "STRING", true, Charset.defaultCharset().displayName(), "Character encoding for input file", null);
    static CommandOption.String tokenRegex = new CommandOption.String(Csv2Vectors.class, "token-regex", "REGEX", true, defaultTokenRegex, "Regular expression used for tokenization.\n   Example: \"[\\p{L}\\p{N}_]+|[\\p{P}]+\" (unicode letters, numbers and underscore OR all punctuation) ", null);
    static CommandOption.Boolean printOutput = new CommandOption.Boolean(Csv2Vectors.class, "print-output", "[TRUE|FALSE]", false, false, "If true, print a representation of the processed data\n   to standard output. This option is intended for debugging.", null);

    /**
     * Entry point: parses the options, obtains the pipe, reads the input
     * through it and writes the resulting instance list.
     *
     * @param args command-line arguments; when empty, usage is printed and
     *             the JVM exits with status -1
     * @throws IllegalArgumentException if no input file was specified or the
     *                                  token regular expression is invalid
     * @throws FileNotFoundException    if an input or output file cannot be opened
     * @throws IOException              on any other read or write failure
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        CommandOption.setSummary(Csv2Vectors.class, "A tool for creating instance lists of feature vectors from comma-separated-values");
        CommandOption.process(Csv2Vectors.class, args);
        if (args.length == 0) {
            CommandOption.getList(Csv2Vectors.class).printUsage(false);
            System.exit(-1);
        }
        // The option object itself is never null; a missing --input shows up
        // as a null value (the option's declared default).
        if (Csv2Vectors.inputFile.value == null) {
            throw new IllegalArgumentException("You must include `--input FILE ...' in order to specify a file containing the instances, one per line.");
        }

        InstanceList previousInstanceList = null;
        Pipe instancePipe;
        if (usePipeFromVectorsFile.wasInvoked()) {
            previousInstanceList = InstanceList.load(Csv2Vectors.usePipeFromVectorsFile.value);
            instancePipe = previousInstanceList.getPipe();
        } else {
            instancePipe = buildPipe();
        }

        InstanceList instances = new InstanceList(instancePipe);
        readInstances(instances);
        writeInstances(instances);

        if (usePipeFromVectorsFile.wasInvoked()) {
            // The reused pipe was applied to new data above, so the list it
            // came from is written back to its original file.
            System.out.println(" Rewriting extended pipe from " + Csv2Vectors.usePipeFromVectorsFile.value);
            System.out.println("  Instance ID = " + previousInstanceList.getPipe().getInstanceId());
            writeToFile(Csv2Vectors.usePipeFromVectorsFile.value, previousInstanceList);
        }
    }

    /**
     * Assembles the processing pipe from the command-line options.
     *
     * @return a {@link SerialPipes} containing the configured stages in order
     * @throws IllegalArgumentException if the token regular expression does not compile
     * @throws IOException              if a replacement, deletion or stop-word
     *                                  resource cannot be read
     */
    private static Pipe buildPipe() throws IOException {
        ArrayList<Pipe> pipeList = new ArrayList<Pipe>();

        // A label index of 0 means the label field is unused.
        if (Csv2Vectors.labelOption.value > 0) {
            if (Csv2Vectors.targetAsFeatures.value) {
                pipeList.add(new TargetStringToFeatures());
            } else {
                pipeList.add(new Target2Label());
            }
        }

        Pattern tokenPattern;
        if (Csv2Vectors.keepSequenceBigrams.value) {
            // In bigram mode the user-supplied token regex is not consulted.
            tokenPattern = CharSequenceLexer.LEX_NONWHITESPACE_CLASSES;
        } else {
            try {
                tokenPattern = Pattern.compile(Csv2Vectors.tokenRegex.value);
            } catch (PatternSyntaxException pse) {
                // Keep the original exception as the cause for diagnosis.
                throw new IllegalArgumentException("The token regular expression (" + Csv2Vectors.tokenRegex.value + ") was invalid: " + pse.getMessage(), pse);
            }
        }

        if (!preserveCase.value()) {
            pipeList.add(new CharSequenceLowercase());
        }

        if (Csv2Vectors.replacementFiles.value != null || Csv2Vectors.deletionFiles.value != null) {
            NGramPreprocessor preprocessor = new NGramPreprocessor();
            if (Csv2Vectors.replacementFiles.value != null) {
                for (String filename : Csv2Vectors.replacementFiles.value) {
                    preprocessor.loadReplacements(filename);
                }
            }
            if (Csv2Vectors.deletionFiles.value != null) {
                for (String filename : Csv2Vectors.deletionFiles.value) {
                    preprocessor.loadDeletions(filename);
                }
            }
            pipeList.add(preprocessor);
        }

        pipeList.add(new CharSequence2TokenSequence(tokenPattern));
        if (Csv2Vectors.keepSequenceBigrams.value) {
            pipeList.add(new TokenSequenceRemoveNonAlpha(true));
        }

        // An explicit stop list takes precedence over the default list.
        TokenSequenceRemoveStopwords stopwordFilter = null;
        if (stoplistFile.wasInvoked()) {
            stopwordFilter = new TokenSequenceRemoveStopwords(Csv2Vectors.stoplistFile.value, Csv2Vectors.encoding.value, false, false, Csv2Vectors.keepSequenceBigrams.value);
        } else if (Csv2Vectors.removeStopWords.value) {
            stopwordFilter = new TokenSequenceRemoveStopwords(false, Csv2Vectors.keepSequenceBigrams.value);
        }
        if (stopwordFilter != null) {
            if (extraStopwordsFile.wasInvoked()) {
                stopwordFilter.addStopWords(Csv2Vectors.extraStopwordsFile.value);
            }
            pipeList.add(stopwordFilter);
        }

        if (stopPatternFile.wasInvoked()) {
            pipeList.add(new TokenSequenceRemoveStopPatterns(Csv2Vectors.stopPatternFile.value));
        }

        if (Csv2Vectors.keepSequenceBigrams.value) {
            pipeList.add(new TokenSequence2FeatureSequenceWithBigrams());
        } else if (Csv2Vectors.keepSequence.value) {
            pipeList.add(new TokenSequence2FeatureSequence());
        } else {
            pipeList.add(new TokenSequence2FeatureSequence());
            pipeList.add(new FeatureSequence2AugmentableFeatureVector());
        }

        if (Csv2Vectors.printOutput.value) {
            pipeList.add(new PrintInputAndTarget());
        }
        return new SerialPipes(pipeList);
    }

    /**
     * Reads the configured input (a file, or standard input when the input
     * option is {@code -}) and adds every line to {@code instances} through
     * its pipe. The {@code --encoding} option is applied to both sources.
     *
     * @param instances the list that receives the parsed instances
     * @throws IOException if the input cannot be opened or decoded
     */
    private static void readInstances(InstanceList instances) throws IOException {
        boolean fromStdin = Csv2Vectors.inputFile.value.toString().equals("-");
        InputStream rawInput = fromStdin ? System.in : new FileInputStream(Csv2Vectors.inputFile.value);
        try {
            Reader reader = new InputStreamReader(rawInput, Csv2Vectors.encoding.value);
            instances.addThruPipe(new CsvIterator(reader, Pattern.compile(Csv2Vectors.lineRegex.value), Csv2Vectors.dataOption.value, Csv2Vectors.labelOption.value, Csv2Vectors.nameOption.value));
        } finally {
            // Only release streams this method opened; System.in is left alone.
            if (!fromStdin) {
                rawInput.close();
            }
        }
    }

    /**
     * Serializes {@code instances} to the configured output (a file, or
     * standard output when the output option is {@code -}).
     *
     * @param instances the instance list to write
     * @throws IOException if the output cannot be opened or written
     */
    private static void writeInstances(InstanceList instances) throws IOException {
        if (Csv2Vectors.outputFile.value.toString().equals("-")) {
            ObjectOutputStream oos = new ObjectOutputStream(System.out);
            oos.writeObject(instances);
            oos.close();
        } else {
            writeToFile(Csv2Vectors.outputFile.value, instances);
        }
    }

    /**
     * Serializes {@code payload} to {@code target}, closing the file even
     * when serialization fails.
     *
     * @param target  the file to write
     * @param payload the object to serialize
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeToFile(File target, Object payload) throws IOException {
        FileOutputStream fileStream = new FileOutputStream(target);
        try {
            ObjectOutputStream oos = new ObjectOutputStream(fileStream);
            oos.writeObject(payload);
            oos.close();
        } finally {
            // Closing an already-closed FileOutputStream has no effect.
            fileStream.close();
        }
    }
}

