/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.cluster.tui;

import cc.mallet.classify.MaxEnt;
import cc.mallet.classify.MaxEntTrainer;
import cc.mallet.classify.Trial;
import cc.mallet.cluster.Clusterer;
import cc.mallet.cluster.Clustering;
import cc.mallet.cluster.Clusterings;
import cc.mallet.cluster.GreedyAgglomerativeByDensity;
import cc.mallet.cluster.Record;
import cc.mallet.cluster.evaluate.AccuracyEvaluator;
import cc.mallet.cluster.evaluate.BCubedEvaluator;
import cc.mallet.cluster.evaluate.ClusteringEvaluator;
import cc.mallet.cluster.evaluate.ClusteringEvaluators;
import cc.mallet.cluster.evaluate.MUCEvaluator;
import cc.mallet.cluster.evaluate.PairF1Evaluator;
import cc.mallet.cluster.iterator.PairSampleIterator;
import cc.mallet.cluster.neighbor_evaluator.AgglomerativeNeighbor;
import cc.mallet.cluster.neighbor_evaluator.PairwiseEvaluator;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.PropertyList;
import cc.mallet.util.Randoms;
import cc.mallet.util.Strings;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.logging.Logger;

public class Clusterings2Clusterer {
    /** Logger used for progress and evaluation output of this tool. */
    private static Logger logger = MalletLogger.getLogger(Clusterings2Clusterer.class.getName());
    /** Option {@code load-clusterer}: file to read a serialized clusterer from. No default (null). */
    static CommandOption.File loadClusterer = new CommandOption.File(Clusterings2Clusterer.class, "load-clusterer", "FILE", false, null, "The file from which to read the clusterer.", null);
    /** Option {@code save-clusterer}: file the clusterer is serialized to; defaults to {@code clusterer.mallet}. */
    static CommandOption.File saveClusterer = new CommandOption.File(Clusterings2Clusterer.class, "save-clusterer", "FILE", false, new File("clusterer.mallet"), "The filename in which to write the clusterer after it has been trained.", null);
    /** Option {@code output-clusterings}: file name the predicted clusterings are written to; defaults to {@code predictions}. */
    static CommandOption.String outputClusterings = new CommandOption.String(Clusterings2Clusterer.class, "output-clusterings", "FILENAME", false, "predictions", "The filename in which to write the predicted clusterings.", null);
    /** Option {@code train}: file name of the serialized training Clusterings; defaults to {@code text.clusterings.train}. */
    static CommandOption.String trainingFile = new CommandOption.String(Clusterings2Clusterer.class, "train", "FILENAME", false, "text.clusterings.train", "Read the training set Clusterings from this file. If this is specified, the input file parameter is ignored", null);
    /** Option {@code test}: file name of the serialized test Clusterings; defaults to {@code text.clusterings.test}. */
    static CommandOption.String testingFile = new CommandOption.String(Clusterings2Clusterer.class, "test", "FILENAME", false, "text.clusterings.test", "Read the test set Clusterings from this file. If this option is specified, the training-file parameter must be specified and  the input-file parameter is ignored", null);
    /** Option {@code clustering-evaluator}: Java constructor code for a ClusteringEvaluator. No default (null). */
    static CommandOption.Object clusteringEvaluatorOption = new CommandOption.Object(Clusterings2Clusterer.class, "clustering-evaluator", "CONSTRUCTOR", true, null, "Java code for constructing a ClusteringEvaluator object", null);
    /** Option {@code exact-match-fields}: field names compared for exactly matching values. No default (null). */
    static CommandOption.SpacedStrings exactMatchFields = new CommandOption.SpacedStrings(Clusterings2Clusterer.class, "exact-match-fields", "STRING...", false, null, "The field names to be checked for exactly matching values", null);
    /** Option {@code approx-match-fields}: field names compared for approximately matching values. No default (null). */
    static CommandOption.SpacedStrings approxMatchFields = new CommandOption.SpacedStrings(Clusterings2Clusterer.class, "approx-match-fields", "STRING...", false, null, "The field names to be checked for approx matching values", null);
    /** Option {@code substring-match-fields}: field names compared for substring matches. No default (null). */
    static CommandOption.SpacedStrings substringMatchFields = new CommandOption.SpacedStrings(Clusterings2Clusterer.class, "substring-match-fields", "STRING...", false, null, "The field names to be checked for substring matching values. Note that values fewer than 3 characters are ignored.", null);

    /**
     * Command-line entry point.
     *
     * <p>Obtains a {@link Clusterer} (deserialized from the {@code load-clusterer} file when that
     * option is set and the file exists, otherwise trained from the {@code train} clusterings),
     * clusters every clustering of the {@code test} file, logs the per-clustering and total
     * evaluation, serializes the clusterer to the {@code save-clusterer} file and, when
     * {@code output-clusterings} is non-null, writes the predictions to that file.
     *
     * @param args command-line arguments passed to {@code CommandOption.process}
     * @throws Exception if option processing, I/O, training or evaluation fails
     */
    public static void main(String[] args) throws Exception {
        CommandOption.setSummary(Clusterings2Clusterer.class, "A tool to train and test a Clusterer.");
        CommandOption.process(Clusterings2Clusterer.class, args);
        Randoms random = new Randoms(123);

        // The load-clusterer option has no default, so its value is null unless given explicitly.
        File clustererFile = loadClusterer.value;
        Clusterer clusterer;
        if (clustererFile != null && clustererFile.exists()) {
            clusterer = readClusterer(clustererFile);
        } else {
            clusterer = trainClusterer(random);
        }

        Clusterings testing = readClusterings(testingFile.value);
        ClusteringEvaluator evaluator = (ClusteringEvaluator) clusteringEvaluatorOption.value;
        if (evaluator == null) {
            evaluator = new ClusteringEvaluators(new ClusteringEvaluator[]{new BCubedEvaluator(), new PairF1Evaluator(), new MUCEvaluator(), new AccuracyEvaluator()});
        }

        ArrayList<Clustering> predictions = new ArrayList<Clustering>();
        for (int i = 0; i < testing.size(); i++) {
            Clustering truth = testing.get(i);
            Clustering predicted = clusterer.cluster(truth.getInstances());
            predictions.add(predicted);
            logger.info(evaluator.evaluate(truth, predicted));
        }
        logger.info(evaluator.evaluateTotals());

        writeClusterer(clusterer, saveClusterer.value);
        if (outputClusterings.value != null) {
            writePredictions(predictions, new File(outputClusterings.value));
        }
    }

    /** Trains a pairwise MaxEnt classifier on sampled pairs and wraps it in a greedy agglomerative clusterer. */
    private static Clusterer trainClusterer(Randoms random) throws Exception {
        Clusterings training = readClusterings(trainingFile.value);
        if (training.size() == 0) {
            throw new IllegalArgumentException("Training file " + trainingFile.value + " contains no clusterings");
        }
        // The field alphabet is taken from the first record of the first training clustering.
        Alphabet fieldAlphabet = ((Record) ((Instance) training.get(0).getInstances().get(0)).getData()).fieldAlphabet();
        ClusteringPipe pipe = new ClusteringPipe(
                string2ints(exactMatchFields.value, fieldAlphabet),
                string2ints(approxMatchFields.value, fieldAlphabet),
                string2ints(substringMatchFields.value, fieldAlphabet));

        InstanceList trainingInstances = new InstanceList(pipe);
        for (int i = 0; i < training.size(); i++) {
            Clustering clustering = training.get(i);
            PairSampleIterator pairs = new PairSampleIterator(clustering, random, 0.5, clustering.getNumInstances());
            while (pairs.hasNext()) {
                Instance inst = pairs.next();
                trainingInstances.add(pipe.pipe(inst));
            }
        }
        logger.info("generated " + trainingInstances.size() + " training instances");

        MaxEnt classifier = new MaxEntTrainer().train(trainingInstances);
        logger.info("InfoGain:\n");
        new InfoGain(trainingInstances).printByRank(System.out);
        logger.info("pairwise training accuracy=" + new Trial(classifier, trainingInstances).getAccuracy());

        PairwiseEvaluator neval = new PairwiseEvaluator(classifier, "YES", new PairwiseEvaluator.Average(), true);
        return new GreedyAgglomerativeByDensity(training.get(0).getInstances().getPipe(), neval, 0.5, false, random);
    }

    /** Deserializes a clusterer from {@code file}, closing the file even if reading fails. */
    private static Clusterer readClusterer(File file) throws Exception {
        FileInputStream fis = new FileInputStream(file);
        try {
            ObjectInputStream ois = new ObjectInputStream(fis);
            return (Clusterer) ois.readObject();
        } finally {
            fis.close();
        }
    }

    /** Serializes {@code clusterer} to {@code file}, closing the file even if writing fails. */
    private static void writeClusterer(Clusterer clusterer, File file) throws Exception {
        FileOutputStream fos = new FileOutputStream(file);
        try {
            ObjectOutputStream oos = new ObjectOutputStream(fos);
            oos.writeObject(clusterer);
            // close() flushes the object stream's buffer before closing the underlying file.
            oos.close();
        } finally {
            fos.close();
        }
    }

    /** Writes the {@code toString()} form of {@code predictions} to {@code file}. */
    private static void writePredictions(ArrayList<Clustering> predictions, File file) throws Exception {
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        try {
            writer.write(predictions.toString());
        } finally {
            // close() flushes any buffered output.
            writer.close();
        }
    }

    /**
     * Converts field names to their indices in the given alphabet.
     *
     * <p>A {@code null} name array yields an empty result; the match-field command options
     * default to {@code null}, so this is the normal case when an option is omitted.
     *
     * <p>NOTE(review): {@code lookupIndex} presumably adds unknown names to the alphabet rather
     * than rejecting them, so a misspelled field name would go unnoticed - confirm against
     * {@code Alphabet}.
     *
     * @param ss   field names to look up, may be {@code null}
     * @param alph alphabet queried via {@code lookupIndex}
     * @return one index per entry of {@code ss}, in the same order; empty if {@code ss} is null
     */
    public static int[] string2ints(String[] ss, Alphabet alph) {
        if (ss == null) {
            return new int[0];
        }
        int[] ret = new int[ss.length];
        for (int i = 0; i < ss.length; i++) {
            ret[i] = alph.lookupIndex(ss[i]);
        }
        return ret;
    }

    /**
     * Deserializes a {@link Clusterings} object from the named file.
     *
     * @param f path of the file holding the serialized clusterings
     * @return the deserialized clusterings
     * @throws Exception if the file cannot be opened or read, or its content cannot be deserialized
     */
    public static Clusterings readClusterings(String f) throws Exception {
        FileInputStream fis = new FileInputStream(new File(f));
        try {
            ObjectInputStream ois = new ObjectInputStream(fis);
            return (Clusterings) ois.readObject();
        } finally {
            // Closing the file stream releases the handle whether or not deserialization succeeded.
            fis.close();
        }
    }

    /**
     * Pipe that turns a proposed cluster merge into a labelled feature vector.
     *
     * <p>The incoming instance's data must be an {@link AgglomerativeNeighbor}. Its data is replaced
     * by a {@link FeatureVector} of field-agreement features computed over all record pairs of the
     * merged cluster, and its target is set to the label {@code "YES"} when the first members of
     * the two old clusters carry the same label in the original clustering, {@code "NO"} otherwise.
     *
     * <p>For every configured field two features may be emitted: {@code <field>_all_*} when every
     * compared pair matched and more than one pair was compared, and {@code <field>_exists_*} when
     * at least one pair matched. A pair is compared only when both records have values for the field.
     */
    public static class ClusteringPipe extends Pipe {
        private static final long serialVersionUID = 1L;

        /** Values shorter than this are ignored by substring matching (see the option help text). */
        private static final int MIN_SUBSTRING_VALUE_LENGTH = 3;

        /** The three ways two value sets of the same field can be compared. */
        private enum MatchKind { EXACT, APPROX, SUBSTRING }

        int[] exactMatchFields;
        int[] approxMatchFields;
        int[] substringMatchFields;
        /** Two values match approximately when their Levenshtein distance is strictly below this. */
        double approxMatchThreshold;

        /**
         * Creates a pipe with an approximate-match threshold of 0.0.
         *
         * <p>NOTE(review): edit distances are non-negative, so with a threshold of 0.0 no pair can
         * match approximately and the approx-match features never fire. The value is kept for
         * backward compatibility; use the four-argument constructor to enable approximate matching.
         *
         * @param exactMatchFields     field indices compared for exact value matches, may be null
         * @param approxMatchFields    field indices compared by Levenshtein distance, may be null
         * @param substringMatchFields field indices compared for substring matches, may be null
         */
        public ClusteringPipe(int[] exactMatchFields, int[] approxMatchFields, int[] substringMatchFields) {
            this(exactMatchFields, approxMatchFields, substringMatchFields, 0.0);
        }

        /**
         * Creates a pipe with an explicit approximate-match threshold.
         *
         * @param exactMatchFields     field indices compared for exact value matches, may be null
         * @param approxMatchFields    field indices compared by Levenshtein distance, may be null
         * @param substringMatchFields field indices compared for substring matches, may be null
         * @param approxMatchThreshold exclusive upper bound on the Levenshtein distance of a match
         */
        public ClusteringPipe(int[] exactMatchFields, int[] approxMatchFields, int[] substringMatchFields, double approxMatchThreshold) {
            super(new Alphabet(), new LabelAlphabet());
            // Null arrays are treated as "no fields configured" so feature extraction never dereferences null.
            this.exactMatchFields = exactMatchFields == null ? new int[0] : exactMatchFields;
            this.approxMatchFields = approxMatchFields == null ? new int[0] : approxMatchFields;
            this.substringMatchFields = substringMatchFields == null ? new int[0] : substringMatchFields;
            this.approxMatchThreshold = approxMatchThreshold;
        }

        /** Fetches the {@link Record} stored as data of each instance of {@code list} indexed by {@code a}. */
        private Record[] array2Records(int[] a, InstanceList list) {
            Record[] records = new Record[a.length];
            for (int i = 0; i < a.length; i++) {
                records[i] = (Record) ((Instance) list.get(a[i])).getData();
            }
            return records;
        }

        /**
         * Replaces the carrier's {@link AgglomerativeNeighbor} data with match features and sets
         * its YES/NO target.
         *
         * @param carrier instance whose data is an {@link AgglomerativeNeighbor}
         * @return the same instance, with data and target replaced
         */
        @Override
        public Instance pipe(Instance carrier) {
            AgglomerativeNeighbor neighbor = (AgglomerativeNeighbor) carrier.getData();
            Clustering original = neighbor.getOriginal();
            int[] cluster1 = neighbor.getOldClusters()[0];
            int[] cluster2 = neighbor.getOldClusters()[1];
            InstanceList list = original.getInstances();
            Record[] records = this.array2Records(neighbor.getNewCluster(), list);
            // Alphabets are taken from the first record of the merged cluster.
            Alphabet fieldAlph = records[0].fieldAlphabet();
            Alphabet valueAlph = records[0].valueAlphabet();

            PropertyList features = null;
            features = this.addExactMatch(records, fieldAlph, valueAlph, features);
            features = this.addApproxMatch(records, fieldAlph, valueAlph, features);
            features = this.addSubstringMatch(records, fieldAlph, valueAlph, features);
            carrier.setData(new FeatureVector(this.getDataAlphabet(), features, true));

            LabelAlphabet ldict = (LabelAlphabet) this.getTargetAlphabet();
            // The merge is correct when the first members of both old clusters share a true label.
            String label = original.getLabel(cluster1[0]) == original.getLabel(cluster2[0]) ? "YES" : "NO";
            carrier.setTarget(ldict.lookupLabel(label));
            return carrier;
        }

        /** Adds {@code _all_match} / {@code _exists_match} features for the exact-match fields. */
        private PropertyList addExactMatch(Record[] records, Alphabet fieldAlph, Alphabet valueAlph, PropertyList features) {
            return this.addMatchFeatures(records, fieldAlph, valueAlph, features,
                    this.exactMatchFields, MatchKind.EXACT, "_all_match", "_exists_match");
        }

        /** Adds {@code _all_approx_match} / {@code _exists_approx_match} features for the approx-match fields. */
        private PropertyList addApproxMatch(Record[] records, Alphabet fieldAlph, Alphabet valueAlph, PropertyList features) {
            return this.addMatchFeatures(records, fieldAlph, valueAlph, features,
                    this.approxMatchFields, MatchKind.APPROX, "_all_approx_match", "_exists_approx_match");
        }

        /** Adds {@code _all_substring_match} / {@code _exists_substring_match} features for the substring-match fields. */
        private PropertyList addSubstringMatch(Record[] records, Alphabet fieldAlph, Alphabet valueAlph, PropertyList features) {
            return this.addMatchFeatures(records, fieldAlph, valueAlph, features,
                    this.substringMatchFields, MatchKind.SUBSTRING, "_all_substring_match", "_exists_substring_match");
        }

        /**
         * Compares every unordered record pair on each field and emits the "all" and "exists" features.
         * Each pair contributes at most one match, and the features are decided only after all pairs
         * of a field have been compared.
         */
        private PropertyList addMatchFeatures(Record[] records, Alphabet fieldAlph, Alphabet valueAlph, PropertyList features,
                int[] fields, MatchKind kind, String allSuffix, String existsSuffix) {
            if (fields == null) {
                return features;
            }
            for (int fi = 0; fi < fields.length; fi++) {
                int field = fields[fi];
                int matches = 0;
                int comparisons = 0;
                for (int i = 0; i < records.length; i++) {
                    FeatureVector valsi = records[i].values(field);
                    if (valsi == null) {
                        continue;
                    }
                    for (int j = i + 1; j < records.length; j++) {
                        FeatureVector valsj = records[j].values(field);
                        if (valsj == null) {
                            continue;
                        }
                        ++comparisons;
                        if (this.valuesMatch(kind, valsi, valsj, valueAlph)) {
                            ++matches;
                        }
                    }
                }
                // "all" requires more than one comparison, so it is never emitted for a single pair.
                if (matches == comparisons && comparisons > 1) {
                    features = PropertyList.add(fieldAlph.lookupObject(field) + allSuffix, 1.0, features);
                }
                if (matches > 0) {
                    features = PropertyList.add(fieldAlph.lookupObject(field) + existsSuffix, 1.0, features);
                }
            }
            return features;
        }

        /** Dispatches to the pair predicate for {@code kind}. */
        private boolean valuesMatch(MatchKind kind, FeatureVector valsi, FeatureVector valsj, Alphabet valueAlph) {
            switch (kind) {
                case EXACT:
                    return this.hasExactMatch(valsi, valsj, valueAlph);
                case APPROX:
                    return this.hasApproxMatch(valsi, valsj, valueAlph);
                case SUBSTRING:
                    return this.hasSubstringMatch(valsi, valsj, valueAlph);
                default:
                    throw new IllegalStateException("Unknown match kind: " + kind);
            }
        }

        /** True when some value of {@code valsi} is also contained in {@code valsj}. */
        private boolean hasExactMatch(FeatureVector valsi, FeatureVector valsj, Alphabet valueAlph) {
            for (int ii = 0; ii < valsi.numLocations(); ii++) {
                if (valsj.contains(valueAlph.lookupObject(valsi.indexAtLocation(ii)))) {
                    return true;
                }
            }
            return false;
        }

        /** True when some value pair has a Levenshtein distance strictly below {@code approxMatchThreshold}. */
        private boolean hasApproxMatch(FeatureVector valsi, FeatureVector valsj, Alphabet valueAlph) {
            for (int ii = 0; ii < valsi.numLocations(); ii++) {
                String si = (String) valueAlph.lookupObject(valsi.indexAtLocation(ii));
                for (int jj = 0; jj < valsj.numLocations(); jj++) {
                    String sj = (String) valueAlph.lookupObject(valsj.indexAtLocation(jj));
                    if (Strings.levenshteinDistance(si, sj) < this.approxMatchThreshold) {
                        return true;
                    }
                }
            }
            return false;
        }

        /**
         * True when some value of one record contains a value of the other. Values shorter than
         * {@link #MIN_SUBSTRING_VALUE_LENGTH} characters are skipped on both sides.
         */
        private boolean hasSubstringMatch(FeatureVector valsi, FeatureVector valsj, Alphabet valueAlph) {
            for (int ii = 0; ii < valsi.numLocations(); ii++) {
                String si = (String) valueAlph.lookupObject(valsi.indexAtLocation(ii));
                if (si.length() < MIN_SUBSTRING_VALUE_LENGTH) {
                    continue;
                }
                for (int jj = 0; jj < valsj.numLocations(); jj++) {
                    String sj = (String) valueAlph.lookupObject(valsj.indexAtLocation(jj));
                    if (sj.length() < MIN_SUBSTRING_VALUE_LENGTH) {
                        continue;
                    }
                    if (si.contains(sj) || sj.contains(si)) {
                        return true;
                    }
                }
            }
            return false;
        }
    }
}

