/*
 * Decompiled with CFR 0.152.
 */
package org.apache.spark.ml.odkl.texts;

import org.apache.spark.annotation.DeveloperApi;
import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator$;
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator$$anonfun$1$;
import org.apache.spark.ml.param.DoubleParam;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.param.ParamPair;
import org.apache.spark.ml.param.ParamValidators$;
import org.apache.spark.ml.util.Identifiable;
import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.mllib.linalg.BLAS$;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors$;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Row$;
import org.apache.spark.sql.types.StructType;
import scala.Function1;
import scala.Predef$;
import scala.Serializable;
import scala.collection.Iterator;
import scala.collection.Seq;
import scala.collection.mutable.ArrayBuffer;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;
import scala.runtime.LongRef;

@ScalaSignature(bytes="\u0006\u0001}4A!\u0001\u0002\u0001\u001f\t)\u0002*Y:i\u0005\u0006\u001cX\r\u001a#fIV\u0004H.[2bi>\u0014(BA\u0002\u0005\u0003\u0015!X\r\u001f;t\u0015\t)a!\u0001\u0003pI.d'BA\u0004\t\u0003\tiGN\u0003\u0002\n\u0015\u0005)1\u000f]1sW*\u00111\u0002D\u0001\u0007CB\f7\r[3\u000b\u00035\t1a\u001c:h\u0007\u0001\u0019B\u0001\u0001\t\u00155A\u0011\u0011CE\u0007\u0002\r%\u00111C\u0002\u0002\f)J\fgn\u001d4pe6,'\u000f\u0005\u0002\u001615\taC\u0003\u0002\u0018\r\u0005)\u0001/\u0019:b[&\u0011\u0011D\u0006\u0002\u0007!\u0006\u0014\u0018-\\:\u0011\u0005maR\"\u0001\u0005\n\u0005uA!a\u0002'pO\u001eLgn\u001a\u0005\t?\u0001\u0011)\u0019!C!A\u0005\u0019Q/\u001b3\u0016\u0003\u0005\u0002\"A\t\u0015\u000f\u0005\r2S\"\u0001\u0013\u000b\u0003\u0015\nQa]2bY\u0006L!a\n\u0013\u0002\rA\u0013X\rZ3g\u0013\tI#F\u0001\u0004TiJLgn\u001a\u0006\u0003O\u0011B\u0001\u0002\f\u0001\u0003\u0002\u0003\u0006I!I\u0001\u0005k&$\u0007\u0005C\u0003/\u0001\u0011\u0005q&\u0001\u0004=S:LGO\u0010\u000b\u0003aI\u0002\"!\r\u0001\u000e\u0003\tAQaH\u0017A\u0002\u0005Bq\u0001\u000e\u0001C\u0002\u0013\u0005Q'A\ntS6LG.\u0019:jif$\u0006N]3tQ>dG-F\u00017!\t)r'\u0003\u00029-\tYAi\\;cY\u0016\u0004\u0016M]1n\u0011\u0019Q\u0004\u0001)A\u0005m\u0005!2/[7jY\u0006\u0014\u0018\u000e^=UQJ,7\u000f[8mI\u0002Bq\u0001\u0010\u0001C\u0002\u0013\u0005Q(\u0001\u0007j]B,HoQ8m\u0011\u0006\u001c\b.F\u0001?!\r)r(I\u0005\u0003\u0001Z\u0011Q\u0001U1sC6DaA\u0011\u0001!\u0002\u0013q\u0014!D5oaV$8i\u001c7ICND\u0007\u0005C\u0004E\u0001\t\u0007I\u0011A\u001f\u0002\u001d%t\u0007/\u001e;D_24Vm\u0019;pe\"1a\t\u0001Q\u0001\ny\nq\"\u001b8qkR\u001cu\u000e\u001c,fGR|'\u000f\t\u0005\u0006\u0011\u0002!\t!S\u0001\u0012g\u0016$\u0018J\u001c9vi\u000e{GNV3di>\u0014HC\u0001&L\u001b\u0005\u0001\u0001\"\u0002'H\u0001\u0004\t\u0013!\u0002<bYV,\u0007\"\u0002(\u0001\t\u0003y\u0015aD:fi&s\u0007/\u001e;D_2D\u0015m\u001d5\u0015\u0005)\u0003\u0006\"\u0002'N\u0001\u0004\t\u0003\"\u0002*\u0001\t\u0003\u0019\u0016!F:fiNKW.\u001b7be&$\u0018\u0010\u0016:fg\"|G\
u000e\u001a\u000b\u0003\u0015RCQ\u0001T)A\u0002U\u0003\"a\t,\n\u0005]##A\u0002#pk\ndW\rC\u0003/\u0001\u0011\u0005\u0011\fF\u00011\u0011\u0015Y\u0006\u0001\"\u0011]\u0003%!(/\u00198tM>\u0014X\u000e\u0006\u0002^GB\u0011a,Y\u0007\u0002?*\u0011\u0001\rC\u0001\u0004gFd\u0017B\u00012`\u0005%!\u0015\r^1Ge\u0006lW\rC\u0003e5\u0002\u0007Q,A\u0004eCR\f7/\u001a;\t\u000b\u0019\u0004A\u0011I4\u0002\u001fQ\u0014\u0018M\\:g_Jl7k\u00195f[\u0006$\"\u0001\u001b8\u0011\u0005%dW\"\u00016\u000b\u0005-|\u0016!\u0002;za\u0016\u001c\u0018BA7k\u0005)\u0019FO];diRK\b/\u001a\u0005\u0006_\u0016\u0004\r\u0001[\u0001\u0007g\u000eDW-\\1)\u0005\u0015\f\bC\u0001:v\u001b\u0005\u0019(B\u0001;\t\u0003)\tgN\\8uCRLwN\\\u0005\u0003mN\u0014A\u0002R3wK2|\u0007/\u001a:Ba&DQ\u0001\u001f\u0001\u0005Be\fAaY8qsR\u0011\u0001C\u001f\u0005\u0006w^\u0004\r\u0001`\u0001\u0006Kb$(/\u0019\t\u0003+uL!A \f\u0003\u0011A\u000b'/Y7NCB\u0004")
/*
 * Transformer that drops rows whose vector is too similar to a previously kept row
 * sharing the same hash value.
 *
 * transform() repartitions the input by the hash column, sorts each partition by that
 * column, and then walks every partition once. For each run of equal hash values it keeps
 * a buffer of the vectors already emitted; a row is kept only when the cosine similarity
 * between its vector and every buffered vector is strictly below similarityThreshold.
 * The schema is returned unchanged.
 *
 * Params declared here: similarityThreshold (default 0.9), inputColHash (default "hash")
 * and inputColVector (no default is registered in the constructor).
 */
public class HashBasedDeduplicator
extends Transformer {
    // Identifier of this transformer instance, supplied to the constructor.
    private final String uid;
    // Cosine-similarity cut-off; registered under the param name "simTresh".
    private final DoubleParam similarityThreshold;
    // Name of the column holding the hash value (read as a long in transform()).
    private final Param<String> inputColHash;
    // Name of the column holding the Vector that is compared for similarity.
    private final Param<String> inputColVector;

    /** Returns the uid this instance was constructed with. */
    public String uid() {
        return this.uid;
    }

    /** Returns the param holding the cosine-similarity threshold. */
    public DoubleParam similarityThreshold() {
        return this.similarityThreshold;
    }

    /** Returns the param holding the name of the hash column. */
    public Param<String> inputColHash() {
        return this.inputColHash;
    }

    /** Returns the param holding the name of the vector column. */
    public Param<String> inputColVector() {
        return this.inputColVector;
    }

    /**
     * Sets the name of the vector column.
     *
     * @param value column name to store in inputColVector
     * @return the result of set(...), cast to HashBasedDeduplicator, for chaining
     */
    public HashBasedDeduplicator setInputColVector(String value) {
        return (HashBasedDeduplicator)this.set(this.inputColVector(), value);
    }

    /**
     * Sets the name of the hash column.
     *
     * @param value column name to store in inputColHash
     * @return the result of set(...), cast to HashBasedDeduplicator, for chaining
     */
    public HashBasedDeduplicator setInputColHash(String value) {
        return (HashBasedDeduplicator)this.set(this.inputColHash(), value);
    }

    /**
     * Sets the cosine-similarity threshold. The method name is spelled "Treshold";
     * that spelling is part of the public interface of this class.
     *
     * @param value threshold to store in similarityThreshold (boxed before being set)
     * @return the result of set(...), cast to HashBasedDeduplicator, for chaining
     */
    public HashBasedDeduplicator setSimilarityTreshold(double value) {
        return (HashBasedDeduplicator)this.set((Param)this.similarityThreshold(), BoxesRunTime.boxToDouble((double)value));
    }

    /**
     * Removes near-duplicate rows within each run of equal hash values.
     *
     * @param dataset input DataFrame; must contain the columns named by inputColHash
     *                (read as a long) and inputColVector (read as a Vector)
     * @return a DataFrame built from the surviving rows with the schema returned by
     *         transformSchema(dataset.schema())
     */
    public DataFrame transform(DataFrame dataset) {
        // Co-locate rows by hash column and order each partition by it, so that rows with
        // the same hash are adjacent when the partition iterator is walked below.
        RDD qual$1 = dataset.repartition((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{dataset.col((String)this.$(this.inputColHash()))})).sortWithinPartitions((String)this.$(this.inputColHash()), (Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[0])).rdd();
        // Per-partition function: maps every row either to itself (kept) or to Row.empty (dropped).
        Serializable x$2 = new Serializable(this){
            public static final long serialVersionUID = 0L;
            private final /* synthetic */ HashBasedDeduplicator $outer;

            public final Iterator<Row> apply(Iterator<Row> f) {
                Iterator iterator;
                if (f.hasNext()) {
                    // Mutable state shared by the row-level function for this partition:
                    // the hash of the current run (starts at -1) and the vectors kept so far in that run.
                    LongRef curHash = LongRef.create((long)-1L);
                    ArrayBuffer vectorsBuffer = new ArrayBuffer(0);
                    iterator = f.map((Function1)new Serializable(this, curHash, vectorsBuffer){
                        public static final long serialVersionUID = 0L;
                        private final /* synthetic */ $anonfun$1 $outer;
                        private final LongRef curHash$1;
                        private final ArrayBuffer vectorsBuffer$1;

                        public final Row apply(Row it) {
                            Row row;
                            long newHash = BoxesRunTime.unboxToLong((Object)it.getAs((String)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$(this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().inputColHash())));
                            if (newHash == this.curHash$1.elem) {
                                // Same hash as the current run: compare against every vector already kept.
                                Vector currentVector = (Vector)it.getAs((String)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$(this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().inputColVector()));
                                boolean isUnique = this.vectorsBuffer$1.forall((Function1)new Serializable(this, currentVector){
                                    public static final long serialVersionUID = 0L;
                                    private final /* synthetic */ $anonfun$1$$anonfun$apply$1 $outer;
                                    private final Vector currentVector$1;

                                    public final boolean apply(Vector storedVector) {
                                        // Cosine similarity (dot / product of L2 norms) must be strictly below the threshold.
                                        // NOTE(review): if either norm is 0 the quotient is presumably NaN, the comparison
                                        // is then false and the row is treated as a duplicate - confirm this is intended.
                                        return BLAS$.MODULE$.dot(storedVector, this.currentVector$1) / (Vectors$.MODULE$.norm(storedVector, 2.0) * Vectors$.MODULE$.norm(this.currentVector$1, 2.0)) < BoxesRunTime.unboxToDouble((Object)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$anonfun$$$outer().org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$((Param)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$anonfun$$$outer().org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().similarityThreshold()));
                                    }
                                    {
                                        if ($outer == null) {
                                            throw null;
                                        }
                                        this.$outer = $outer;
                                        this.currentVector$1 = currentVector$1;
                                    }
                                });
                                if (isUnique) {
                                    // Dissimilar to everything kept so far: remember the vector and keep the row.
                                    this.vectorsBuffer$1.append((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Vector[]{currentVector}));
                                    row = it;
                                } else {
                                    // Too similar to a kept vector: emit the empty-row sentinel, filtered out later.
                                    row = Row$.MODULE$.empty();
                                }
                            } else {
                                // Hash changed: start a new run seeded with this row's vector; the row is always kept.
                                this.vectorsBuffer$1.clear();
                                this.vectorsBuffer$1.append((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Vector[]{(Vector)it.getAs((String)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$(this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().inputColVector()))}));
                                this.curHash$1.elem = newHash;
                                row = it;
                            }
                            return row;
                        }

                        public /* synthetic */ $anonfun$1 org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$anonfun$$$outer() {
                            return this.$outer;
                        }
                        {
                            if ($outer == null) {
                                throw null;
                            }
                            this.$outer = $outer;
                            this.curHash$1 = curHash$1;
                            this.vectorsBuffer$1 = vectorsBuffer$1;
                        }
                    });
                } else {
                    // Empty partition: return an empty iterator of rows.
                    iterator = Predef$.MODULE$.refArrayOps((Object[])new Row[0]).toIterator();
                }
                return iterator;
            }

            public /* synthetic */ HashBasedDeduplicator org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer() {
                return this.$outer;
            }
            {
                if ($outer == null) {
                    throw null;
                }
                this.$outer = $outer;
            }
        };
        boolean x$3 = qual$1.mapPartitions$default$2();
        // Drop the Row.empty sentinels produced for duplicates and rebuild a DataFrame with the (unchanged) schema.
        return dataset.sqlContext().createDataFrame(qual$1.mapPartitions((Function1)x$2, x$3, ClassTag$.MODULE$.apply(Row.class)).filter((Function1)new Serializable(this){
            public static final long serialVersionUID = 0L;

            public final boolean apply(Row x$1) {
                return !x$1.equals((Object)Row$.MODULE$.empty());
            }
        }), this.transformSchema(dataset.schema()));
    }

    /**
     * Returns the given schema as-is; this transformer neither adds nor removes columns.
     *
     * @param schema input schema
     * @return the same schema instance
     */
    @DeveloperApi
    public StructType transformSchema(StructType schema) {
        return schema;
    }

    /**
     * Copies this transformer by delegating to defaultCopy.
     *
     * @param extra extra params passed through to defaultCopy
     * @return the result of defaultCopy(extra), cast to Transformer
     */
    public Transformer copy(ParamMap extra) {
        return (Transformer)this.defaultCopy(extra);
    }

    /**
     * Creates the transformer with an explicit uid, declares its three params and registers
     * defaults for inputColHash ("hash") and similarityThreshold (0.9).
     *
     * @param uid identifier stored for this instance
     */
    public HashBasedDeduplicator(String uid) {
        this.uid = uid;
        // Validator is inRange(0.0, 1.0, false, true); presumably this means 0 is excluded and 1 is
        // included - confirm against the ParamValidators documentation.
        this.similarityThreshold = new DoubleParam((Identifiable)this, "simTresh", "cosine similarity Treshold for dedupolication in one hash-bucket for vectors to be marked as 'similar' \n 0.9 by default", ParamValidators$.MODULE$.inRange(0.0, 1.0, false, true));
        this.inputColHash = new Param((Identifiable)this, "inputColHash", "column with LSH(local sensitive hashing) as Long \n \"hash\" by default");
        this.inputColVector = new Param((Identifiable)this, "inputColVector", "column with Vector data representation");
        // Only inputColHash and similarityThreshold receive defaults; inputColVector does not.
        this.setDefault((Seq)Predef$.MODULE$.wrapRefArray((Object[])new ParamPair[]{new ParamPair(this.inputColHash(), (Object)"hash"), new ParamPair((Param)this.similarityThreshold(), (Object)BoxesRunTime.boxToDouble((double)0.9))}));
    }

    /** Creates the transformer with a uid generated by Identifiable.randomUID("hashBasedDeduplication"). */
    public HashBasedDeduplicator() {
        this(Identifiable$.MODULE$.randomUID("hashBasedDeduplication"));
    }
}

