package com.ibm.avatar.algebra.util.document;

import com.ibm.avatar.algebra.datamodel.FieldGetter;
import com.ibm.avatar.algebra.datamodel.Span;
import com.ibm.avatar.algebra.datamodel.Text;
import com.ibm.avatar.algebra.datamodel.TextGetter;
import com.ibm.avatar.algebra.datamodel.Tuple;
import com.ibm.avatar.algebra.datamodel.TupleSchema;
import com.ibm.avatar.algebra.util.string.StringEscaper;
import com.ibm.avatar.algebra.util.test.ReservoirSampler;
import com.ibm.avatar.api.Constants;
import com.ibm.avatar.api.DocReader;
import com.ibm.avatar.api.exceptions.TextAnalyticsException;
import com.ibm.avatar.aql.AQLParserConstants;
import com.ibm.avatar.logging.Log;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipOutputStream;

/* loaded from: input_file:com/ibm/avatar/algebra/util/document/DocUtils.class */
/**
 * Static helpers for converting, sampling and inspecting document collections that are read
 * through {@link DocReader}.
 */
public class DocUtils {
    /**
     * Number of documents between two progress messages on {@code System.err}. Values less than
     * or equal to zero disable progress reporting.
     */
    public static int PROGRESS_INTERVAL_DOCS = 10000;

    /** Width, in characters, of one document-length bucket used by {@link #getEvenDocSample}. */
    private static final int BUCKET_WIDTH_CHARS = 100;

    /** Seed handed to every {@link ReservoirSampler} when the caller does not supply one. */
    private static final long DEFAULT_SEED = 42L;

    /**
     * Writes every document of a collection to a CSV file with the columns
     * {@code id,"label","text"}; label and text are passed through
     * {@link StringEscaper#escapeStr}.
     *
     * @param file document collection to read
     * @param file2 CSV file to create (written as UTF-8)
     * @throws Exception if reading the collection or writing the CSV file fails
     */
    public static void toCSV(File file, File file2) throws Exception {
        DocReader docReader = new DocReader(file);
        FieldGetter<Span> textGetter = docReader.getDocSchema().spanAcc("text");
        FieldGetter<Span> labelGetter = docReader.getDocSchema().spanAcc(Constants.LABEL_COL_NAME);

        long startMs = System.currentTimeMillis();
        long numChars = 0;
        int numDocs = 0;

        // try-with-resources: the file must be flushed and closed even when a document fails.
        try (FileOutputStream fileOut = new FileOutputStream(file2);
                OutputStreamWriter out = new OutputStreamWriter(fileOut, "UTF-8")) {
            while (docReader.hasNext()) {
                Tuple doc = docReader.next();
                String text = textGetter.getVal(doc).getText();
                String label = labelGetter.getVal(doc).getText();
                numDocs++;
                numChars += text.length();

                out.append(Long.toString(doc.getOid().getIDInType()));
                out.append(",\"");
                out.append(StringEscaper.escapeStr(label));
                out.append("\",\"");
                out.append(StringEscaper.escapeStr(text));
                out.append("\"\n");

                reportProgress(numDocs, numChars, startMs);
            }
        }
    }

    /**
     * Writes every document of a collection into a zip archive, one entry per document. The entry
     * name is the document label and the entry content is the document text encoded as UTF-8.
     * Documents whose entry cannot be written because of a {@link ZipException} are logged and
     * skipped.
     *
     * @param file document collection to read
     * @param file2 zip archive to create
     * @throws Exception if reading the collection or writing the archive fails for a reason other
     *         than a {@link ZipException} on a single entry
     */
    public static void toZip(File file, File file2) throws Exception {
        DocReader docReader = new DocReader(file);
        FieldGetter<Span> textGetter = docReader.getDocSchema().spanAcc("text");
        FieldGetter<Span> labelGetter = docReader.getDocSchema().spanAcc(Constants.LABEL_COL_NAME);

        long startMs = System.currentTimeMillis();
        long numChars = 0;
        int numDocs = 0;

        // try-with-resources: closing the stream is what writes the zip central directory, so it
        // has to happen on the failure path as well.
        try (FileOutputStream fileOut = new FileOutputStream(file2);
                ZipOutputStream zipOut = new ZipOutputStream(fileOut)) {
            while (docReader.hasNext()) {
                Tuple doc = docReader.next();
                String text = textGetter.getVal(doc).getText();
                String label = labelGetter.getVal(doc).getText();
                try {
                    zipOut.putNextEntry(new ZipEntry(label));
                    zipOut.write(text.getBytes("UTF-8"));
                    numDocs++;
                    numChars += text.length();
                    reportProgress(numDocs, numChars, startMs);
                } catch (ZipException e) {
                    Log.info("Skipping doc with label '%s' due to exception:\n%s", label, e.getMessage());
                }
            }
        }
        System.err.printf("Done.\n");
    }

    /**
     * Draws a reservoir sample from all documents of a collection, using a fixed seed.
     *
     * @param file document collection to read
     * @param i reservoir size passed to {@link ReservoirSampler}
     * @return the sampled document tuples
     * @throws Exception if the collection cannot be read or has no {@code text} column
     */
    public static ArrayList<Tuple> getDocSample(File file, int i) throws Exception {
        return getDocSample(file, i, DEFAULT_SEED, Integer.MAX_VALUE);
    }

    /**
     * Draws a reservoir sample from the first documents of a collection.
     *
     * @param file document collection to read
     * @param i reservoir size passed to {@link ReservoirSampler}
     * @param j seed passed to {@link ReservoirSampler}
     * @param i2 maximum number of documents to read from the collection
     * @return the sampled document tuples
     * @throws TextAnalyticsException if the document schema has no {@code text} column
     * @throws Exception if the collection cannot be read
     */
    public static ArrayList<Tuple> getDocSample(File file, int i, long j, int i2) throws Exception {
        DocReader docReader = new DocReader(file);
        if (!docReader.getDocSchema().containsField("text")) {
            throw new TextAnalyticsException("Document utility getDocSample() expects a schema with column '%s'.", "text");
        }
        TextGetter textGetter = docReader.getDocSchema().textAcc("text");
        ReservoirSampler sampler = new ReservoirSampler(i, j);

        long startMs = System.currentTimeMillis();
        long numChars = 0;
        int numDocs = 0;
        while (docReader.hasNext() && numDocs < i2) {
            // Keep the tuple in a local: it is needed both for sampling and for the statistics.
            Tuple doc = docReader.next();
            sampler.add(doc);
            numDocs++;
            numChars += textGetter.getVal(doc).getText().length();

            if (PROGRESS_INTERVAL_DOCS > 0 && 0 == numDocs % PROGRESS_INTERVAL_DOCS) {
                long elapsedMs = System.currentTimeMillis() - startMs;
                long megs = (numChars / 1024) / 1024;
                double elapsedSec = elapsedMs / 1000.0d;
                double megsPerSec = elapsedSec > 0.0d ? megs / elapsedSec : 0.0d;
                System.err.printf("Read %d documents (%d MB) in %1.2f sec (%1.2f MB/sec)...\n", numDocs, megs, elapsedSec, megsPerSec);
            }
        }
        return sampler.getReservoir();
    }

    /**
     * Draws a sample that is spread evenly over document lengths. Documents are grouped into
     * buckets of {@value #BUCKET_WIDTH_CHARS} characters of text length; sample slots are handed
     * out round-robin to the buckets that still have unallotted documents, and each bucket is then
     * reservoir-sampled in a second pass over the collection. Documents too long to fall into any
     * bucket are ignored.
     *
     * <p>Slots are handed out in whole passes over the buckets, so the result can exceed
     * {@code i} by up to the number of buckets minus one. When the collection holds fewer
     * bucketed documents than requested, all of them are returned.
     *
     * @param file document collection to read (it is read twice)
     * @param i requested (approximate) sample size
     * @return the sampled document tuples, ordered by bucket
     * @throws Exception if the collection cannot be read
     */
    public static ArrayList<Tuple> getEvenDocSample(File file, int i) throws Exception {
        FieldGetter<Text> textGetter = docTextAcc(file);

        // NOTE(review): the bucket count is taken from an AQL parser token constant, which looks
        // like a decompiler substitution for a literal (the original allotment loop hard-coded
        // 101) -- confirm and replace with a dedicated constant.
        int[] numAvail = new int[AQLParserConstants.SELECT];
        int[] numToTake = new int[numAvail.length];

        // Pass 1: count how many documents fall into each length bucket.
        DocReader countingReader = new DocReader(file);
        while (countingReader.hasNext()) {
            int bucket = textGetter.getVal(countingReader.next()).getText().length() / BUCKET_WIDTH_CHARS;
            if (bucket < numAvail.length) {
                numAvail[bucket]++;
            }
        }

        // Hand out sample slots round-robin. Every slot consumes one available document, and the
        // loop stops once a full pass allots nothing (all buckets exhausted or empty) so that it
        // cannot spin forever when fewer documents exist than were requested.
        int numAllotted = 0;
        boolean allottedInPass = true;
        while (numAllotted < i && allottedInPass) {
            allottedInPass = false;
            for (int bucket = 0; bucket < numAvail.length; bucket++) {
                if (numAvail[bucket] > 0) {
                    numToTake[bucket]++;
                    numAvail[bucket]--;
                    numAllotted++;
                    allottedInPass = true;
                }
            }
        }

        ReservoirSampler[] samplers = new ReservoirSampler[numToTake.length];
        for (int bucket = 0; bucket < samplers.length; bucket++) {
            samplers[bucket] = new ReservoirSampler(numToTake[bucket], DEFAULT_SEED);
        }

        // Pass 2: feed every document to the sampler of its bucket.
        DocReader samplingReader = new DocReader(file);
        while (samplingReader.hasNext()) {
            Tuple doc = samplingReader.next();
            int bucket = textGetter.getVal(doc).getText().length() / BUCKET_WIDTH_CHARS;
            if (bucket < samplers.length) {
                samplers[bucket].add(doc);
            }
        }

        ArrayList<Tuple> sample = new ArrayList<>();
        for (ReservoirSampler sampler : samplers) {
            sample.addAll(sampler.getReservoir());
        }
        return sample;
    }

    /**
     * Returns an accessor for the {@code text} column of a document collection's schema.
     *
     * @param file document collection whose schema is inspected
     * @return accessor for the {@code text} column
     * @throws Exception if the collection cannot be opened
     */
    public static FieldGetter<Text> docTextAcc(File file) throws Exception {
        return new DocReader(file).getDocSchema().textAcc("text");
    }

    /**
     * Returns an accessor for the label column ({@link Constants#LABEL_COL_NAME}) of a document
     * collection's schema.
     *
     * @param file document collection whose schema is inspected
     * @return accessor for the label column
     * @throws Exception if the collection cannot be opened
     */
    public static FieldGetter<Text> docLabelAcc(File file) throws Exception {
        return new DocReader(file).getDocSchema().textAcc(Constants.LABEL_COL_NAME);
    }

    /**
     * Collects an accessor for every field of a schema whose type reports itself as text.
     *
     * @param tupleSchema schema to scan
     * @return accessors for the text fields, in schema order; empty if there are none
     */
    public static ArrayList<TextGetter> getAllTextGetters(TupleSchema tupleSchema) {
        ArrayList<TextGetter> getters = new ArrayList<>();
        for (int ix = 0; ix < tupleSchema.size(); ix++) {
            if (tupleSchema.getFieldTypeByIx(ix).getIsText()) {
                getters.add(tupleSchema.textAcc(tupleSchema.getFieldNameByIx(ix)));
            }
        }
        return getters;
    }

    /**
     * Prints a throughput line to {@code System.err} every {@link #PROGRESS_INTERVAL_DOCS}
     * documents. The "MB" figure is derived from the character count of the document text, not
     * from encoded bytes.
     */
    private static void reportProgress(int numDocs, long numChars, long startMs) {
        if (PROGRESS_INTERVAL_DOCS <= 0 || 0 != numDocs % PROGRESS_INTERVAL_DOCS) {
            return;
        }
        long elapsedMs = System.currentTimeMillis() - startMs;
        long megs = (numChars / 1024) / 1024;
        long elapsedSec = elapsedMs / 1000;
        // Use floating point for the rate: whole seconds are 0 during the first second, and
        // dividing a long by that would throw ArithmeticException.
        double megsPerSec = elapsedMs > 0 ? megs / (elapsedMs / 1000.0d) : 0.0d;
        System.err.printf("Read %d documents (%d MB) in %d sec (%1.2f MB/sec)...\n", numDocs, megs, elapsedSec, megsPerSec);
    }
}
