/*
 * Decompiled with CFR 0.152.
 */
package edu.washington.cs.knowitall.nlp;

import com.google.common.collect.Iterables;
import edu.washington.cs.knowitall.commonlib.FileUtils;
import edu.washington.cs.knowitall.extractor.ExtractorException;
import edu.washington.cs.knowitall.extractor.SentenceExtractor;
import edu.washington.cs.knowitall.nlp.ChunkedDocument;
import edu.washington.cs.knowitall.nlp.ChunkedSentence;
import edu.washington.cs.knowitall.nlp.ChunkerException;
import edu.washington.cs.knowitall.nlp.OpenNlpSentenceChunker;
import edu.washington.cs.knowitall.nlp.SentenceChunker;
import edu.washington.cs.knowitall.util.DefaultObjects;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;

/**
 * Reads a raw document (from a string, stream, or file), splits it into
 * sentences with a {@link SentenceExtractor}, chunks each sentence with a
 * {@link SentenceChunker}, and returns the result as a {@link ChunkedDocument}.
 */
public class ChunkedDocumentReader {
    /** Splits the raw document text into sentence strings. */
    private final SentenceExtractor sentExtractor;
    /** Turns a single sentence string into a {@link ChunkedSentence}. */
    private final SentenceChunker sentChunker;

    /**
     * Creates a reader that uses the given extractor and chunker.
     *
     * @param sentExtractor extractor used to split document text into sentences
     * @param sentChunker chunker applied to every extracted sentence
     * @throws IOException declared for compatibility with the other constructors
     */
    public ChunkedDocumentReader(SentenceExtractor sentExtractor, SentenceChunker sentChunker) throws IOException {
        this.sentExtractor = sentExtractor;
        this.sentChunker = sentChunker;
    }

    /**
     * Creates a reader that uses the given extractor and a new
     * {@link OpenNlpSentenceChunker}.
     *
     * @param sentExtractor extractor used to split document text into sentences
     * @throws IOException if the default chunker cannot be created
     */
    public ChunkedDocumentReader(SentenceExtractor sentExtractor) throws IOException {
        this(sentExtractor, new OpenNlpSentenceChunker());
    }

    /**
     * Creates a reader that uses the given chunker and the extractor returned by
     * {@link DefaultObjects#getDefaultHtmlSentenceExtractor()}.
     *
     * @param sentChunker chunker applied to every extracted sentence
     * @throws IOException if the default extractor cannot be created
     */
    public ChunkedDocumentReader(SentenceChunker sentChunker) throws IOException {
        this(DefaultObjects.getDefaultHtmlSentenceExtractor(), sentChunker);
    }

    /**
     * Creates a reader that uses the extractor returned by
     * {@link DefaultObjects#getDefaultHtmlSentenceExtractor()} and a new
     * {@link OpenNlpSentenceChunker}.
     *
     * @throws IOException if either default object cannot be created
     */
    public ChunkedDocumentReader() throws IOException {
        this(DefaultObjects.getDefaultHtmlSentenceExtractor(), new OpenNlpSentenceChunker());
    }

    /**
     * @return the sentence extractor this reader was constructed with
     */
    public SentenceExtractor getSentenceExtractor() {
        return this.sentExtractor;
    }

    /**
     * @return the sentence chunker this reader was constructed with
     */
    public SentenceChunker getSentenceChunker() {
        return this.sentChunker;
    }

    /**
     * Reads the document text from a stream and chunks it.
     *
     * <p>The stream is decoded with the platform default charset and piped into
     * an in-memory buffer via {@code FileUtils.pipe}. This method does not itself
     * close {@code input}; the caller remains responsible for it.
     *
     * @param input stream supplying the raw document text
     * @param id identifier stored on the returned document and used in error messages
     * @return the chunked document
     * @throws ExtractorException if reading the stream fails, or if sentence
     *         extraction or chunking fails
     */
    public ChunkedDocument readDocument(InputStream input, String id) throws ExtractorException {
        StringWriter writer = new StringWriter();
        // NOTE(review): no charset is given, so the platform default is used.
        // Left unchanged to keep decoding identical for existing callers.
        InputStreamReader reader = new InputStreamReader(input);
        try {
            FileUtils.pipe(reader, writer);
            return this.readDocument(writer.toString(), id);
        }
        catch (IOException e) {
            String msg = String.format("Could not read document %s", id);
            throw new ExtractorException(msg, e);
        }
    }

    /**
     * Reads and chunks the document stored in a file. The file's absolute path
     * is used as the document id.
     *
     * @param file file containing the raw document text
     * @return the chunked document
     * @throws ExtractorException if the file cannot be opened, read, or closed,
     *         or if sentence extraction or chunking fails
     */
    public ChunkedDocument readDocument(File file) throws ExtractorException {
        // try-with-resources guarantees the file handle is released on every
        // path; previously the FileInputStream was never closed.
        try (InputStream input = new FileInputStream(file)) {
            return this.readDocument(input, file.getAbsolutePath());
        }
        catch (IOException e) {
            String msg = String.format("Could not extract from %s", file);
            throw new ExtractorException(msg, e);
        }
    }

    /**
     * Splits the given text into sentences and chunks each one, preserving the
     * order in which the extractor produced them.
     *
     * @param docStr raw document text
     * @param id identifier stored on the returned document and used in error messages
     * @return the chunked document
     * @throws ExtractorException if sentence extraction fails, or if any
     *         sentence cannot be chunked (the chunker's exception is attached
     *         as the cause)
     */
    public ChunkedDocument readDocument(String docStr, String id) throws ExtractorException {
        // Materialize the sentences first so the result list can be pre-sized.
        ArrayList<String> sents = new ArrayList<String>();
        Iterables.addAll(sents, this.sentExtractor.extract(docStr));
        ArrayList<ChunkedSentence> chunkedSents = new ArrayList<ChunkedSentence>(sents.size());
        // 1-based position of the sentence currently being chunked; only
        // advanced after a successful chunk so the error names the failing one.
        int sentNum = 1;
        for (String sent : sents) {
            try {
                chunkedSents.add(this.sentChunker.chunkSentence(sent));
                ++sentNum;
            }
            catch (ChunkerException e) {
                String msg = String.format("Could not chunk sentence %s in document %s", sentNum, id);
                // Keep the original failure as the cause so its stack trace is not lost.
                throw new ExtractorException(msg, e);
            }
        }
        return new ChunkedDocument(id, chunkedSents);
    }
}

