/*
 * Decompiled with CFR 0.152.
 */
package org.icij.extract.extractor;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Writer;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Objects;
import java.util.function.Function;
import org.apache.commons.io.TaggedIOException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.icij.extract.document.DocumentFactory;
import org.icij.extract.document.PathIdentifier;
import org.icij.extract.document.TikaDocument;
import org.icij.extract.extractor.EmbedBlocker;
import org.icij.extract.extractor.EmbedParser;
import org.icij.extract.extractor.EmbedSpawner;
import org.icij.extract.extractor.ExtractionStatus;
import org.icij.extract.parser.CachingTesseractOCRParser;
import org.icij.extract.parser.FallbackParser;
import org.icij.extract.parser.HTML5Serializer;
import org.icij.extract.parser.ParsingReader;
import org.icij.extract.report.Reporter;
import org.icij.spewer.MetadataTransformer;
import org.icij.spewer.Spewer;
import org.icij.task.Options;
import org.icij.task.annotation.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

@org.icij.task.annotation.Options(value={@Option(name="digestMethod", description="The hash digest method used for documents, for example \"SHA256\".", parameter="name"), @Option(name="outputFormat", description="Set the output format. Either \"text\" or \"HTML\". Defaults to text output.", parameter="type"), @Option(name="embedHandling", description="Set the embed handling mode. Either \"ignore\", \"concatenate\" or \"spawn\". When set to concatenate, embeds are parsed and the output is in-lined into the main output.Defaults to spawning, which spawns new documents for each embedded document encountered.", parameter="type"), @Option(name="embedOutput", description="Path to a directory for outputting attachments en masse.", parameter="path"), @Option(name="ocrCache", description="Output path for OCR cache files.", parameter="path"), @Option(name="ocrLanguage", description="Set the languages used by Tesseract. Multiple  languages may be specified, separated by plus characters. Tesseract uses 3-character ISO 639-2 language codes.", parameter="language"), @Option(name="ocrTimeout", description="Set the timeout for the Tesseract process to finish e.g. \"5s\" or \"1m\". Defaults to 12 hours.", parameter="duration"), @Option(name="ocr", description="Enable or disable automatic OCR. On by default.")})
public class Extractor {
    private static final Logger logger = LoggerFactory.getLogger(Extractor.class);
    private boolean ocrDisabled = false;
    private DigestingParser.Digester digester = null;
    private Parser defaultParser = TikaConfig.getDefaultConfig().getParser();
    private final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    private final PDFParserConfig pdfConfig = new PDFParserConfig();
    private final DocumentFactory documentFactory;
    private OutputFormat outputFormat = OutputFormat.TEXT;
    private EmbedHandling embedHandling = EmbedHandling.getDefault();
    private Path embedOutput = null;

    public Extractor(DocumentFactory factory) {
        this.documentFactory = factory;
        this.setDigestAlgorithm(CommonsDigester.DigestAlgorithm.SHA256.toString());
        this.pdfConfig.setExtractInlineImages(true);
        this.pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
        this.pdfConfig.setExtractUniqueInlineImagesOnly(false);
        this.setOcrTimeout(Duration.ofDays(1L));
        this.ocrConfig.setEnableImageProcessing(0);
        this.ocrConfig.setLanguage("eng");
    }

    public Extractor() {
        this(new DocumentFactory().withIdentifier(new PathIdentifier()));
    }

    public Extractor configure(Options<String> options) {
        this.documentFactory.configure(options);
        options.get("outputFormat").parse().asEnum(OutputFormat::parse).ifPresent(this::setOutputFormat);
        options.get("embedHandling").parse().asEnum(EmbedHandling::parse).ifPresent(this::setEmbedHandling);
        options.get("embedOutput").parse().asPath().ifPresent(this::setEmbedOutputPath);
        options.get("ocrLanguage").value().ifPresent(this::setOcrLanguage);
        options.get("ocrTimeout").parse().asDuration().ifPresent(this::setOcrTimeout);
        options.get("digestMethod").value().ifPresent(this::setDigestAlgorithm);
        if (options.get("ocr").parse().isOff()) {
            this.disableOcr();
        }
        options.get("ocrCache").parse().asPath().ifPresent(path -> this.replaceParser(TesseractOCRParser.class, (Parser)new CachingTesseractOCRParser((Path)path)));
        return this;
    }

    public void setOutputFormat(OutputFormat outputFormat) {
        this.outputFormat = outputFormat;
    }

    public OutputFormat getOutputFormat() {
        return this.outputFormat;
    }

    public void setEmbedHandling(EmbedHandling embedHandling) {
        this.embedHandling = embedHandling;
    }

    public EmbedHandling getEmbedHandling() {
        return this.embedHandling;
    }

    public void setEmbedOutputPath(Path embedOutput) {
        this.embedOutput = embedOutput;
    }

    public Path getEmbedOutputPath() {
        return this.embedOutput;
    }

    public void setOcrLanguage(String ocrLanguage) {
        this.ocrConfig.setLanguage(ocrLanguage);
    }

    private void setOcrTimeout(int ocrTimeout) {
        this.ocrConfig.setTimeout(ocrTimeout);
    }

    public void setOcrTimeout(Duration duration) {
        this.setOcrTimeout(Math.toIntExact(duration.getSeconds()));
    }

    public void setDigestAlgorithm(String digestAlgorithm) {
        this.digester = new CommonsDigester(0x1400000, digestAlgorithm);
    }

    public void setDigester(DigestingParser.Digester digester) {
        this.digester = digester;
    }

    public void disableOcr() {
        if (!this.ocrDisabled) {
            this.excludeParser(TesseractOCRParser.class);
            this.ocrDisabled = true;
            this.pdfConfig.setExtractInlineImages(false);
        }
    }

    public void extract(Path path, Spewer spewer) throws IOException {
        long before = System.currentTimeMillis();
        TikaDocument document = this.extract(path);
        logger.info("{} extracted in {}ms", (Object)path, (Object)(System.currentTimeMillis() - before));
        spewer.write(document);
    }

    public void extract(Path path, Spewer spewer, Reporter reporter) {
        Objects.requireNonNull(reporter);
        if (reporter.skip(path)) {
            logger.info(String.format("File already extracted; skipping: \"%s\".", path));
            return;
        }
        ExtractionStatus status = ExtractionStatus.SUCCESS;
        Exception exception = null;
        try {
            this.extract(path, spewer);
        }
        catch (Exception e) {
            status = this.status(e, spewer);
            this.log(e, status, path);
            exception = e;
        }
        if (null != exception && exception instanceof TaggedIOException) {
            exception = ((TaggedIOException)exception).getCause();
        }
        reporter.save(path, status, exception);
    }

    private void log(Exception e, ExtractionStatus status, Path file) {
        switch (status) {
            case FAILURE_NOT_SAVED: {
                logger.error(String.format("The extraction result could not be outputted: \"%s\".", file), e.getCause());
                break;
            }
            case FAILURE_NOT_FOUND: {
                logger.error(String.format("File not found: \"%s\".", file), (Throwable)e);
                break;
            }
            case FAILURE_NOT_DECRYPTED: {
                logger.warn(String.format("Skipping encrypted file: \"%s\".", file), (Throwable)e);
                break;
            }
            case FAILURE_NOT_PARSED: {
                logger.error(String.format("The file could not be parsed: \"%s\".", file), (Throwable)e);
                break;
            }
            case FAILURE_UNREADABLE: {
                logger.error(String.format("The file stream could not be read: \"%s\".", file), (Throwable)e);
                break;
            }
            default: {
                logger.error(String.format("Unknown exception during extraction or output: \"%s\".", file), (Throwable)e);
            }
        }
    }

    private ExtractionStatus status(Exception e, Spewer spewer) {
        if (TaggedIOException.isTaggedWith((Throwable)e, (Object)spewer)) {
            return ExtractionStatus.FAILURE_NOT_SAVED;
        }
        if (TaggedIOException.isTaggedWith((Throwable)e, MetadataTransformer.class)) {
            return ExtractionStatus.FAILURE_NOT_PARSED;
        }
        if (e instanceof FileNotFoundException) {
            return ExtractionStatus.FAILURE_NOT_FOUND;
        }
        if (!(e instanceof IOException)) {
            return ExtractionStatus.FAILURE_UNKNOWN;
        }
        Throwable cause = e.getCause();
        if (cause instanceof EncryptedDocumentException) {
            return ExtractionStatus.FAILURE_NOT_DECRYPTED;
        }
        if (cause instanceof TikaException) {
            return ExtractionStatus.FAILURE_NOT_PARSED;
        }
        return ExtractionStatus.FAILURE_UNREADABLE;
    }

    public TikaDocument extract(Path path) throws IOException {
        TikaDocument rootDocument = this.documentFactory.create(path);
        TikaInputStream tikaInputStream = TikaInputStream.get((Path)path, (Metadata)rootDocument.getMetadata());
        ParseContext context = new ParseContext();
        AutoDetectParser autoDetectParser = new AutoDetectParser(new Parser[]{this.defaultParser});
        autoDetectParser.setFallback(FallbackParser.INSTANCE);
        Object parser = null != this.digester ? new DigestingParser((Parser)autoDetectParser, this.digester) : autoDetectParser;
        if (!this.ocrDisabled) {
            context.set(TesseractOCRConfig.class, (Object)this.ocrConfig);
        }
        context.set(PDFParserConfig.class, (Object)this.pdfConfig);
        context.set(HtmlMapper.class, (Object)DefaultHtmlMapper.INSTANCE);
        Function<Writer, ContentHandler> handler = OutputFormat.HTML == this.outputFormat ? writer -> new ExpandedTitleContentHandler((ContentHandler)new HTML5Serializer((Writer)writer)) : WriteOutContentHandler::new;
        if (EmbedHandling.SPAWN == this.embedHandling) {
            context.set(Parser.class, parser);
            context.set(EmbeddedDocumentExtractor.class, (Object)new EmbedSpawner(rootDocument, context, this.embedOutput, handler));
        } else if (EmbedHandling.CONCATENATE == this.embedHandling) {
            context.set(Parser.class, parser);
            context.set(EmbeddedDocumentExtractor.class, (Object)new EmbedParser(rootDocument, context));
        } else {
            context.set(Parser.class, (Object)EmptyParser.INSTANCE);
            context.set(EmbeddedDocumentExtractor.class, (Object)new EmbedBlocker());
        }
        Object reader = OutputFormat.HTML == this.outputFormat ? new ParsingReader((Parser)parser, (InputStream)tikaInputStream, rootDocument.getMetadata(), context, handler) : new org.apache.tika.parser.ParsingReader((Parser)parser, (InputStream)tikaInputStream, rootDocument.getMetadata(), context);
        rootDocument.setReader((Reader)reader);
        return rootDocument;
    }

    private void excludeParser(Class<? extends Parser> exclude) {
        this.replaceParser(exclude, null);
    }

    private void replaceParser(Class<? extends Parser> exclude, Parser replacement) {
        if (this.defaultParser instanceof CompositeParser) {
            CompositeParser composite = (CompositeParser)this.defaultParser;
            ArrayList parsers = new ArrayList();
            composite.getAllComponentParsers().forEach(parser -> {
                if (parser.getClass().equals(exclude) || exclude.isAssignableFrom(parser.getClass())) {
                    if (null != replacement) {
                        parsers.add(replacement);
                    }
                } else {
                    parsers.add(parser);
                }
            });
            this.defaultParser = new CompositeParser(composite.getMediaTypeRegistry(), parsers);
        }
    }

    public static enum EmbedHandling {
        CONCATENATE,
        SPAWN,
        IGNORE;


        public static EmbedHandling parse(String outputFormat) {
            return EmbedHandling.valueOf(outputFormat.toUpperCase(Locale.ROOT));
        }

        public static EmbedHandling getDefault() {
            return SPAWN;
        }
    }

    public static enum OutputFormat {
        HTML,
        TEXT;


        public static OutputFormat parse(String outputFormat) {
            return OutputFormat.valueOf(outputFormat.toUpperCase(Locale.ROOT));
        }
    }
}

