/*
 * Decompiled with CFR 0.152.
 */
package org.imixs.archive.ocr;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.ejb.Stateless;
import javax.inject.Inject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.exceptions.PluginException;

/**
 * Stateless service that extracts textual content from the file attachments
 * of a workitem and stores the result in the file attribute
 * {@link #FILE_ATTRIBUTE_TEXT}.
 * <p>
 * PDF files are read directly with PDFBox. Other files (and PDFs, depending
 * on the OCR mode) are sent by HTTP PUT to the endpoint configured by
 * {@link #ENV_TIKA_SERVICE_ENDPOINT}; the plain-text response is used as the
 * extracted content.
 * <p>
 * OCR modes evaluated by this class: {@code OCR_ONLY}, {@code PDF_ONLY} and
 * {@code MIXED}.
 */
@Stateless
public class OCRService {
    public static final String FILE_ATTRIBUTE_TEXT = "text";
    public static final String DEFAULT_ENCODING = "UTF-8";
    public static final String PLUGIN_ERROR = "PLUGIN_ERROR";
    public static final String ENV_TIKA_SERVICE_ENDPOINT = "tika.service.endpoint";
    public static final String ENV_TIKA_SERVICE_MODE = "tika.service.mode";
    public static final String ENV_TIKA_OCR_MODE = "tika.ocr.mode";

    private static final String OCR_MODE_OCR_ONLY = "OCR_ONLY";
    private static final String OCR_MODE_PDF_ONLY = "PDF_ONLY";
    private static final String OCR_MODE_MIXED = "MIXED";
    /** PDF text shorter than this is treated as "no text found". */
    private static final int MIN_PDF_TEXT_LENGTH = 16;
    private static final String CHARSET_PARAM = "charset=";

    private static final Logger logger = Logger.getLogger(OCRService.class.getName());

    @Inject
    @ConfigProperty(name = ENV_TIKA_SERVICE_ENDPOINT)
    Optional<String> serviceEndpoint;

    @Inject
    @ConfigProperty(name = ENV_TIKA_SERVICE_MODE, defaultValue = "auto")
    String serviceMode;

    @Inject
    @ConfigProperty(name = ENV_TIKA_OCR_MODE, defaultValue = "PDF_AND_OCR")
    String ocrMode;

    /**
     * Extracts the text of all attachments of the given workitem using the
     * configured OCR mode and no additional options.
     *
     * @param workitem the workitem whose file data is processed
     * @param snapshot optional snapshot used to look up file content that is
     *                 not part of the workitem; may be {@code null}
     * @throws PluginException if an attachment could not be scanned
     */
    public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException {
        this.extractText(workitem, snapshot, this.ocrMode, null);
    }

    /**
     * Extracts the text of all attachments of the given workitem. Attachments
     * that already carry a {@link #FILE_ATTRIBUTE_TEXT} attribute, or for
     * which no content can be found, are skipped. When nothing could be
     * extracted, an empty string is stored.
     *
     * @param workitem the workitem whose file data is processed
     * @param snapshot optional snapshot used to look up file content; may be
     *                 {@code null}
     * @param _ocrmode OCR mode for this call; {@code null} selects the
     *                 configured mode
     * @param options  optional list of {@code key=value} header options
     *                 passed to the OCR endpoint; may be {@code null}
     * @throws PluginException if an I/O error occurs while scanning a file
     */
    public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrmode, List<String> options) throws PluginException {
        boolean debug = logger.isLoggable(Level.FINE);
        // Resolve the mode into a local: writing it back to the injected
        // field would leak a per-call override into later invocations
        // served by the same bean instance.
        String effectiveMode = _ocrmode != null ? _ocrmode : this.ocrMode;
        long start = System.currentTimeMillis();

        List<FileData> files = workitem.getFileData();
        for (FileData fileData : files) {
            if (fileData == null || hasOCRContent(fileData)) {
                continue;
            }
            FileData originFileData = fetchOriginFileData(fileData, snapshot);
            if (originFileData == null) {
                continue;
            }
            try {
                if (debug) {
                    logger.fine("...text extraction '" + originFileData.getName() + "'...");
                }
                String ocrContent = extractContent(originFileData, effectiveMode, options);
                if (ocrContent == null) {
                    logger.warning("Unable to extract ocr-content for '" + fileData.getName() + "'");
                    ocrContent = "";
                }
                List<Object> list = new ArrayList<>();
                list.add(ocrContent);
                fileData.setAttribute(FILE_ATTRIBUTE_TEXT, list);
            } catch (IOException e) {
                throw new PluginException(OCRService.class.getSimpleName(), PLUGIN_ERROR,
                        "Unable to scan attached document '" + fileData.getName() + "'", e);
            }
        }
        if (debug) {
            logger.fine("...extracted textual information in " + (System.currentTimeMillis() - start) + "ms");
        }
    }

    /**
     * Selects the extraction strategy for one file based on its type and the
     * OCR mode.
     *
     * @return the extracted text or {@code null} if nothing was extracted
     */
    private String extractContent(FileData originFileData, String mode, List<String> options) throws IOException {
        boolean debug = logger.isLoggable(Level.FINE);
        if (!isPDF(originFileData)) {
            // non-PDF files are only handled by the OCR endpoint
            return OCR_MODE_PDF_ONLY.equals(mode) ? null : doORCProcessing(originFileData, options);
        }
        if (OCR_MODE_OCR_ONLY.equals(mode)) {
            if (debug) {
                logger.fine("...force orc scan for pdfs...");
            }
            return doORCProcessing(originFileData, options);
        }
        String text = doPDFTextExtraction(originFileData);
        if (text != null && text.length() < MIN_PDF_TEXT_LENGTH) {
            text = null;
        }
        // NOTE(review): the configured default mode is "PDF_AND_OCR", which
        // does not match "MIXED" - with the default configuration this OCR
        // fallback is not taken for PDFs. Confirm the intended mode name.
        if (text == null && OCR_MODE_MIXED.equals(mode)) {
            text = doORCProcessing(originFileData, options);
        }
        return text;
    }

    /**
     * Returns {@code true} if the file data already holds a non-null first
     * entry in its {@link #FILE_ATTRIBUTE_TEXT} attribute.
     */
    private boolean hasOCRContent(FileData fileData) {
        if (fileData == null) {
            return false;
        }
        List<?> ocrContentList = (List<?>) fileData.getAttribute(FILE_ATTRIBUTE_TEXT);
        return ocrContentList != null && !ocrContentList.isEmpty() && ocrContentList.get(0) != null;
    }

    /**
     * Returns the file data object that actually holds the content: the
     * given file data itself if it has more than one byte of content,
     * otherwise the file of the same name from the snapshot.
     *
     * @return the file data with content, or {@code null} if none was found
     */
    private FileData fetchOriginFileData(FileData fileData, ItemCollection snapshot) {
        if (hasContent(fileData.getContent())) {
            return fileData;
        }
        if (snapshot != null) {
            FileData snapshotFileData = snapshot.getFileData(fileData.getName());
            if (snapshotFileData != null && hasContent(snapshotFileData.getContent())) {
                return snapshotFileData;
            }
        }
        logger.warning("no content found for fileData '" + fileData.getName() + "'!");
        return null;
    }

    /** Content of at most one byte is treated as "no content". */
    private boolean hasContent(byte[] content) {
        return content != null && content.length > 1;
    }

    /**
     * Sends the file content by HTTP PUT to the configured service endpoint
     * and returns the response body as text.
     *
     * @param fileData the file to be scanned
     * @param options  optional {@code key=value} entries that are sent as
     *                 request headers; only keys starting with
     *                 {@code X-Tika} are accepted, others are logged and
     *                 ignored
     * @return the response text, or {@code null} if no endpoint is
     *         configured, the content type is not accepted, the file has no
     *         content, or the server did not answer with a 2xx status
     * @throws IOException if the communication with the endpoint fails
     */
    public String doORCProcessing(FileData fileData, List<String> options) throws IOException {
        boolean debug = logger.isLoggable(Level.FINE);
        if (this.serviceEndpoint == null || !this.serviceEndpoint.isPresent() || this.serviceEndpoint.get().isEmpty()) {
            return null;
        }
        if (debug) {
            logger.fine("...ocr scanning....");
        }
        String contentType = adaptContentType(fileData);
        if (!acceptContentType(contentType)) {
            if (debug) {
                logger.fine("contentType '" + contentType + " is not supported by Tika Server");
            }
            return null;
        }
        byte[] content = fileData.getContent();
        if (content == null) {
            return null;
        }

        HttpURLConnection urlConnection = (HttpURLConnection) new URL(this.serviceEndpoint.get()).openConnection();
        urlConnection.setRequestMethod("PUT");
        urlConnection.setDoOutput(true);
        urlConnection.setDoInput(true);
        urlConnection.setAllowUserInteraction(false);
        urlConnection.setRequestProperty("Content-Type", contentType + "; charset=" + DEFAULT_ENCODING);
        urlConnection.setRequestProperty("Accept", "text/plain");
        applyTikaOptions(urlConnection, options);
        urlConnection.setRequestProperty("Content-Length", String.valueOf(content.length));

        // The request body is raw bytes; the stream is closed before the
        // response is read so it is released even if the write fails.
        try (OutputStream output = urlConnection.getOutputStream()) {
            output.write(content);
            output.flush();
        }

        int responseCode = urlConnection.getResponseCode();
        if (responseCode >= 200 && responseCode <= 299) {
            return readResponse(urlConnection, DEFAULT_ENCODING);
        }
        logger.warning("OCR service returned HTTP status " + responseCode + " for '" + fileData.getName() + "'");
        return null;
    }

    /**
     * Copies the given {@code key=value} options as request headers to the
     * connection. Entries without '=' or with a key not starting with
     * {@code X-Tika} are logged and skipped.
     */
    private void applyTikaOptions(HttpURLConnection urlConnection, List<String> options) {
        if (options == null || options.isEmpty()) {
            return;
        }
        for (String option : options) {
            if (option == null) {
                continue;
            }
            int separator = option.indexOf("=");
            if (separator < 0) {
                logger.warning("Invalid tika option : '" + option + "'  character '=' expeced!");
                continue;
            }
            String key = option.substring(0, separator);
            String value = option.substring(separator + 1);
            if (key.startsWith("X-Tika")) {
                urlConnection.setRequestProperty(key, value);
            } else {
                logger.warning("Invalid tika option : '" + option + "'  key must start with 'X-Tika'");
            }
        }
    }

    /**
     * Extracts the text of a PDF file with the PDFBox text stripper.
     *
     * @param fileData the PDF file
     * @return the extracted text, or {@code null} if the file has no content
     *         or the document could not be read (a warning is logged)
     */
    public String doPDFTextExtraction(FileData fileData) {
        boolean debug = logger.isLoggable(Level.FINE);
        if (debug) {
            logger.fine("...pdf text extraction....");
        }
        byte[] content = fileData.getContent();
        if (content == null) {
            return null;
        }
        String result = null;
        // try-with-resources closes the document exactly once
        try (PDDocument doc = PDDocument.load(content)) {
            result = new PDFTextStripper().getText(doc);
            if (debug) {
                logger.finest("<RESULT>" + result + "</RESULT>");
            }
        } catch (IOException e) {
            logger.warning("unable to load pdf : " + e.getMessage());
        }
        return result;
    }

    /**
     * Reads the response body of the connection line by line. Each line is
     * terminated with a single {@code '\n'} in the result.
     *
     * @param urlConnection the connection to read from
     * @param encoding      charset name used when the response does not
     *                      declare one; may be {@code null} or empty, in
     *                      which case the platform default is used
     * @return the response body
     * @throws IOException if reading fails or the charset is not supported
     */
    private String readResponse(URLConnection urlConnection, String encoding) throws IOException {
        boolean debug = logger.isLoggable(Level.FINE);
        if (debug) {
            logger.finest("......readResponse....");
        }
        String charsetName = resolveCharset(urlConnection, encoding);
        boolean hasCharset = charsetName != null && !charsetName.isEmpty();

        StringBuilder response = new StringBuilder();
        try (BufferedReader in = new BufferedReader(hasCharset
                ? new InputStreamReader(urlConnection.getInputStream(), charsetName)
                : new InputStreamReader(urlConnection.getInputStream()))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                if (debug) {
                    logger.finest("......" + inputLine);
                }
                response.append(inputLine).append("\n");
            }
        }
        return response.toString();
    }

    /**
     * Determines the charset of the response from the {@code charset}
     * parameter of its Content-Type header. The Content-Encoding header is
     * deliberately not used: it names a transfer coding such as
     * {@code gzip}, not a character set.
     *
     * @return the declared charset name, or {@code fallback} if none is
     *         declared
     */
    private String resolveCharset(URLConnection urlConnection, String fallback) {
        String contentType = urlConnection.getContentType();
        if (contentType != null) {
            for (String rawParam : contentType.split(";")) {
                String param = rawParam.trim();
                if (param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                    String declared = param.substring(CHARSET_PARAM.length()).replace("\"", "").trim();
                    if (!declared.isEmpty()) {
                        return declared;
                    }
                }
            }
        }
        return fallback;
    }

    /**
     * Returns {@code true} for every non-empty content type except
     * {@code application/octet-stream}.
     */
    private boolean acceptContentType(String contentType) {
        if (contentType == null || contentType.isEmpty()) {
            return false;
        }
        return !"application/octet-stream".equalsIgnoreCase(contentType);
    }

    /**
     * Returns the content type of the file. If it is missing or the wildcard
     * type, the type is derived from the file name: {@code application/pdf}
     * for names ending in {@code .pdf}, otherwise {@code application/xml}.
     */
    private String adaptContentType(FileData fileData) {
        String contentType = fileData.getContentType();
        if (contentType == null || contentType.isEmpty() || "*/*".equals(contentType)) {
            contentType = hasPdfExtension(fileData) ? "application/pdf" : "application/xml";
        }
        return contentType;
    }

    /**
     * Returns {@code true} if the file name ends with {@code .pdf} or the
     * content type contains {@code pdf}. A missing name or content type is
     * treated as "not a PDF" for that check.
     */
    private boolean isPDF(FileData fileData) {
        if (hasPdfExtension(fileData)) {
            return true;
        }
        String contentType = fileData.getContentType();
        return contentType != null && contentType.contains("pdf");
    }

    /** Case-insensitive check for a {@code .pdf} file name extension. */
    private boolean hasPdfExtension(FileData fileData) {
        String name = fileData.getName();
        return name != null && name.toLowerCase().endsWith(".pdf");
    }
}

