/*******************************************************************************
 *  Imixs Workflow Technology
 *  Copyright (C) 2001, 2008 Imixs Software Solutions GmbH,  
 *  http://www.imixs.com
 *  
 *  This program is free software; you can redistribute it and/or 
 *  modify it under the terms of the GNU General Public License 
 *  as published by the Free Software Foundation; either version 2 
 *  of the License, or (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful, 
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 *  General Public License for more details.
 *  
 *  You can receive a copy of the GNU General Public
 *  License at http://www.gnu.org/licenses/gpl.html
 *  
 *  Contributors:  
 *  	Imixs Software Solutions GmbH - initial API and implementation
 *  	Ralph Soika
 *******************************************************************************/
package org.imixs.ml.service;

import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.ejb.Stateless;
import javax.enterprise.event.Event;
import javax.inject.Inject;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;

import org.imixs.melman.RestAPIException;
import org.imixs.melman.WorkflowClient;
import org.imixs.ml.api.TrainingApplication;
import org.imixs.ml.core.MLClient;
import org.imixs.ml.core.MLContentBuilder;
import org.imixs.ml.events.EntityObjectEvent;
import org.imixs.ml.training.TrainingDataBuilder;
import org.imixs.ml.xml.XMLTrainingData;
import org.imixs.ml.xml.XMLTrainingEntity;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.exceptions.PluginException;

/**
 * The TrainingService loads a list of documents and creates a TrainingData object
 * for each document.
 * <p>
 * The TrainingService extracts the text contained in a document attached to a
 * snapshot workitem. The file content is created by an OCR scan.
 * <p>
 * If a valuable training data set can be built for a document, then the
 * XMLTrainingData object is sent to the Imixs-ML service to train an ML model.
 * 
 * 
 * @version 1.0
 * @author rsoika
 */

@Stateless
public class TrainingService {
    private static final Logger logger = Logger.getLogger(TrainingService.class.getName());

    /** Name of the file attribute that holds the OCR text of an attachment. */
    public static final String FILE_ATTRIBUTE_TEXT = "text";

    /** Name of the optional config item providing a filename regex pattern. */
    private static final String ITEM_FILENAME_PATTERN = "filename.pattern";

    @Inject
    TikaHelperService tikaService;

    @Inject
    protected Event<EntityObjectEvent> entityObjectEvents;

    /**
     * Extracts the text contained in the attachments of a workitem, builds a
     * training data set for the configured entities and - if the quality of the
     * training data is sufficient for the configured quality level - posts the
     * training data to the Imixs-ML training endpoint.
     * <p>
     * Quality handling:
     * <ul>
     * <li>BAD training data is only accepted (and reported as PARTIAL) if the
     * configured quality level is 'REDUCED'.</li>
     * <li>PARTIAL training data is rejected (and reported as BAD) if the
     * configured quality level is 'FULL' (the default).</li>
     * </ul>
     * A {@link PluginException} or {@link RestAPIException} is logged and not
     * propagated to the caller.
     * 
     * @param config         - a config object providing the training configuration
     * @param workitem       - a workitem providing the data
     * @param workflowClient - a rest client instance used to load the snapshot if
     *                       no OCR content is available yet
     * @return the resulting quality level, or -1 if processing failed before a
     *         quality level could be computed
     */
    @SuppressWarnings("unchecked")
    public int trainWorkitemData(ItemCollection config, ItemCollection workitem, WorkflowClient workflowClient) {
        boolean debug = logger.isLoggable(Level.FINE);
        int qualityResult = -1;

        logger.info("...create new training data for: " + workitem.getUniqueID());

        String model = config.getItemValueString(TrainingApplication.ITEM_ML_ANALYSE_MODEL);
        List<String> trainingItemNames = config.getItemValue(TrainingApplication.ITEM_ENTITIES);
        List<String> tikaOptions = config.getItemValue(TrainingApplication.ITEM_TIKA_OPTIONS);
        String ocrMode = config.getItemValueString(TrainingApplication.ITEM_TIKA_OCR_MODE);
        String qualityLevel = config.getItemValueString(TrainingApplication.ITEM_ML_TRAINING_QUALITYLEVEL);
        if (qualityLevel.isEmpty()) {
            qualityLevel = "FULL"; // default level!
        }
        // parse optional filename regex pattern...
        Pattern mlFilenamePattern = compileFilenamePattern(config);

        // build locales....
        List<String> sLocales = config.getItemValue(TrainingApplication.ITEM_LOCALES);
        List<Locale> locals = new ArrayList<>();
        for (String _locale : sLocales) {
            Locale aLocale = new Locale(_locale);
            locals.add(aLocale);
            if (debug) {
                logger.finest("......suporting locale " + aLocale);
            }
        }

        logger.info("......model=" + model);
        logger.info("......qualityLevel=" + qualityLevel);
        logger.info("......ocrMode=" + ocrMode);
        logger.info("......locales=" + Arrays.toString(sLocales.toArray()));

        try {
            // update ocr information if needed....
            workitem = doVerifyOCRContent(workitem, mlFilenamePattern, workflowClient, tikaOptions);

            // build the ml content....
            String ocrText = new MLContentBuilder(workitem, null, false, mlFilenamePattern).build();
            if (ocrText == null || ocrText.isEmpty()) {
                // without any text there is nothing to train
                return XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_BAD;
            }

            logger.fine("extracted text content to be analysed=");
            logger.fine(ocrText);

            // build training data set...
            XMLTrainingData trainingData = new TrainingDataBuilder(ocrText, workitem, trainingItemNames, locals)
                    .setAnalyzerEntityEvents(entityObjectEvents).build();

            int quality = trainingData.getQuality();
            qualityResult = quality;
            // we only send the training data in case of quality level is sufficient
            if (XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_BAD == quality) {
                if ("REDUCED".equalsIgnoreCase(qualityLevel)) {
                    logger.info("...document '" + workitem.getUniqueID()
                            + "' TRAININGDATA_QUALITY_LEVEL=BAD but REDUCED is accepted - document will be trained...");
                    // upgrade the result so that the data is posted below
                    qualityResult = XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_PARTIAL;
                } else {
                    logger.severe("...document '" + workitem.getUniqueID()
                            + "' TRAININGDATA_QUALITY_LEVEL=BAD - document will be ignored!");
                }
            } else {
                if (XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_PARTIAL == quality
                        && "FULL".equalsIgnoreCase(qualityLevel)) {
                    logger.severe("...document '" + workitem.getUniqueID()
                            + "' TRAININGDATA_QUALITY_LEVEL=PARTIAL but FULL is required - document will be ignored!");
                    // downgrade the result so that the data is not posted below
                    qualityResult = XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_BAD;
                } else {
                    logger.info("...document '" + workitem.getUniqueID() + "' TRAININGDATA_QUALITY_LEVEL=" + qualityResult +"...");
                }
            }

            // post trainingData if quality level is sufficient
            if (qualityResult == XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_PARTIAL
                    || qualityResult == XMLTrainingData.TRAININGDATA_QUALITY_LEVEL_FULL) {

                // log the XMLTrainingData object....
                if (debug) {
                    printXML(trainingData);
                }
                String serviceEndpoint = config.getItemValueString(TrainingApplication.ITEM_ML_TRAINING_ENDPOINT);
                MLClient mlClient = new MLClient(serviceEndpoint);
                mlClient.postTrainingData(trainingData, model);
            }

        } catch (PluginException | RestAPIException e1) {
            // keep the original message but also log the stack trace
            logger.log(Level.SEVERE, "Error parsing documents: " + e1.getMessage(), e1);
        }

        return qualityResult;
    }

    /**
     * This method is used to test an existing model. The method extracts the text
     * contained in the attachments of a document and, if any text is available,
     * posts the text to the Imixs-ML analyse endpoint configured in the config
     * object.
     * <p>
     * A {@link PluginException} or {@link RestAPIException} is logged and not
     * propagated to the caller.
     * 
     * @param config         - a config object providing the analyse endpoint, the
     *                       model name, the tika options and the optional
     *                       filename pattern
     * @param doc            - a workitem providing the attachments and the entity
     *                       data
     * @param workflowClient - a rest client instance used to load the snapshot if
     *                       no OCR content is available yet
     */
    @SuppressWarnings("unchecked")
    public void testWorkitemData(ItemCollection config, ItemCollection doc, WorkflowClient workflowClient) {
        logger.info("......anaysing: " + doc.getUniqueID());
        List<String> tikaOptions = config.getItemValue(TrainingApplication.ITEM_TIKA_OPTIONS);
        String serviceEndpoint = config.getItemValueString(TrainingApplication.ITEM_ML_ANALYSE_ENDPOINT);
        String model = config.getItemValueString(TrainingApplication.ITEM_ML_ANALYSE_MODEL);
        // parse optional filename regex pattern...
        Pattern mlFilenamePattern = compileFilenamePattern(config);
        try {
            // update ocr information if needed....
            doc = doVerifyOCRContent(doc, mlFilenamePattern, workflowClient, tikaOptions);

            // build the ml content....
            String ocrText = new MLContentBuilder(doc, null, false, mlFilenamePattern).build();
            if (ocrText != null && !ocrText.isEmpty()) {
                MLClient mlClient = new MLClient(serviceEndpoint);
                mlClient.postAnalyseData(ocrText, model);
            }
        } catch (PluginException | RestAPIException e1) {
            // keep the original message but also log the stack trace
            logger.log(Level.SEVERE, "Error parsing documents: " + e1.getMessage(), e1);
        }
    }

    /**
     * Marshals the training data into a formatted XML string and writes it to the
     * server log. A marshalling error is logged and not propagated.
     * 
     * @param trainingData - the training data object to be logged
     */
    public void printXML(XMLTrainingData trainingData) {
        try {
            JAXBContext context = JAXBContext.newInstance(XMLTrainingData.class);
            Marshaller marshaller = context.createMarshaller();
            marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
            StringWriter out = new StringWriter();
            marshaller.marshal(trainingData, out);
            logger.info(out.toString());
        } catch (JAXBException e) {
            logger.log(Level.WARNING, "Unable to marshal XMLTrainingData: " + e.getMessage(), e);
        }
    }

    /**
     * Reads the optional item 'filename.pattern' from the config object and
     * compiles it into a regex pattern.
     * 
     * @param config - a config object providing the training configuration
     * @return the compiled pattern or null if no pattern is configured
     */
    private Pattern compileFilenamePattern(ItemCollection config) {
        String filenamePattern = config.getItemValueString(ITEM_FILENAME_PATTERN);
        if (filenamePattern == null || filenamePattern.isEmpty()) {
            return null;
        }
        logger.info("......apply filename.pattern=" + filenamePattern);
        return Pattern.compile(filenamePattern);
    }

    /**
     * This method tests if we already have OCR text in the workitem. If not we load
     * the snapshot and post the files first to the tika service. In a normal setup
     * of Imixs-Office-Workflow this task should not be necessary here. But we need
     * to be able to parse old data.
     * <p>
     * The given workitem is returned unchanged if it has no file data, if at
     * least one of its files already provides OCR text, or if no snapshot could
     * be loaded. Otherwise the snapshot is returned with the OCR text stored in
     * the attribute {@link #FILE_ATTRIBUTE_TEXT} of each file. A file that can
     * not be processed is logged and skipped.
     * 
     * @param workitem          - workitem containing file attachments
     * @param mlFilenamePattern - optional filename pattern. NOTE(review): this
     *                          parameter is currently not evaluated here, all
     *                          files of the snapshot are processed - confirm
     *                          whether this is intended.
     * @param workflowClient    - a rest client instance used to load the snapshot
     * @param tikaOptions       - options passed to the tika service
     * @return the workitem providing the file data to be analyzed
     * @throws RestAPIException
     * @throws PluginException
     */
    private ItemCollection doVerifyOCRContent(ItemCollection workitem, Pattern mlFilenamePattern,
            WorkflowClient workflowClient, List<String> tikaOptions) throws RestAPIException, PluginException {

        // do we have file data?
        List<FileData> files = workitem.getFileData();
        if (files == null || files.isEmpty()) {
            // no op
            return workitem;
        }

        // test if the workitem already has ocr content
        for (FileData file : files) {
            ItemCollection metadata = new ItemCollection(file.getAttributes());
            if (!metadata.getItemValueString(FILE_ATTRIBUTE_TEXT).isEmpty()) {
                // ocr content already exists
                return workitem;
            }
        }

        // no existing content - we need to ocr .....
        // first load the snapshot
        ItemCollection snapshot = null;
        String snapshotID = workitem.getItemValueString("$snapshotid");
        if (!snapshotID.isEmpty()) {
            snapshot = workflowClient.getDocument(snapshotID);
        }

        if (snapshot == null) {
            logger.warning("Unable to load snapshot for document " + workitem.getUniqueID());
            return workitem;
        }

        List<FileData> snapshotFiles = snapshot.getFileData();
        if (snapshotFiles == null) {
            // nothing to ocr - same null guard as applied to the workitem above
            return snapshot;
        }

        for (FileData fileData : snapshotFiles) {
            // add ocr content to each filedata ...
            try {
                String ocrContent = tikaService.doORCProcessing(fileData, tikaOptions);
                // store the ocrContent....
                List<Object> list = new ArrayList<>();
                list.add(ocrContent);
                fileData.setAttribute(FILE_ATTRIBUTE_TEXT, list);
            } catch (IOException e) {
                // best effort - continue with the next file
                logger.log(Level.WARNING, "Failed to ocr file '" + fileData.getName() + "' of document "
                        + snapshot.getUniqueID() + ": " + e.getMessage(), e);
            }
        }

        return snapshot;
    }

}
