/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2011 CASSIDIAN
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.service.language;

import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import javax.annotation.PostConstruct;
import javax.jws.WebService;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.ontologies.DCTerms;
import org.ow2.weblab.core.extended.ontologies.DublinCore;
import org.ow2.weblab.core.extended.ontologies.WebLabProcessing;
import org.ow2.weblab.core.extended.properties.PropertiesLoader;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.helper.PoKHelper;
import org.ow2.weblab.core.helper.impl.JenaPoKHelper;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.MediaUnit;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.services.AccessDeniedException;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.ContentNotAvailableException;
import org.ow2.weblab.core.services.InsufficientResourcesException;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.ServiceNotConfiguredException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.UnsupportedRequestException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.ow2.weblab.service.language.NGramProfilesPatched.RankResult;
import org.ow2.weblab.service.language.NGramProfilesPatched.Ranker;

/**
 * This class is a WebLab Web service for identifying the language of a Text.<br />
 * 
 * It's a wrapper of the NGramJ project: {@link "http://ngramj.sourceforge.net/"}. It uses the CNGram system that can computes character string instead of raw
 * text files.<br />
 * 
 * This algorithm return for each input text a score associated to every language profile previously learned (.ngp files). The score is a double between 0 and
 * 1. 1 meaning that this text is written in this language for sure. 0 on the opposite means that this text is not written in this language. The sum of score
 * equals 1.<br />
 * 
 * Our wrapper annotate every Text section of a ComposedUnit in input (or the Text if the input is a Text). It fails if the input is something else. On each
 * Text it uses CGram to determine which language profile are the best candidate to be annotated (using DC:language property).
 * 
 * It can be configured using a property file named <code>ngram.properties</code>. In this file you can handle 7 properties.
 * <ul>
 * <li>minSingleValue: It's a double value between 0 and 1. If the best language score is greater than this value, it will be the only one annotated on a given
 * Text</li>
 * <li>minMultipleValue: It's a double value between 0 and 1. Every language score that are greater than this value, will be annotated on a given Text.</li>
 * <li>maxNbValues: It's a positive integer value. The list of annotated language on a given Text could not be greater that this value.</li>
 * <li>profilesFolderPath: It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28
 * languages.</li>
 * <li>addTopLevelAnnot: It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of
 * every Text content.</li>
 * <li>addMediaUnitLevelAnnot: It's a boolean value. It defines whether or not to annotate the each Text section with the language guessed.</li>
 * <li>isProducedByObject: It's a String value that should be a valid URI. It defines the URI to be used as object of every isProducedBy statements on
 * annotations created by the service.</li>
 * </ul>
 * Those 7 properties are optional. Default values are:
 * <ul>
 * <li>minSingleValue: '0.75'</li>
 * <li>minMultipleValue: '0.15'</li>
 * <li>maxNbValues: '1'</li>
 * <li>profilesFolderPath: in this case, we use the default constructor for CNGram profile that will use default profile given in their jar file. These 28
 * profiles are named using ISO 639-1 two letters language code; it means that the DC:language annotation resulting will be in this format. If you want to use
 * another format, you have use a custom profiles folder (containing .ngp files).</li>
 * <li>addTopLevelAnnot: <code>false</code></li>
 * <li>addTopLevelAnnot: <code>true</code></li>
 * <li>isProducedByObject: <code>null</code> in this case, no isProducedBy annotation will be created.</li>
 * </ul>
 * 
 * @author EADS IPCC Team
 * @date 2009-11-05
 */
@WebService(endpointInterface = "org.ow2.weblab.core.services.Analyser")
public class LanguageExtraction implements Analyser {

	private final static String PROPERTY_FILE = "ngram.properties";

	private static final String DEFAULT_UNKONWED_CODE = "UNKNOWN";

	private final static double DEFAULT_MIN_SINGLE_VALUE = 0.75;

	private final static double DEFAULT_MIN_MULTIPLE_VALUE = 0.15;

	private final static int DEFAULT_MAX_NB_VALUES = 1;

	private final static String MIN_SINGLE_VALUE = "minSingleValue";

	private final static String MIN_MULTIPLE_VALUE = "minMultipleValue";

	private final static String MAX_NB_VALUES = "maxNbValues";

	private final static String PROFILES_FOLDER_PATH = "profilesFolderPath";

	private final static String ADD_TOP_LEVEL_ANNOT = "addTopLevelAnnot";

	private final static String ADD_MEDIA_UNIT_LEVEL_ANNOT = "addMediaUnitLevelAnnot";

	private final static String IS_PRODUCED_BY_OBJECT = "isProducedByObject";

	private final static String UNKONWED_CODE = "unknowedCode";

	private final Log log = LogFactory.getLog(this.getClass());

	private double minSingleValue;

	private double minMultipleValue;

	private int maxNbValues;

	private boolean addTopLevelAnnot;

	private String isProducedByObject;

	private String unknowedCode;

	private NGramProfilesPatched ngps;

	private boolean addMediaUnitLevelAnnot;

	/**
	 * Read the property file to get fields values.
	 */
	@PostConstruct
	public void init() throws LanguageExtractionException {
		final Map<String, String> props = PropertiesLoader.loadProperties(LanguageExtraction.PROPERTY_FILE);
		final String minSingleValueP = props.get(LanguageExtraction.MIN_SINGLE_VALUE);
		if ((minSingleValueP != null) && !minSingleValueP.isEmpty()) {
			try {
				this.minSingleValue = Double.parseDouble(minSingleValueP);
			} catch (final NumberFormatException nfe) {
				this.log.warn("Unable to parse double for " + LanguageExtraction.MIN_SINGLE_VALUE + " property. Value was: '" + minSingleValueP + "'.");
				this.minSingleValue = LanguageExtraction.DEFAULT_MIN_SINGLE_VALUE;
			}
		} else {
			this.minSingleValue = LanguageExtraction.DEFAULT_MIN_SINGLE_VALUE;
		}

		final String minMultipleValueP = props.get(LanguageExtraction.MIN_MULTIPLE_VALUE);
		if ((minMultipleValueP != null) && !minMultipleValueP.isEmpty()) {
			try {
				this.minMultipleValue = Double.parseDouble(minMultipleValueP);
			} catch (final NumberFormatException nfe) {
				this.log.warn("Unable to parse double for " + LanguageExtraction.MIN_MULTIPLE_VALUE + " property. Value was: '" + minMultipleValueP + "'.");
				this.minMultipleValue = LanguageExtraction.DEFAULT_MIN_MULTIPLE_VALUE;
			}
		} else {
			this.minMultipleValue = LanguageExtraction.DEFAULT_MIN_MULTIPLE_VALUE;
		}

		if (this.minSingleValue < this.minMultipleValue) {
			this.log.warn(LanguageExtraction.MIN_SINGLE_VALUE + " was smaller than " + LanguageExtraction.MIN_MULTIPLE_VALUE
					+ ". Use the two default value instead.");
			this.minSingleValue = LanguageExtraction.DEFAULT_MIN_SINGLE_VALUE;
			this.minMultipleValue = LanguageExtraction.DEFAULT_MIN_MULTIPLE_VALUE;
		}

		this.log.debug("LanguageExtraction initialised with " + LanguageExtraction.MIN_SINGLE_VALUE + "=" + this.minSingleValue);
		this.log.debug("LanguageExtraction initialised with " + LanguageExtraction.MIN_MULTIPLE_VALUE + "=" + this.minMultipleValue);

		final String maxNbValuesP = props.get(LanguageExtraction.MAX_NB_VALUES);
		if ((maxNbValuesP != null) && !maxNbValuesP.isEmpty()) {
			try {
				this.maxNbValues = Integer.parseInt(maxNbValuesP);
			} catch (final NumberFormatException nfe) {
				this.log.warn("Unable to parse double for " + LanguageExtraction.MAX_NB_VALUES + " property. Value was: '" + maxNbValuesP + "'.");
				this.maxNbValues = LanguageExtraction.DEFAULT_MAX_NB_VALUES;
			}
		} else {
			this.maxNbValues = LanguageExtraction.DEFAULT_MAX_NB_VALUES;
		}

		if (this.maxNbValues < 1) {
			this.log.warn(LanguageExtraction.MAX_NB_VALUES + " was smaller than 1. Use the two default value instead.");
			this.maxNbValues = LanguageExtraction.DEFAULT_MAX_NB_VALUES;
		}

		this.log.debug("LanguageExtraction initialised with " + LanguageExtraction.MAX_NB_VALUES + "=" + this.maxNbValues);

		final String profilesFolderPathP = props.get(LanguageExtraction.PROFILES_FOLDER_PATH);
		if ((profilesFolderPathP != null) && !profilesFolderPathP.isEmpty()) {
			final File file = new File(profilesFolderPathP);
			if (!file.exists()) {
				this.log.warn("File '" + file.getAbsolutePath() + "' does not exists. Creating LanguageExtraction with default configuration.");
				try {
					this.ngps = new NGramProfilesPatched();
				} catch (final IOException ioe) {
					throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
				}
			} else if (!file.canRead()) {
				this.log.warn("File '" + file.getAbsolutePath() + "' is not readable. Creating LanguageExtraction with default configuration.");
				try {
					this.ngps = new NGramProfilesPatched();
				} catch (final IOException ioe) {
					throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
				}
			} else if (!file.isDirectory()) {
				this.log.warn("File '" + file.getAbsolutePath() + "' is not a directory. Creating LanguageExtraction with default configuration.");
				try {
					this.ngps = new NGramProfilesPatched();
				} catch (final IOException ioe) {
					throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
				}
			} else {
				try {
					this.ngps = new NGramProfilesPatched(file);
				} catch (final IOException ioe) {
					this.log.warn("Unable to create NGramProfilesPatched using value of " + LanguageExtraction.PROFILES_FOLDER_PATH + " property. Value was: '"
							+ file.getAbsolutePath() + "'. Try to create default one.", ioe);
					try {
						this.ngps = new NGramProfilesPatched();
					} catch (final IOException ioe2) {
						throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe2);
					}
				}
			}
		} else {
			try {
				this.ngps = new NGramProfilesPatched();
			} catch (final IOException ioe) {
				throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
			}
		}

		final String unknowedCode = props.get(LanguageExtraction.UNKONWED_CODE);
		if ((unknowedCode != null) && !unknowedCode.isEmpty()) {
			this.unknowedCode = unknowedCode.trim();
		} else {
			this.unknowedCode = LanguageExtraction.DEFAULT_UNKONWED_CODE;
		}

		if (this.log.isDebugEnabled()) {
			final StringBuilder sb = new StringBuilder();
			sb.append("LanguageExtraction initialised with the following " + this.ngps.getProfileCount() + " language profiles: [");
			for (int p = 0; p < this.ngps.getProfileCount(); p++) {
				sb.append(this.ngps.getProfileName(p));
				if (p < (this.ngps.getProfileCount() - 1)) {
					sb.append(", ");
				} else {
					sb.append("]");
				}
			}
			this.log.debug(sb.toString());
		}

		final String addTopLevelAnnotP = props.get(LanguageExtraction.ADD_TOP_LEVEL_ANNOT);
		if ((addTopLevelAnnotP != null) && !addTopLevelAnnotP.isEmpty()) {
			this.addTopLevelAnnot = Boolean.parseBoolean(addTopLevelAnnotP);
		}
		final String addMuLevelAnnotP = props.get(LanguageExtraction.ADD_MEDIA_UNIT_LEVEL_ANNOT);
		if ((addMuLevelAnnotP != null) && !addMuLevelAnnotP.isEmpty()) {
			this.addMediaUnitLevelAnnot = Boolean.parseBoolean(addMuLevelAnnotP);
		} else {
			this.addMediaUnitLevelAnnot = true;
		}

		// May be null
		this.isProducedByObject = props.get(LanguageExtraction.IS_PRODUCED_BY_OBJECT);
	}

	@Override
	public ProcessReturn process(final ProcessArgs processArgs) throws AccessDeniedException, ContentNotAvailableException, InsufficientResourcesException,
			InvalidParameterException, ServiceNotConfiguredException, UnexpectedException, UnsupportedRequestException {
		final List<Text> texts = this.checkArgs(processArgs);
		final boolean topLevelAnnot = this.addTopLevelAnnot && (processArgs.getResource() instanceof Document);
		final StringBuilder sb = new StringBuilder();
		for (final Text text : texts) {
			if ((text.getContent() == null) || text.getContent().isEmpty()) {
				this.log.debug("Text '" + text.getUri() + "' has no content; ignored.");
				continue;
			}
			if (this.addMediaUnitLevelAnnot) {
				final List<String> profileToAnnotate = this.checkLanguage(text.getContent(), text.getUri());
				this.annotate(text, profileToAnnotate);
			}
			if (topLevelAnnot) {
				sb.append(text.getContent());
				sb.append("\n\n");
			}
		}

		if (topLevelAnnot && (sb.length() > 0)) {
			final Document cu = (Document) processArgs.getResource();
			final List<String> profileToAnnotate = this.checkLanguage(sb.toString(), cu.getUri());
			this.annotate(cu, profileToAnnotate);
		}

		final ProcessReturn pr = new ProcessReturn();
		pr.setResource(processArgs.getResource());
		return pr;
	}

	/**
	 * @param res
	 *            The resource to be annotated
	 * @param profileToAnnotate
	 *            The language to annotate using dc:language property statements on res.
	 */
	private void annotate(final Resource res, final List<String> profileToAnnotate) {
		final Annotation annot = AnnotationFactory.createAndLinkAnnotation(res);
		final PoKHelper pokH = new JenaPoKHelper(annot);
		pokH.setAutoCommitMode(false);
		for (final String language : profileToAnnotate) {
			this.log.debug("Language annotator found " + language + " in unit " + res.getUri());
			pokH.createLitStat(res.getUri(), DublinCore.LANGUAGE_PROPERTY_NAME, language);
		}
		if (this.isProducedByObject != null) {
			pokH.createResStat(annot.getUri(), WebLabProcessing.IS_PRODUCED_BY, this.isProducedByObject);
			pokH.createLitStat(annot.getUri(), DCTerms.CREATED, new DateTime().toString());
		}
		pokH.commit();
	}

	/**
	 * @param content
	 *            The text to identify language
	 * @param uri
	 *            The uri, used for logging purpose.
	 * @return An ordered list of language identified according to parameters (minSingleValue, maxNbValues and minMultipleValue).
	 */
	private List<String> checkLanguage(final String content, final String uri) {
		final List<String> profileToAnnotate = new LinkedList<String>();
		final Ranker ranker = this.ngps.getRanker();
		ranker.account(content);
		final RankResult result = ranker.getRankResult();

		boolean warn = false;
		// Profile are listed in their rank order
		final double bestScore = result.getScore(0);
		if (bestScore > this.minSingleValue) {
			profileToAnnotate.add(result.getName(0));
		} else if (bestScore < this.minMultipleValue) {
			profileToAnnotate.add(this.unknowedCode);
			warn = true;
		} else {
			final int max = Math.min(result.getLength(), this.maxNbValues);
			for (int p = 0; p < max; p++) {
				if (result.getScore(p) >= this.minMultipleValue) {
					profileToAnnotate.add(result.getName(p));
				} else {
					break;
				}
			}
		}

		if (this.log.isDebugEnabled() || warn) {
			final StringBuilder sb = new StringBuilder();
			sb.append("Language detected for MediaUnit '" + uri + "' are: [");
			for (int p = 0; p < result.getLength(); p++) {
				sb.append(result.getName(p));
				sb.append(" - ");
				sb.append(result.getScore(p));
				if (p < (result.getLength() - 1)) {
					sb.append(" --|-- ");
				} else {
					sb.append("]");
				}
			}
			if (warn) {
				this.log.warn(sb.toString());
				this.log.warn("Unable to identify language for MediaUnit '" + uri + "'; " + profileToAnnotate + " will be annotated.");
			} else {
				this.log.debug(sb.toString());
				this.log.debug("Language to be annotated for MediaUnit '" + uri + "' are: " + profileToAnnotate);
			}

		}

		return profileToAnnotate;
	}

	/**
	 * @param processArg
	 *            The processArgs; i.e. a usageContext not used and a Resource that must either be a composedUnit or a text.
	 * @return A list of text contained in resource of processArgs
	 * @throws ProcessException
	 *             If processArgs is null; or if resource is null; or if resource is neither a ComposedUnit nor a Text.
	 */
	private List<Text> checkArgs(final ProcessArgs processArg) throws InsufficientResourcesException, InvalidParameterException {
		if (processArg == null) {
			throw new InvalidParameterException("ProcessArgs was null.", "ProcessArgs was null.");
		}
		final Resource res = processArg.getResource();
		if (res == null) {
			throw new InsufficientResourcesException("Resource in ProcessArgs was null.", "Resource in ProcessArgs was null.");
		}
		if (!(res instanceof MediaUnit)) {
			throw new InvalidParameterException("Resource in ProcessArgs was not an instance of MediaUnit but of [" + res.getClass().getCanonicalName() + "].",
					"");
		}
		final List<Text> texts;
		if (res instanceof Document) {
			texts = ResourceUtil.getSelectedSubResources(res, Text.class);
		} else if (res instanceof Text) {
			texts = new LinkedList<Text>();
			texts.add((Text) res);
		} else {
			throw new InvalidParameterException("Resource in ProcessArgs was not neither an instance of Document nor of Text but of '"
					+ res.getClass().getCanonicalName() + "'.", "");
		}
		return texts;
	}

}
