/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2012 Cassidian, an EADS company
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.service.language;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Calendar;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

import javax.jws.WebService;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.model.processing.WProcessingAnnotator;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.ow2.weblab.service.language.NGramProfilesPatched.RankResult;
import org.ow2.weblab.service.language.NGramProfilesPatched.Ranker;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.purl.dc.terms.DCTermsAnnotator;

/**
 * This class is a WebLab Web service for identifying the language of a Text.<br />
 * 
 * It's a wrapper around the NGramJ project: <a href="http://ngramj.sourceforge.net/">http://ngramj.sourceforge.net/</a>. It uses the CNGram system, which can
 * process character strings instead of raw text files.<br />
 * 
 * This algorithm returns, for each input text, a score associated with every language profile previously learned (.ngp files). The score is a double between
 * '0' and '1'. '1' means that the text is written in this language for sure. '0', on the opposite, means that the text is not written in this language. The
 * sum of the scores equals '1'.<br />
 * 
 * Our wrapper annotates every Text section of a Document in input (or the Text itself if the input is a Text). If no Text can be found in the input, a warning
 * is logged and nothing is annotated. On each Text it uses CNGram to determine which language profiles are the best candidates to be annotated (using the
 * DC:language property).
 * 
 * It can be configured using one of its custom constructors, which enable the specification of these 8 parameters:
 * <ul>
 * <li>maxNbValues: It's a positive integer value. The number of languages annotated on a given Text cannot be greater than this value.</li>
 * <li>addTopLevelAnnot: It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of
 * every Text content.</li>
 * <li>addMediaUnitLevelAnnot: It's a boolean value. It defines whether or not to annotate each Text section with the guessed language.</li>
 * <li>profilesFolderPath: It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28
 * languages.</li>
 * <li>isProducedByObject: It's a String value that should be a valid URI. It defines the URI to be used as object of every isProducedBy statements on
 * annotations created by the service.</li>
 * <li>unknownLanguageCode: It's the String value that will be annotated when no language can be clearly identified. When <code>null</code>, nothing is
 * annotated.</li>
 * <li>minSingleValue: It's a double value between 0 and 1. If the best language score is greater than this value, it will be the only one annotated on a given
 * Text.</li>
 * <li>minMultipleValue: It's a double value between 0 and 1. Every language whose score is greater than this value will be annotated on a given Text.</li>
 * </ul>
 * 
 * Those 8 properties are optional. Default values are:
 * <ul>
 * <li>maxNbValues: '1'</li>
 * <li>addTopLevelAnnot: <code>false</code></li>
 * <li>addMediaUnitLevelAnnot: <code>true</code></li>
 * <li>profilesFolderPath: <code>null</code>; in this case, we use the default constructor for CNGram profiles, which loads the default profiles shipped in
 * their jar file. These 28 profiles are named using ISO 639-1 two-letter language codes; it means that the resulting DC:language annotation will be in this
 * format. If you want to use another format, you have to use a custom profiles folder (containing .ngp files).</li>
 * <li>isProducedByObject: <code>null</code> in this case, no isProducedBy annotation will be created.</li>
 * <li>unknownLanguageCode: 'und' is used, since it is the code for undetermined in ISO-639-x.</li>
 * <li>minSingleValue: '0.75'</li>
 * <li>minMultipleValue: '0.15'</li>
 * </ul>
 * 
 * @author EADS IPCC Team
 * @date 2009-11-05
 */
@WebService(endpointInterface = "org.ow2.weblab.core.services.Analyser")
public class LanguageExtraction implements Analyser {


	/**
	 * Default value of addMediaUnitLevelAnnot: each Text section is annotated.
	 */
	private static final boolean DEFAULT_ADD_MEDIA_UNIT_LEVEL_ANNOT = true;


	/**
	 * Default value of addTopLevelAnnot: the received resource itself is not annotated.
	 */
	private static final boolean DEFAULT_ADD_TOP_LEVEL_ANNOT = false;


	/**
	 * Default value of isProducedByObject: <code>null</code>, i.e. no isProducedBy statement is written.
	 */
	private static final String DEFAULT_IS_PRODUCED_BY_OBJECT = null;


	/**
	 * Default value of maxNbValues.
	 */
	private static final int DEFAULT_MAX_NB_VALUES = 1;


	/**
	 * Default value of minMultipleValue.
	 */
	private static final double DEFAULT_MIN_MULTIPLE_VALUE = 0.15;


	/**
	 * Default value of minSingleValue.
	 */
	private static final double DEFAULT_MIN_SINGLE_VALUE = 0.75;


	/**
	 * Default value of profilesFolderPath: <code>null</code>, i.e. the default profiles are loaded.
	 */
	private static final String DEFAULT_PROFILES_FOLDER_PATH = null;


	/**
	 * Default value of unknownLanguageCode.
	 */
	private static final String DEFAULT_UNKNOWN_LANGUAGE_CODE = "und";



	/**
	 * Whether to annotate each text section received.
	 */
	private final boolean addMediaUnitLevelAnnot;


	/**
	 * Whether to annotate the document if the received resource is a document.
	 */
	private final boolean addTopLevelAnnot;


	/**
	 * The URI to be annotated with isProducedBy statement on each Annotation created. If null, no statement is added.
	 */
	private final URI isProducedByObject;


	/**
	 * The logger used inside the class
	 */
	private final Log log;


	/**
	 * The maximum number of language to be annotated on a single mediaUnit.
	 */
	private final int maxNbValues;


	/**
	 * The minimum score of second, third and following recognised profiles to be annotated (if maxNbValues not reached and if the first is smaller than
	 * minSingleValue).
	 */
	private final double minMultipleValue;


	/**
	 * The minimum score to be reached to force a single language to be identified
	 */
	private final double minSingleValue;


	/**
	 * The profiles used to guess language. These profiles are loaded from a folder or from the classpath.
	 */
	private final NGramProfilesPatched ngps;


	/**
	 * The language code to be used when no majority is found when guessing language. If null, nothing is annotated.
	 */
	private final String unknownLanguageCode;


	/**
	 * The default constructor that initialises each parameter as stated in the class header.
	 */
	public LanguageExtraction() {
		this(LanguageExtraction.DEFAULT_MAX_NB_VALUES, LanguageExtraction.DEFAULT_ADD_TOP_LEVEL_ANNOT, LanguageExtraction.DEFAULT_ADD_MEDIA_UNIT_LEVEL_ANNOT,
				LanguageExtraction.DEFAULT_PROFILES_FOLDER_PATH);
	}


	/**
	 * @param maxNbValues
	 *            It's a positive integer value. The number of languages annotated on a given Text cannot be greater than this value.
	 * @param addTopLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of every
	 *            Text content.
	 * @param addMediaUnitLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate each Text section with the guessed language.
	 * @param profilesFolderPath
	 *            It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28 languages.
	 */
	public LanguageExtraction(final int maxNbValues, final boolean addTopLevelAnnot, final boolean addMediaUnitLevelAnnot, final String profilesFolderPath) {
		this(maxNbValues, addTopLevelAnnot, addMediaUnitLevelAnnot, profilesFolderPath, LanguageExtraction.DEFAULT_IS_PRODUCED_BY_OBJECT);
	}


	/**
	 * @param maxNbValues
	 *            It's a positive integer value. The number of languages annotated on a given Text cannot be greater than this value.
	 * @param addTopLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of every
	 *            Text content.
	 * @param addMediaUnitLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate each Text section with the guessed language.
	 * @param profilesFolderPath
	 *            It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28 languages.
	 * @param isProducedByObject
	 *            It's a String value that should be a valid URI. It defines the URI to be used as object of every isProducedBy statements on
	 *            annotations created by the service.
	 */
	public LanguageExtraction(final int maxNbValues, final boolean addTopLevelAnnot, final boolean addMediaUnitLevelAnnot, final String profilesFolderPath,
			final String isProducedByObject) {
		this(maxNbValues, addTopLevelAnnot, addMediaUnitLevelAnnot, profilesFolderPath, isProducedByObject, LanguageExtraction.DEFAULT_UNKNOWN_LANGUAGE_CODE);
	}


	/**
	 * @param maxNbValues
	 *            It's a positive integer value. The number of languages annotated on a given Text cannot be greater than this value.
	 * @param addTopLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of every
	 *            Text content.
	 * @param addMediaUnitLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate each Text section with the guessed language.
	 * @param profilesFolderPath
	 *            It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28 languages.
	 * @param isProducedByObject
	 *            It's a String value that should be a valid URI. It defines the URI to be used as object of every isProducedBy statements on annotations
	 *            created by the service.
	 * @param unknownLanguageCode
	 *            It's the String value that will be annotated when no language can be clearly identified. When <code>null</code>, nothing is annotated.
	 */
	public LanguageExtraction(final int maxNbValues, final boolean addTopLevelAnnot, final boolean addMediaUnitLevelAnnot, final String profilesFolderPath,
			final String isProducedByObject, final String unknownLanguageCode) {
		this(maxNbValues, addTopLevelAnnot, addMediaUnitLevelAnnot, profilesFolderPath, isProducedByObject, unknownLanguageCode,
				LanguageExtraction.DEFAULT_MIN_SINGLE_VALUE, LanguageExtraction.DEFAULT_MIN_MULTIPLE_VALUE);
	}


	/**
	 * The complete constructor. Every parameter is validated; an invalid value is logged and reported through an InitialisationException.
	 * 
	 * @param maxNbValues
	 *            It's a positive integer value. The number of languages annotated on a given Text cannot be greater than this value.
	 * @param addTopLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of every
	 *            Text content.
	 * @param addMediaUnitLevelAnnot
	 *            It's a boolean value. It defines whether or not to annotate each Text section with the guessed language.
	 * @param profilesFolderPath
	 *            It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28 languages.
	 * @param isProducedByObject
	 *            It's a String value that should be a valid URI. It defines the URI to be used as object of every isProducedBy statements on annotations
	 *            created by the service. <code>null</code> or empty disables those statements.
	 * @param unknownLanguageCode
	 *            It's the String value that will be annotated when no language can be clearly identified. When <code>null</code>, nothing is annotated.
	 * @param minSingleValue
	 *            It's a double value between 0 and 1. If the best language score is greater than this value, it will be the only one annotated on a given Text.
	 *            It must not be smaller than minMultipleValue.
	 * @param minMultipleValue
	 *            It's a double value between 0 and 1. Every language whose score reaches this value may be annotated on a given Text (within the limit of
	 *            maxNbValues).
	 * @throws InitialisationException
	 *             If one of the parameters is out of its valid range, if isProducedByObject is not a valid URI or if the profiles cannot be loaded.
	 */
	public LanguageExtraction(final int maxNbValues, final boolean addTopLevelAnnot, final boolean addMediaUnitLevelAnnot, final String profilesFolderPath,
			final String isProducedByObject, final String unknownLanguageCode, final double minSingleValue, final double minMultipleValue) {
		this.log = LogFactory.getLog(this.getClass());

		// Transfer simple values
		this.addTopLevelAnnot = addTopLevelAnnot;
		this.addMediaUnitLevelAnnot = addMediaUnitLevelAnnot;
		this.unknownLanguageCode = unknownLanguageCode;

		// Transfer and test other simple values
		this.maxNbValues = maxNbValues;
		if (this.maxNbValues < 1) {
			final String msg = "Parameter maxNbValues (" + this.maxNbValues + ") must be greater than 0 to enable annotation.";
			this.log.error(msg);
			throw new InitialisationException(msg);
		}
		this.minSingleValue = minSingleValue;
		if ((this.minSingleValue < 0) || (this.minSingleValue > 1)) {
			final String msg = "Parameter minSingleValue (" + this.minSingleValue + ") must be between 0 and 1.";
			this.log.error(msg);
			throw new InitialisationException(msg);
		}
		this.minMultipleValue = minMultipleValue;
		if ((this.minMultipleValue < 0) || (this.minMultipleValue > 1)) {
			final String msg = "Parameter minMultipleValue (" + this.minMultipleValue + ") must be between 0 and 1.";
			this.log.error(msg);
			throw new InitialisationException(msg);
		}
		if (this.minSingleValue < this.minMultipleValue) {
			final String msg = "Parameter minSingleValue (" + this.minSingleValue + ") must be bigger than minMultipleValue (" + this.minMultipleValue + ").";
			this.log.error(msg);
			throw new InitialisationException(msg);
		}

		// Transfer and test complex values
		if ((isProducedByObject == null) || isProducedByObject.isEmpty()) {
			this.isProducedByObject = null;
		} else {
			try {
				this.isProducedByObject = new URI(isProducedByObject);
			} catch (final URISyntaxException urise) {
				// Report the rejected parameter value: the field has not been assigned when the URI constructor fails.
				final String msg = "Parameter isProducedByObject (" + isProducedByObject + ") must be a valid URI or null.";
				this.log.error(msg, urise);
				throw new InitialisationException(msg, urise);
			}
		}

		this.ngps = this.loadProfiles(profilesFolderPath);

		// The detailed configuration is only included in the message when debug logging is enabled.
		if (this.log.isDebugEnabled()) {
			this.log.info("LanguageExtraction successfully initialised with configuration: " + this.toString());
		} else {
			this.log.info("LanguageExtraction successfully initialised.");
		}
	}



	/**
	 * Loads a NGramProfilesPatched from the parameter or uses default one (if null or empty).
	 * 
	 * @param profilesFolderPath
	 *            The path to a folder to be loaded. If null or empty, uses default profiles shipped with NGramJ
	 * @return An instance of NGramProfilesPatched
	 * @throws InitialisationException
	 *             If the folder does not exist, is not readable, is not a directory or if the loading of the profiles fails
	 */
	public NGramProfilesPatched loadProfiles(final String profilesFolderPath) {
		final NGramProfilesPatched profiles;
		if ((profilesFolderPath == null) || profilesFolderPath.isEmpty()) {
			this.log.debug("Loading default NGramProfile.");
			try {
				profiles = new NGramProfilesPatched();
			} catch (final IOException ioe) {
				final String msg = "Unable to load the default NGramProfilesPatched.";
				this.log.fatal(msg, ioe);
				throw new InitialisationException(msg, ioe);
			}
		} else {
			final File file = new File(profilesFolderPath);
			// Common prefix of every error message below; each check appends its own reason.
			final String defaultErrorMsg = "Unable to load NGramProfilesPatched using parameter profilesFolderPath (" + profilesFolderPath + "). The file '"
					+ file.getAbsolutePath() + "'";
			if (!file.exists()) {
				final String msg = defaultErrorMsg + " does not exists.";
				this.log.error(msg);
				throw new InitialisationException(msg);
			}
			if (!file.canRead()) {
				final String msg = defaultErrorMsg + " is not readable.";
				this.log.error(msg);
				throw new InitialisationException(msg);
			}
			if (!file.isDirectory()) {
				final String msg = defaultErrorMsg + " is not a directory.";
				this.log.error(msg);
				throw new InitialisationException(msg);
			}
			try {
				profiles = new NGramProfilesPatched(file);
			} catch (final IOException ioe) {
				final String msg = defaultErrorMsg + " has been used but an internal error occurs.";
				this.log.error(msg, ioe);
				throw new InitialisationException(msg, ioe);
			}
		}
		return profiles;
	}


	/**
	 * Guesses the language of the Text units of the received resource and annotates them (and/or the resource itself) according to the configuration.
	 * 
	 * @param processArgs
	 *            The wrapper of the resource to process.
	 * @return A ProcessReturn holding the very same resource instance, enriched with the created annotations.
	 * @throws InvalidParameterException
	 *             If processArgs or the resource it contains is null.
	 * @throws UnexpectedException
	 *             Declared by the interface; not thrown directly by this method.
	 */
	@Override
	public ProcessReturn process(final ProcessArgs processArgs) throws InvalidParameterException, UnexpectedException {
		this.log.trace("Process method called.");

		final List<Text> texts = this.checkArgs(processArgs);
		final Resource resource = processArgs.getResource();

		this.log.debug("Process method called on Resource " + resource.getUri() + ".");

		// A top level annotation only makes sense on a container of Texts, never on a Text itself (it would duplicate the media unit annotation).
		final boolean topLevelAnnot = this.addTopLevelAnnot && !texts.isEmpty() && !(resource instanceof Text);

		// Accumulates the content of every non empty Text to guess the language of the whole resource.
		final StringBuilder sb = new StringBuilder();
		for (final Text text : texts) {
			final String content = text.getContent();
			if ((content == null) || content.isEmpty()) {
				this.log.debug("Text '" + text.getUri() + "' has no content. Ignoring it.");
				continue;
			}
			if (this.addMediaUnitLevelAnnot) {
				final List<String> profileToAnnotate = this.checkLanguage(content, text.getUri());
				this.annotate(text, profileToAnnotate);
			}
			if (topLevelAnnot) {
				sb.append(content);
				sb.append("\n\n");
			}
		}

		if (topLevelAnnot && (sb.length() > 0)) {
			final List<String> profileToAnnotate = this.checkLanguage(sb.toString(), resource.getUri());
			this.annotate(resource, profileToAnnotate);
		}

		final ProcessReturn pr = new ProcessReturn();
		pr.setResource(resource);
		return pr;
	}


	/**
	 * @return A textual description of the configuration of this service instance.
	 */
	@Override
	public String toString() {
		final StringBuilder builder = new StringBuilder();
		builder.append("LanguageExtraction [minSingleValue=");
		builder.append(this.minSingleValue);
		builder.append(", minMultipleValue=");
		builder.append(this.minMultipleValue);
		builder.append(", maxNbValues=");
		builder.append(this.maxNbValues);
		builder.append(", addTopLevelAnnot=");
		builder.append(this.addTopLevelAnnot);
		builder.append(", addMediaUnitLevelAnnot=");
		builder.append(this.addMediaUnitLevelAnnot);
		builder.append(", isProducedByObject=");
		builder.append(this.isProducedByObject);
		builder.append(", unknownLanguageCode=");
		builder.append(this.unknownLanguageCode);
		builder.append(", ngps=");
		builder.append(this.ngps);
		builder.append("]");
		return builder.toString();
	}


	/**
	 * Creates one Annotation on res holding a dc:language statement per language. When isProducedByObject is set, the creation date and the isProducedBy
	 * statements are also written on the created Annotation. Nothing is created when the list is empty.
	 * 
	 * @param res
	 *            The resource to be annotated
	 * @param profileToAnnotate
	 *            The language to annotate using dc:language property statements on res.
	 */
	private void annotate(final Resource res, final List<String> profileToAnnotate) {
		if (profileToAnnotate.isEmpty()) {
			this.log.debug("No language to annotate on resource " + res.getUri() + ".");
			return;
		}

		this.log.debug("Annotate Resource " + res.getUri() + " with following languages " + profileToAnnotate + ".");

		final Annotation annot = AnnotationFactory.createAndLinkAnnotation(res);
		final DublinCoreAnnotator dca = new DublinCoreAnnotator(URI.create(res.getUri()), annot);
		for (final String language : profileToAnnotate) {
			dca.writeLanguage(language);
		}
		if (this.isProducedByObject != null) {
			// Provenance statements describe the annotation itself, hence its own URI is used as subject.
			final URI annotUri = URI.create(annot.getUri());
			new DCTermsAnnotator(annotUri, annot).writeCreated(Calendar.getInstance().getTime());
			new WProcessingAnnotator(annotUri, annot).writeProducedBy(this.isProducedByObject);
		}
	}


	/**
	 * Validates the input and extracts the Text units to be processed. An input without any Text is not an error: a warning is logged and an empty list
	 * is returned.
	 * 
	 * @param processArg
	 *            The processArgs; i.e. a usageContext not used and a Resource that should contain at least one text.
	 * @return A list of text contained in the resource in processArgs
	 * @throws InvalidParameterException
	 *             If processArgs is null, or if the resource is null.
	 */
	private List<Text> checkArgs(final ProcessArgs processArg) throws InvalidParameterException {
		if (processArg == null) {
			final String msg = "ProcessArgs was null.";
			this.log.error(msg);
			throw new InvalidParameterException(msg, msg);
		}
		final Resource res = processArg.getResource();
		if (res == null) {
			final String msg = "Resource in ProcessArgs was null.";
			this.log.error(msg);
			throw new InvalidParameterException(msg, msg);
		}
		final List<Text> texts;

		if (res instanceof Text) {
			texts = Collections.singletonList((Text) res);
		} else {
			texts = ResourceUtil.getSelectedSubResources(res, Text.class);
		}

		if (texts.isEmpty()) {
			final String msg = "No Text unit found in resource " + res.getUri() + ". Nothing will be done.";
			this.log.warn(msg);
		}

		return texts;
	}


	/**
	 * Ranks the language profiles against the content and selects the ones to annotate:
	 * <ul>
	 * <li>if the best score is greater than minSingleValue, only the best profile is returned;</li>
	 * <li>if the ranking is empty or if the best score is smaller than minMultipleValue, the unknown language code is returned (or nothing if it is
	 * <code>null</code>);</li>
	 * <li>otherwise, the best profiles whose score reaches minMultipleValue are returned, within the limit of maxNbValues.</li>
	 * </ul>
	 * 
	 * @param content
	 *            The text to identify language
	 * @param uri
	 *            The uri, used for logging purpose.
	 * @return An ordered list of language identified according to parameters (minSingleValue, maxNbValues and minMultipleValue).
	 */
	private synchronized List<String> checkLanguage(final String content, final String uri) {

		final Ranker ranker = this.ngps.getRanker();
		ranker.account(content);
		final RankResult result = ranker.getRankResult();

		if (this.log.isDebugEnabled()) {
			this.log.debug("Language detected for MediaUnit '" + uri + "' are " + result.toString());
		}

		// Guard: with an empty ranking there is no best profile to read (presumably only possible when no profile is loaded).
		if (result.getLength() < 1) {
			this.log.warn("Unable to guess language for Resource " + uri + ". (" + result.toString() + ").");
			return this.unknownLanguage();
		}

		final List<String> profileToAnnotate;
		// Profiles are listed in their rank order
		final double bestScore = result.getScore(0);
		if (bestScore > this.minSingleValue) {
			profileToAnnotate = Collections.singletonList(result.getName(0));
		} else if (bestScore < this.minMultipleValue) {
			this.log.warn("Unable to guess language for Resource " + uri + ". (" + result.toString() + ").");
			profileToAnnotate = this.unknownLanguage();
		} else {
			final int max = Math.min(result.getLength(), this.maxNbValues);
			profileToAnnotate = new LinkedList<String>();
			for (int p = 0; p < max; p++) {
				if (result.getScore(p) >= this.minMultipleValue) {
					profileToAnnotate.add(result.getName(p));
				} else {
					// Scores are ordered: the following ones cannot reach the threshold either.
					break;
				}
			}
		}

		return profileToAnnotate;
	}


	/**
	 * @return The list to annotate when no language can be guessed: the unknown language code alone, or an empty list if that code is <code>null</code>.
	 */
	private List<String> unknownLanguage() {
		if (this.unknownLanguageCode == null) {
			return Collections.emptyList();
		}
		return Collections.singletonList(this.unknownLanguageCode);
	}

}
