/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.service.language;

import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import javax.annotation.PostConstruct;
import javax.jws.WebService;

import org.apache.commons.logging.LogFactory;
import org.ow2.weblab.service.language.NGramProfilesPatched.RankResult;
import org.ow2.weblab.service.language.NGramProfilesPatched.Ranker;
import org.weblab_project.core.factory.AnnotationFactory;
import org.weblab_project.core.helper.PoKHelper;
import org.weblab_project.core.helper.RDFHelperFactory;
import org.weblab_project.core.model.Annotation;
import org.weblab_project.core.model.ComposedUnit;
import org.weblab_project.core.model.MediaUnit;
import org.weblab_project.core.model.Resource;
import org.weblab_project.core.model.text.Text;
import org.weblab_project.core.ontologies.DublinCore;
import org.weblab_project.core.properties.PropertiesLoader;
import org.weblab_project.core.util.ResourceUtil;
import org.weblab_project.services.analyser.Analyser;
import org.weblab_project.services.analyser.ProcessException;
import org.weblab_project.services.analyser.types.ProcessArgs;
import org.weblab_project.services.analyser.types.ProcessReturn;
import org.weblab_project.services.exception.WebLabException;


/**
 * This class is a WebLab Web service for identifying the language of a Text.<br />
 * 
 * It's a wrapper of the NGramJ project: {@link "http://ngramj.sourceforge.net/"}. It uses the CNGram system that can computes character string instead of raw text files.<br />
 * 
 * This algorithm return for each input text a score associated to every language profile previously learned (.ngp files). The score is a double between 0 and 1. 1 meaning that this text is written in
 * this language for sure. 0 on the opposite means that this text is not written in this language. The sum of score equals 1.<br />
 * 
 * Our wrapper annotate every Text section of a ComposedUnit in input (or the Text if the input is a Text). It fails if the input is something else. On each Text it uses CGram to determine which
 * language profile are the best candidate to be annotated (using DC:language property).
 * 
 * It can be configured using a property file named <code>ngram.properties</code>. In this file you can handle 5 properties.
 * <ul>
 * <li>minSingleValue: It's a double value between 0 and 1. If the best language score is greater than this value, it will be the only one annotated on a given Text</li>
 * <li>minMultipleValue: It's a double value between 0 and 1. Every language score that are greater than this value, will be annotated on a given Text.</li>
 * <li>maxNbValues: It's a positive integer value. The list of annotated language on a given Text could not be greater that this value.</li>
 * <li>profilesFolderPath: It's a String that represents a folder path; This folder contains .ngp files that will be loaded instead of default CNGram 28 languages.</li>
 * <li>addTopLevelAnnot: It's a boolean value. It defines whether or not to annotate the whole document with the language extracted from the concatenation of every Text content.</li>
 * </ul>
 * Those 5 properties are optional. Default values are:
 * <ul>
 * <li>minSingleValue: '0.75'</li>
 * <li>minMultipleValue: '0.15'</li>
 * <li>maxNbValues: '1'</li>
 * <li>profilesFolderPath: in this case, we use the default constructor for CNGram profile that will use default profile given in their jar file. These 28 profiles are named using ISO 639-1 two
 * letters language code; it means that the DC:language annotation resulting will be in this format. If you want to use another format, you have use a custom profiles folder (containing .ngp files).</li>
 * <li>addTopLevelAnnot: false</li>
 * </ul>
 * 
 * @author EADS IPCC Team
 * @date 2009-11-05
 */
@WebService(endpointInterface = "org.weblab_project.services.analyser.Analyser")
public class LanguageExtraction implements Analyser {

	private final static String PROPERTY_FILE = "ngram.properties";

	private final static double DEFAULT_MIN_SINGLE_VALUE = 0.75;
	private final static double DEFAULT_MIN_MULTIPLE_VALUE = 0.15;
	private final static int DEFAULT_MAX_NB_VALUES = 1;

	private final static String MIN_SINGLE_VALUE = "minSingleValue";
	private final static String MIN_MULTIPLE_VALUE = "minMultipleValue";
	private final static String MAX_NB_VALUES = "maxNbValues";
	private final static String PROFILES_FOLDER_PATH = "profilesFolderPath";
	private final static String ADD_TOP_LEVEL_ANNOT = "addTopLevelAnnot";

	private static final String UNKNOWN = "UNKNOWN";

	private double minSingleValue;
	private double minMultipleValue;
	private int maxNbValues;
	private boolean addTopLevelAnnot;

	private NGramProfilesPatched ngps;

	/**
	 * Read the property file to get fields values.
	 */
	@PostConstruct
	public void init() throws LanguageExtractionException {
		Map<String, String> props = PropertiesLoader.loadProperties(PROPERTY_FILE);
		final String minSingleValueP = props.get(MIN_SINGLE_VALUE);
		if (minSingleValueP != null && !minSingleValueP.isEmpty()) {
			try {
				this.minSingleValue = Double.parseDouble(minSingleValueP);
			} catch (final NumberFormatException nfe) {
				LogFactory.getLog(this.getClass()).warn("Unable to parse double for " + MIN_SINGLE_VALUE + " property. Value was: '" + minSingleValueP + "'.");
				this.minSingleValue = DEFAULT_MIN_SINGLE_VALUE;
			}
		} else {
			this.minSingleValue = DEFAULT_MIN_SINGLE_VALUE;
		}

		final String minMultipleValueP = props.get(MIN_MULTIPLE_VALUE);
		if (minMultipleValueP != null && !minMultipleValueP.isEmpty()) {
			try {
				this.minMultipleValue = Double.parseDouble(minMultipleValueP);
			} catch (final NumberFormatException nfe) {
				LogFactory.getLog(this.getClass()).warn("Unable to parse double for " + MIN_MULTIPLE_VALUE + " property. Value was: '" + minMultipleValueP + "'.");
				this.minMultipleValue = DEFAULT_MIN_MULTIPLE_VALUE;
			}
		} else {
			this.minMultipleValue = DEFAULT_MIN_MULTIPLE_VALUE;
		}

		if (this.minSingleValue < this.minMultipleValue) {
			LogFactory.getLog(this.getClass()).warn(MIN_SINGLE_VALUE + " was smaller than " + MIN_MULTIPLE_VALUE + ". Use the two default value instead.");
			this.minSingleValue = DEFAULT_MIN_SINGLE_VALUE;
			this.minMultipleValue = DEFAULT_MIN_MULTIPLE_VALUE;
		}

		LogFactory.getLog(this.getClass()).debug("LanguageExtraction initialised with " + MIN_SINGLE_VALUE + "=" + this.minSingleValue);
		LogFactory.getLog(this.getClass()).debug("LanguageExtraction initialised with " + MIN_MULTIPLE_VALUE + "=" + this.minMultipleValue);

		final String maxNbValuesP = props.get(MAX_NB_VALUES);
		if (maxNbValuesP != null && !maxNbValuesP.isEmpty()) {
			try {
				this.maxNbValues = Integer.parseInt(maxNbValuesP);
			} catch (final NumberFormatException nfe) {
				LogFactory.getLog(this.getClass()).warn("Unable to parse double for " + MAX_NB_VALUES + " property. Value was: '" + maxNbValuesP + "'.");
				this.maxNbValues = DEFAULT_MAX_NB_VALUES;
			}
		} else {
			this.maxNbValues = DEFAULT_MAX_NB_VALUES;
		}

		if (this.maxNbValues < 1) {
			LogFactory.getLog(this.getClass()).warn(MAX_NB_VALUES + " was smaller than 1. Use the two default value instead.");
			this.maxNbValues = DEFAULT_MAX_NB_VALUES;
		}

		LogFactory.getLog(this.getClass()).debug("LanguageExtraction initialised with " + MAX_NB_VALUES + "=" + this.maxNbValues);

		final String profilesFolderPathP = props.get(PROFILES_FOLDER_PATH);
		if (profilesFolderPathP != null && !profilesFolderPathP.isEmpty()) {
			File file = new File(profilesFolderPathP);
			if (!file.exists()) {
				LogFactory.getLog(this.getClass()).warn("File '" + file.getAbsolutePath() + "' does not exists. Creating LanguageExtraction with default configuration.");
				try {
					this.ngps = new NGramProfilesPatched();
				} catch (final IOException ioe) {
					throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
				}
			} else if (!file.canRead()) {
				LogFactory.getLog(this.getClass()).warn("File '" + file.getAbsolutePath() + "' is not readable. Creating LanguageExtraction with default configuration.");
				try {
					this.ngps = new NGramProfilesPatched();
				} catch (final IOException ioe) {
					throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
				}
			} else if (!file.isDirectory()) {
				LogFactory.getLog(this.getClass()).warn("File '" + file.getAbsolutePath() + "' is not a directory. Creating LanguageExtraction with default configuration.");
				try {
					this.ngps = new NGramProfilesPatched();
				} catch (final IOException ioe) {
					throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
				}
			} else {
				try {
					this.ngps = new NGramProfilesPatched(file);
				} catch (final IOException ioe) {
					LogFactory.getLog(this.getClass()).warn(
							"Unable to create NGramProfilesPatched using value of " + PROFILES_FOLDER_PATH + " property. Value was: '" + file.getAbsolutePath() + "'. Try to create default one.", ioe);
					try {
						this.ngps = new NGramProfilesPatched();
					} catch (final IOException ioe2) {
						throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe2);
					}
				}
			}
		} else {
			try {
				this.ngps = new NGramProfilesPatched();
			} catch (final IOException ioe) {
				throw new LanguageExtractionException("Unable to create NGramProfilesPatched using default value.", ioe);
			}
		}

		if (LogFactory.getLog(this.getClass()).isDebugEnabled()) {
			StringBuilder sb = new StringBuilder();
			sb.append("LanguageExtraction initialised with the following " + this.ngps.getProfileCount() + " language profiles: [");
			for (int p = 0; p < this.ngps.getProfileCount(); p++) {
				sb.append(this.ngps.getProfileName(p));
				if (p < this.ngps.getProfileCount() - 1) {
					sb.append(", ");
				} else {
					sb.append("]");
				}
			}
			LogFactory.getLog(this.getClass()).debug(sb.toString());
		}

		final String addTopLevelAnnotP = props.get(ADD_TOP_LEVEL_ANNOT);
		if (addTopLevelAnnotP != null && !addTopLevelAnnotP.isEmpty()) {
			this.addTopLevelAnnot = Boolean.parseBoolean(addTopLevelAnnotP);
		}
	}


	/*
	 * (non-Javadoc)
	 * 
	 * @see org.weblab_project.services.analyser.Analyser#process(org.weblab_project.services.analyser.types.ProcessArgs)
	 */
	@Override
	public ProcessReturn process(ProcessArgs processArgs) throws ProcessException {
		List<Text> texts = this.checkArgs(processArgs);
		final boolean topLevelAnnot = this.addTopLevelAnnot && (processArgs.getResource() instanceof ComposedUnit);
		StringBuilder sb = new StringBuilder();
		for (Text text : texts) {
			if (text.getContent() == null || text.getContent().isEmpty()) {
				LogFactory.getLog(this.getClass()).debug("Text '" + text.getUri() + "' has no content; ignored.");
				continue;
			}
			List<String> profileToAnnotate = this.checkLanguage(text.getContent(), text.getUri());
			this.annotate(text, profileToAnnotate);
			if (topLevelAnnot) {
				sb.append(text.getContent());
				sb.append("\n\n\n");
			}
		}

		if (topLevelAnnot && sb.length() > 0) {
			ComposedUnit cu = (ComposedUnit) processArgs.getResource();

			List<String> profileToAnnotate = this.checkLanguage(sb.toString(), cu.getUri());
			this.annotate(cu, profileToAnnotate);
		}

		ProcessReturn pr = new ProcessReturn();
		pr.setResource(processArgs.getResource());
		return pr;
	}

	private void annotate(Resource res, List<String> profileToAnnotate) {
		Annotation annot = AnnotationFactory.createAndLinkAnnotation(res);
		PoKHelper pokH = RDFHelperFactory.getPoKHelper(annot);
		pokH.setAutoCommitMode(false);
		for (final String language : profileToAnnotate) {
			pokH.createLitStat(res.getUri(), DublinCore.LANGUAGE_PROPERTY_NAME, language);
		}
		pokH.commit();
	}

	private List<String> checkLanguage(final String content, final String uri) {
		List<String> profileToAnnotate = new LinkedList<String>();
		Ranker ranker = this.ngps.getRanker();
		ranker.account(content);
		RankResult result = ranker.getRankResult();

		boolean warn = false;
		// Profile are listed in their rank order	
		final double bestScore = result.getScore(0);
		if (bestScore > this.minSingleValue) {
			profileToAnnotate.add(result.getName(0));
		} else if (bestScore < this.minMultipleValue) {
			profileToAnnotate.add(UNKNOWN);
			warn = true;
		} else {
			final int max = Math.min(result.getLength(), this.maxNbValues);
			for (int p = 0; p < max; p++) {
				if (result.getScore(p) >= this.minMultipleValue) {
					profileToAnnotate.add(result.getName(p));
				} else {
					break;
				}
			}
		}

		if (LogFactory.getLog(this.getClass()).isDebugEnabled() || warn) {
			StringBuilder sb = new StringBuilder();
			sb.append("Language detected for MediaUnit '" + uri + "' are: [");
			for (int p = 0; p < result.getLength(); p++) {
				sb.append(result.getName(p));
				sb.append(" - ");
				sb.append(result.getScore(p));
				if (p < result.getLength() - 1) {
					sb.append(" --|-- ");
				} else {
					sb.append("]");
				}
			}
			if (warn) {
				LogFactory.getLog(this.getClass()).warn(sb.toString());
				LogFactory.getLog(this.getClass()).warn("Unable to identify language for MediaUnit '" + uri + "'; " + profileToAnnotate + " will be annotated.");
			} else {
				LogFactory.getLog(this.getClass()).debug(sb.toString());
				LogFactory.getLog(this.getClass()).debug("Language to be annotated for MediaUnit '" + uri + "' are: " + profileToAnnotate);
			}


		}

		return profileToAnnotate;
	}

	private List<Text> checkArgs(final ProcessArgs processArg) throws ProcessException {
		if (processArg == null) {
			throw new ProcessException("ProcessArgs was null.", createE1Exception());
		}
		Resource res = processArg.getResource();
		if (res == null) {
			throw new ProcessException("Resource in ProcessArgs was null.", createE1Exception());
		}
		if (!(res instanceof MediaUnit)) {
			throw new ProcessException("Resource in ProcessArgs was not an instance of MediaUnit but of '" + res.getClass().getCanonicalName() + "'.", createE1Exception());
		}
		List<Text> texts;
		if (res instanceof ComposedUnit) {
			texts = ResourceUtil.getSelectedSubResources(res, Text.class);
		} else if (res instanceof Text) {
			texts = new LinkedList<Text>();
			texts.add((Text) res);
		} else {
			throw new ProcessException("Resource in ProcessArgs was not neither an instance of ComposedUnit nor of Text but of '" + res.getClass().getCanonicalName() + "'.", createE1Exception());
		}
		return texts;
	}

	private WebLabException createE1Exception() {
		WebLabException wle = new WebLabException();
		wle.setErrorId("E1");
		wle.setErrorMessage("Invalid parameter");
		return wle;
	}

}
