/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.services.duplicates;

import java.io.File;
import java.util.List;
import java.util.Map;

import javax.jws.WebService;
import javax.servlet.ServletContext;
import javax.xml.ws.WebServiceContext;
import javax.xml.ws.handler.MessageContext;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.ow2.weblab.services.indexer.impl.SolrIndexer;
import org.ow2.weblab.services.searcher.impl.SolrSearcher;
import org.ow2.weblab.services.solr.SolrComponent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.weblab_project.core.exception.WebLabCheckedException;
import org.weblab_project.core.factory.AnnotationFactory;
import org.weblab_project.core.helper.PoKHelper;
import org.weblab_project.core.helper.RDFHelperFactory;
import org.weblab_project.core.helper.ResourceHelper;
import org.weblab_project.core.model.Annotation;
import org.weblab_project.core.model.MediaUnit;
import org.weblab_project.core.ontologies.DublinCore;
import org.weblab_project.core.ontologies.WebLab;
import org.weblab_project.core.properties.PropertiesLoader;
import org.weblab_project.services.analyser.Analyser;
import org.weblab_project.services.analyser.ProcessException;
import org.weblab_project.services.analyser.types.ProcessArgs;
import org.weblab_project.services.analyser.types.ProcessReturn;
import org.weblab_project.services.exception.WebLabException;
import org.weblab_project.services.indexer.IndexException;

/**
 * Analyser web service that checks whether the text of a received {@link MediaUnit} is a near copy of a document
 * already stored in the Solr index handled by {@link SolrComponent}.
 * <p>
 * When a duplicate is detected, an annotation carrying the <code>WebLab.CAN_BE_IGNORED</code> statement (value
 * <code>"true"</code>) is attached to the unit; the unit itself is always returned to the caller.
 * </p>
 */
@WebService(endpointInterface = "org.weblab_project.services.analyser.Analyser")
public class DuplicatesDetectorService implements Analyser {

	private static final Logger logger = LoggerFactory.getLogger(DuplicatesDetectorService.class);

	/** Web service context injected by the container; stays <code>null</code> when run outside a container. */
	@javax.annotation.Resource
	protected WebServiceContext wsContext;

	/** Solr access component, (re)assigned from {@link SolrComponent#getInstance()} on every call to process. */
	private SolrComponent comp;

	/**
	 * Config file properties
	 */
	protected static final String CONFIG_FILE = "duplicates-detector.config";
	protected static final String SIMILARITY_LIMIT_PROPERTY = "similarityLimit";
	/** Content of {@link #CONFIG_FILE}, loaded lazily the first time the similarity limit is needed. */
	protected static Map<String, String> props;

	/**
	 * Process method is composed of 2 steps. First, extract the source property values from the resource and search
	 * them in the duplicate index. If results are found, compare their texts to the text of the received resource
	 * using the Levenshtein distance. If no duplicate is found that way, try a "more like this" Solr query and
	 * compare the texts of its results with the Levenshtein distance as well.<br/>
	 * The similarity limit, which is configurable in the "duplicates-detector.config" file, is used to determine
	 * whether a document is a duplicate or not and is expressed as a percentage. This percentage represents the
	 * number of unmodified characters against the total number of characters.
	 * 
	 * @param args
	 *            the arguments holding the <code>MediaUnit</code> to check
	 * @return a <code>ProcessReturn</code> holding the (possibly annotated) <code>MediaUnit</code>
	 * @throws ProcessException
	 *             if the arguments are invalid or if a <code>WebLabCheckedException</code> occurs while working
	 *             with the index
	 */
	@Override
	public ProcessReturn process(ProcessArgs args) throws ProcessException {
		final MediaUnit unit = DuplicatesDetectorService.checkArgs(args);
		logger.info("Process method of DuplicatesDetectorService called for Document: {}", unit.getUri());

		this.comp = SolrComponent.getInstance();
		try {
			this.comp.open(this.getWebAppPath());

			// Cheap test first (same source property); the "more like this" test is only run when it fails
			// because it has to add the document to the index.
			boolean docIsAlreadyPresent = this.testDuplicateWithSourceProperty(unit);
			if (!docIsAlreadyPresent) {
				docIsAlreadyPresent = this.testDuplicateWithSolrMoreLikeThisQuery(unit);
			}

			if (docIsAlreadyPresent) {
				logger.info("Document ({}) is a duplicate.", unit.getUri());
				final Annotation annot = AnnotationFactory.createAndLinkAnnotation(unit);
				final PoKHelper pokHlp = RDFHelperFactory.getPoKHelper(annot);
				pokHlp.createLitStat(unit.getUri(), WebLab.CAN_BE_IGNORED, "true");
			}
			else {
				logger.info("Document ({}) is a new document.", unit.getUri());
			}
		}
		catch (WebLabCheckedException e) {
			final WebLabException exp = new WebLabException();
			exp.setErrorId("E0");
			exp.setErrorMessage("Unexpected error");
			throw new ProcessException("Error when calling solr index.", exp, e);
		}

		final ProcessReturn pr = new ProcessReturn();
		pr.setResource(unit);
		return pr;
	}

	/**
	 * Extract the source property values from the MediaUnit in parameter and search the index with them. For each
	 * resource found, compute the Levenshtein similarity and compare it to the similarity limit to determine
	 * whether it is a duplicate or not.
	 * 
	 * @param unit
	 *            the MediaUnit to search
	 * @return true if a duplicate resource was found
	 * @throws WebLabCheckedException
	 *             if the similarity limit is not configured, or if raised by the text extraction or the search
	 */
	private boolean testDuplicateWithSourceProperty(MediaUnit unit) throws WebLabCheckedException {
		// Retrieve dublin core source property from resource
		final ResourceHelper helper = RDFHelperFactory.getResourceHelper(unit);
		final List<String> sourceProperties = helper.getLitsOnPredSubj(unit.getUri(), DublinCore.SOURCE_PROPERTY_NAME);
		if (sourceProperties == null || sourceProperties.isEmpty()) {
			return false;
		}

		final String textToCompare = DuplicatesDetectorService.removeAllBreakLinesAndMultipleSpaces(SolrComponent
				.extractTextFromResource(unit));
		final float similarityLimit = this.getSimilarityLimit();

		// Join the clauses with a separator rather than appending " OR " after each one and trimming afterwards:
		// StringBuilder.delete has an exclusive end index, which makes the trimming easy to get wrong.
		final StringBuilder sourceQuery = new StringBuilder();
		for (final String sourceProperty : sourceProperties) {
			if (sourceQuery.length() > 0) {
				sourceQuery.append(" OR ");
			}
			sourceQuery.append("source:(\"").append(ClientUtils.escapeQueryChars(sourceProperty)).append("\")");
		}

		// Test if resource already exists in index by searching with source field
		final QueryResponse qr = this.comp.search(sourceQuery.toString(), 0, 10);
		if (!DuplicatesDetectorService.hasHits(qr)) {
			logger.info("No documents with same source property found.");
			return false;
		}

		return DuplicatesDetectorService.containsDuplicate(qr, textToCompare, similarityLimit,
				"Similar document found from source property",
				"Document with same source property found but similiraty");
	}

	/**
	 * Index the MediaUnit in parameter and use a "more like this" query to find similar resources. For each
	 * resource found, compute the Levenshtein similarity and compare it to the similarity limit to determine
	 * whether it is a duplicate or not.
	 * 
	 * @param unit
	 *            the MediaUnit to search
	 * @return true if a duplicate resource was found
	 * @throws WebLabCheckedException
	 *             if the similarity limit is not configured, or if raised by the text extraction or the index
	 */
	private boolean testDuplicateWithSolrMoreLikeThisQuery(MediaUnit unit) throws WebLabCheckedException {
		final String textToCompare = DuplicatesDetectorService.removeAllBreakLinesAndMultipleSpaces(SolrComponent
				.extractTextFromResource(unit));
		// Read the configuration before touching the index so that a configuration error does not add the document.
		final float similarityLimit = this.getSimilarityLimit();

		// First, add document to index to use "more like this" query on its id
		this.comp.addDocument(unit);
		this.comp.flushIndexBuffer();
		// The URI is escaped like the source values are, so that query syntax characters cannot break the query.
		final QueryResponse qr = this.comp.moreLikeThis("id:(\"" + ClientUtils.escapeQueryChars(unit.getUri())
				+ "\")");

		if (!DuplicatesDetectorService.hasHits(qr)) {
			logger.info("No documents found with \"More like this\" query.");
			return false;
		}

		return DuplicatesDetectorService.containsDuplicate(qr, textToCompare, similarityLimit,
				"Similar document found from \"moreLikeThis\" query", "Document found in index but similiraty");
	}

	/**
	 * Tell whether a query response holds at least one document.
	 * 
	 * @param qr
	 *            the response to inspect, may be <code>null</code>
	 * @return true if the response has a non-empty result list
	 */
	private static boolean hasHits(final QueryResponse qr) {
		return qr != null && qr.getResults() != null && !qr.getResults().isEmpty();
	}

	/**
	 * Compare the "text" field of every hit of a response to a reference text and log the outcome for each hit.
	 * All the hits are inspected, even once a duplicate has been found, so that each of them gets logged.
	 * 
	 * @param qr
	 *            the response whose results are inspected; must hold a non-null result list
	 * @param textToCompare
	 *            the normalised text of the document being processed
	 * @param similarityLimit
	 *            the percentage that a similarity must strictly exceed for a hit to be a duplicate
	 * @param duplicateLogPrefix
	 *            beginning of the message logged for a hit above the limit
	 * @param belowLimitLogPrefix
	 *            beginning of the message logged for a hit that does not exceed the limit
	 * @return true if at least one hit is a duplicate
	 */
	private static boolean containsDuplicate(final QueryResponse qr, final String textToCompare,
			final float similarityLimit, final String duplicateLogPrefix, final String belowLimitLogPrefix) {
		boolean duplicateFound = false;
		for (final SolrDocument hit : qr.getResults()) {
			final String hitId = String.valueOf(hit.getFieldValue("id"));
			final String hitText = DuplicatesDetectorService.removeAllBreakLinesAndMultipleSpaces(String.valueOf(hit
					.getFieldValue("text")));
			final float similarity = DuplicatesDetectorService.getLevenshteinSimilarity(textToCompare, hitText);

			if (similarity > similarityLimit) {
				logger.info(duplicateLogPrefix + " (" + similarity + "% of text similarity): " + hitId);
				duplicateFound = true;
			}
			else {
				logger.info(belowLimitLogPrefix + " (" + similarity + "%) is lower than limit (" + similarityLimit
						+ "%): " + hitId);
			}
		}
		return duplicateFound;
	}

	/**
	 * Read the similarity limit from the configuration, loading the configuration file on first use.
	 * 
	 * @return the similarity limit, as a percentage
	 * @throws WebLabCheckedException
	 *             if the configuration or the {@link #SIMILARITY_LIMIT_PROPERTY} entry is missing
	 * @throws NumberFormatException
	 *             if the configured value is not a parsable float
	 */
	private float getSimilarityLimit() throws WebLabCheckedException {
		if (props == null) {
			this.loadProps();
		}
		final String rawLimit = (props == null) ? null : props.get(SIMILARITY_LIMIT_PROPERTY);
		if (rawLimit == null) {
			// Float.parseFloat(null) would only raise a bare NullPointerException.
			throw new WebLabCheckedException("Property [" + SIMILARITY_LIMIT_PROPERTY + "] not found in ["
					+ CONFIG_FILE + "].");
		}
		return Float.parseFloat(rawLimit);
	}

	/**
	 * Provide the percentage of similarity between two <code>String</code>. The Levenshtein distance is used to
	 * calculate this value: <code>100 - distance / length of the longest string * 100</code>.
	 * 
	 * @param s1
	 *            the first <code>String</code> to compare, must not be <code>null</code>
	 * @param s2
	 *            the second <code>String</code> to compare, must not be <code>null</code>
	 * @return percentage of similarity between the two <code>String</code>; 100 when both are empty
	 */
	private static float getLevenshteinSimilarity(String s1, String s2) {
		final int nbOfCharacters = Math.max(s1.length(), s2.length());
		if (nbOfCharacters == 0) {
			// Two empty texts are identical; without this guard the division below is 0 / 0 and yields NaN.
			return 100f;
		}
		final int nbOfChanges = StringUtils.getLevenshteinDistance(s1, s2);
		return 100 - nbOfChanges / (float) nbOfCharacters * 100;
	}

	/**
	 * Validate the arguments received by <code>process</code> and extract the unit to work on.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> to check at the beginning of <code>process</code>.
	 * @return The contained <code>MediaUnit</code>
	 * @throws ProcessException
	 *             if we are unable to extract the contained <code>MediaUnit</code>
	 */
	private static MediaUnit checkArgs(final ProcessArgs args) throws ProcessException {
		final WebLabException wle = new WebLabException();
		wle.setErrorId("E1");
		wle.setErrorMessage("Invalid parameter");
		if (args == null) {
			logger.error("ProcessArgs was null.");
			throw new ProcessException("ProcessArgs was null.", wle);
		}
		if (args.getResource() == null) {
			logger.error("Args must contain a non-null Resource to index");
			throw new ProcessException("Args must contain a " + "non-null Resource to index", wle);
		}
		if (!(args.getResource() instanceof MediaUnit)) {
			logger.error("Resource to index as not a MediaUnit.");
			throw new ProcessException("Resource to index " + "as not a MediaUnit.", wle);
		}
		return (MediaUnit) args.getResource();
	}

	/**
	 * Replace every run of whitespace characters (line breaks and non-breaking spaces included) by a single space.
	 * 
	 * @param textToProcess
	 *            the text to normalise, may be <code>null</code>
	 * @return the normalised text, or an empty <code>String</code> if the text was <code>null</code>
	 */
	private static String removeAllBreakLinesAndMultipleSpaces(String textToProcess) {
		if (textToProcess == null) {
			return "";
		}
		return textToProcess.replaceAll("[\\s\\xA0]+", " ");
	}

	/**
	 * Return the Web application path: the real path of <code>WEB-INF/classes</code> when a web service context is
	 * available, <code>./src/main/resources</code> otherwise.
	 * 
	 * @return webapp path
	 * @throws WebLabCheckedException
	 *             if the path cannot be resolved or does not exist
	 */
	private String getWebAppPath() throws WebLabCheckedException {
		String appPath = "./src/main/resources";
		if (this.wsContext != null) {
			final ServletContext ctx = (ServletContext) this.wsContext.getMessageContext().get(
					MessageContext.SERVLET_CONTEXT);
			if (ctx == null) {
				throw new WebLabCheckedException("Servlet context not available in the web service message context.");
			}
			appPath = ctx.getRealPath("WEB-INF/classes");
			if (appPath == null) {
				// getRealPath may return null; new File(null) would then fail with a NullPointerException.
				throw new WebLabCheckedException("Unable to resolve the real path of [WEB-INF/classes].");
			}
		}
		else {
			logger.warn("Webservice context not available returning current local path as default web app path");
		}

		if (!new File(appPath).exists()) {
			throw new WebLabCheckedException("Webapp path [" + appPath + "] does not exists...");
		}
		return appPath;
	}

	/**
	 * Load the content of {@link #CONFIG_FILE} into {@link #props}.
	 */
	private void loadProps() {
		props = PropertiesLoader.loadProperties(CONFIG_FILE);
	}

	/**
	 * @return the wsContext
	 */
	public WebServiceContext getWsContext() {
		return this.wsContext;
	}

	/**
	 * @param wsContext
	 *            the wsContext to set
	 */
	public void setWsContext(WebServiceContext wsContext) {
		this.wsContext = wsContext;
	}

}
