/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2011 Cassidian, an EADS company
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.service.normaliser.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.jws.WebService;
import javax.xml.bind.DatatypeConverter;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.ClimateForcast;
import org.apache.tika.metadata.CreativeCommons;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.ow2.weblab.content.api.ContentManager;
import org.ow2.weblab.content.impl.FileContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.jaxb.XMLStringCleaner;
import org.ow2.weblab.core.extended.ontologies.DCTerms;
import org.ow2.weblab.core.extended.ontologies.DublinCore;
import org.ow2.weblab.core.extended.ontologies.RDFS;
import org.ow2.weblab.core.extended.ontologies.WebLabProcessing;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.helper.impl.JenaPoKHelper;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.ContentNotAvailableException;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.ow2.weblab.rdf.Value;
import org.ow2.weblab.service.normaliser.tika.handlers.WebLabHandlerDecorator;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * The Tika extractor is quite simple since it does not handle the structure of documents (sheets in Excel, paragraphs in Word, etc.), although that
 * structure could have been represented as various MediaUnits.
 * 
 * @todo Rewrite the class comment which is not good... TODO
 */
@WebService(endpointInterface = "org.ow2.weblab.core.services.Analyser")
public class TikaExtractorService implements Analyser {

	/**
	 * The logger to be used inside this class.
	 */
	protected final Log logger;

	/**
	 * The <code>ContentManager</code> to use. Various implementations exist; they are defined through a configuration file.
	 */
	final protected ContentManager contentManager;

	/**
	 * The configuration to be used for the service.
	 */
	final protected TikaConfiguration serviceConfig;

	/**
	 * The configuration of Tika itself.
	 */
	final protected TikaConfig tikaConfig;

	/**
	 * Whether or not to remove the content file after processing. Just a flag computed once in the constructor to prevent calculation on each process method
	 * call. <code>true</code> if and only if the reader of the content manager is not a <code>FileContentManager</code> AND the service configuration asks for
	 * the removal of temporary content.
	 */
	final protected boolean removeContent;

	/**
	 * The formatter used to annotate dates (like 2011-12-31); its pattern is <code>yyyy-MM-dd</code>.
	 * 
	 * NOTE(review): <code>SimpleDateFormat</code> is not thread-safe and this single instance is shared by every call on the service; confirm that the service
	 * is never invoked concurrently.
	 */
	protected final DateFormat simpleDateFormat;

	/**
	 * The only constructor of this class that needs a configuration.
	 * 
	 * @param conf
	 *            The service configuration.
	 * 
	 * @throws IOException
	 *             If an error occurs accessing the tika configuration or instanciating the content manager.
	 * @throws TikaException
	 *             If an error occurs reading the tika configuration.
	 */
	public TikaExtractorService(final TikaConfiguration conf) throws TikaException, IOException {
		this.logger = LogFactory.getLog(this.getClass());
		this.serviceConfig = conf;
		this.contentManager = ContentManager.getInstance();
		this.removeContent = (!(this.contentManager.getReader() instanceof FileContentManager)) && this.serviceConfig.isRemoveTempContent();
		this.simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");

		if (this.serviceConfig.getPathToXmlConfigurationFile() == null) {
			this.logger.debug(Messages.getString(Constants.KEY_DEBUG_DEFAULT_TIKA_CONFIG));
			this.tikaConfig = new TikaConfig();
		} else {
			try {
				this.tikaConfig = new TikaConfig(this.getClass().getClassLoader().getResource(this.serviceConfig.getPathToXmlConfigurationFile()));
			} catch (SAXException e) {
				throw new IOException(e);
			}
		}

		if (this.contentManager == null) {
			this.logger.fatal(Messages.getString(Constants.KEY_ERROR_UNABLE_TO_LOAD_CONTENT_MANAGER));
			throw new IOException(Messages.getString(Constants.KEY_ERROR_UNABLE_TO_LOAD_CONTENT_MANAGER));
		}

		if (!(this.tikaConfig.getParser() instanceof CompositeParser)) {
			this.logger.warn(Messages.getString(Constants.KEY_WARN_NOT_A_COMPOSITE_PARSER_1, this.tikaConfig.getParser().getClass().getCanonicalName()));
		}

		this.logger.info(Messages.getString(Constants.KEY_INFO_SERVICE_STARTED));
	}

	/**
	 * Extracts the text (and optionally the metadata) of the native content of the document received, and returns the enriched document.
	 * 
	 * A first extraction is done using the mime type annotated on the document, if any. If no <code>Text</code> unit has been created by this first pass,
	 * a second extraction is done letting Tika detect the type of the content by itself.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> that must contain a <code>Document</code>.
	 * @return A <code>ProcessReturn</code> wrapping the document received in <code>args</code>.
	 * @throws InvalidParameterException
	 *             If <code>args</code> does not contain a <code>Document</code>.
	 * @throws ContentNotAvailableException
	 *             If the native content of the document cannot be read.
	 * @throws UnexpectedException
	 *             If the parsing of the content fails.
	 */
	@Override
	public ProcessReturn process(final ProcessArgs args) throws InvalidParameterException, ContentNotAvailableException, UnexpectedException {
		this.logger.trace("Process method called.");

		// Check that the processArgs contains a document and return it.
		final Document document = this.checkArgs(args);

		this.logger.info(Messages.getString(Constants.KEY_INFO_PROCESS_DOCUMENT_1, document.getUri()));

		// Check that the document contains a file in content manager and return
		// it.
		final File file = this.getContent(document);

		try {
			// Feed the document with Text unit from file content and put metadata
			// in the map.
			Map<String, List<String>> toAnnot = this.extractTextAndMetadata(document, file, false);

			// If no text unit are extracted from document, try to extract once
			// again but with the auto-detect parser.
			if (ResourceUtil.getSelectedSubResources(document, Text.class).isEmpty()) {
				this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_TEXT_FOUND_2, file.getAbsolutePath(), document.getUri()));
				toAnnot = this.extractTextAndMetadata(document, file, true);
			}

			// Annotate the document with content of the map.
			if (this.serviceConfig.isAddMetadata()) {
				this.annotate(document, toAnnot);
			}
		} finally {
			// Remove temporary content file if needed. Done in a finally block so that the file does not leak when the extraction fails.
			if (this.removeContent && !file.delete()) {
				this.logger.warn(Messages.getString(Constants.KEY_WARN_UNABLE_TO_DELETE_TEMP_2, file.getAbsolutePath(), document.getUri()));
			}
		}

		// Create the return wrapper.
		final ProcessReturn pr = new ProcessReturn();
		pr.setResource(document);

		this.logger.info(Messages.getString(Constants.KEY_INFO_END_OF_PROCESS_1, document.getUri()));

		return pr;
	}

	/**
	 * Validates the arguments of the process method and extracts the <code>Document</code> they carry.
	 * 
	 * Any validation failure is logged at error level before being thrown.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> of the process method.
	 * @return The <code>Document</code> held by <code>args</code>.
	 * @throws InvalidParameterException
	 *             If <code>args</code> is <code>null</code>, or if its resource is <code>null</code> or is not a <code>Document</code>.
	 */
	protected Document checkArgs(final ProcessArgs args) throws InvalidParameterException {
		final Resource candidate = (args == null) ? null : args.getResource();

		// Select the error message matching the first failed check; a valid document leaves the method right away.
		final String problem;
		if (args == null) {
			problem = Messages.getString(Constants.KEY_ERROR_PROCESSARGS_NULL);
		} else if (candidate == null) {
			problem = Messages.getString(Constants.KEY_ERROR_RESOURCE_NULL);
		} else if (candidate instanceof Document) {
			return (Document) candidate;
		} else {
			problem = Messages.getString(Constants.KEY_ERROR_NOT_A_DOCUMENT_2, candidate.getUri(), candidate.getClass().getCanonicalName());
		}

		this.logger.error(problem);
		throw new InvalidParameterException(problem, Messages.getString(Constants.KEY_ERROR_INVALID_PARAM));
	}

	/**
	 * Asks the content manager for the native content of the document and makes sure that the resulting file can be used.
	 * 
	 * @param document
	 *            The document whose native content has to be read.
	 * @return The file provided by the content manager; it exists and is readable.
	 * @throws ContentNotAvailableException
	 *             If the content manager fails, or if the file it provides does not exist or is not readable.
	 */
	private File getContent(final Document document) throws ContentNotAvailableException {
		final File nativeFile;
		try {
			nativeFile = this.contentManager.readNativeContent(document);
		} catch (final WebLabCheckedException readFailure) {
			final String readError = Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_1, document.getUri());
			this.logger.error(readError, readFailure);
			throw new ContentNotAvailableException(readError, Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_SIMPLE), readFailure);
		}

		// Existence is tested first, then readability; a usable file is returned immediately.
		final String accessError;
		if (!nativeFile.exists()) {
			accessError = Messages.getString(Constants.KEY_ERROR_CONTENT_FILE_NOT_FOUND_2, nativeFile.getAbsolutePath(), document.getUri());
		} else if (nativeFile.canRead()) {
			return nativeFile;
		} else {
			accessError = Messages.getString(Constants.KEY_ERROR_CONTENT_FILE_NOT_READABLE_2, nativeFile.getAbsolutePath(), document.getUri());
		}

		this.logger.error(accessError);
		throw new ContentNotAvailableException(accessError, Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_SIMPLE));
	}

	/**
	 * Parses <code>contentFile</code> with Tika, lets the MediaUnit handler enrich <code>document</code> during the parsing and, depending on the service
	 * configuration, guesses the language of the content and saves an XHTML version of it as normalised content through the content manager.
	 * 
	 * The temporary XHTML file is only created when the generation of normalised content is enabled, and it is always deleted before leaving the method, even
	 * when the parsing fails.
	 * 
	 * @param document
	 *            The document to be fill with MediaUnit units
	 * @param contentFile
	 *            The file to be parsed
	 * @param forceAutoDetectParser
	 *            Whether to let Tika guess the parser to use from file content or use existing mimeType on the document (dc:format) to select the appropriated
	 *            parser.
	 * @return The map of predicates and literal values built from the metadata extracted by Tika, or an empty map if the service is configured not to add
	 *         metadata.
	 * @throws UnexpectedException
	 *             If the Tika parser fails.
	 * @throws ContentNotAvailableException
	 *             If the file is not reachable. (This should not happen since its access has been checked before)
	 */
	public Map<String, List<String>> extractTextAndMetadata(final Document document, final File contentFile, final boolean forceAutoDetectParser)
			throws UnexpectedException, ContentNotAvailableException {

		/*
		 * Try to get mimeType in media unit if forceAutoDetectParser condition is false
		 */
		final String mimeType;
		if (forceAutoDetectParser) {
			mimeType = null;
		} else {
			final Value<String> format = new DublinCoreAnnotator(document).readFormat();
			if ((format != null) && format.hasValue()) {
				mimeType = format.firstTypedValue();
				if (format.getValues().size() > 1) {
					this.logger.warn(Messages.getString(Constants.KEY_WARN_MORE_THAN_ONE_TYPE_2, document.getUri(), mimeType));
				}
			} else {
				mimeType = null;
			}
			this.logger.debug("Mime type detected in Resource: " + mimeType);
		}

		/*
		 * If the mime type not defined, uses the auto-detect parser. Otherwise, look up in the Tika config to get the appropriated parser.
		 */
		final Parser parser;
		if (mimeType == null) {
			parser = new AutoDetectParser(this.tikaConfig);
		} else if (this.tikaConfig.getParser() instanceof CompositeParser) {
			final CompositeParser composite = (CompositeParser) this.tikaConfig.getParser();
			final MediaType mediaType = MediaType.parse(mimeType);
			if (composite.getParsers().containsKey(mediaType)) {
				parser = composite.getParsers().get(mediaType);
			} else {
				this.logger.debug("No parser for type " + mediaType + " let Tika guess type.");
				parser = new AutoDetectParser(this.tikaConfig);
			}
		} else {
			parser = this.tikaConfig.getParser();
			// The Parser in the configuration is not composite. That's weird?
			// We only parser one type of file?
			this.logger.debug("Tika Config does not use an AutodetectParser but a " + parser.getClass().getCanonicalName() + ".");
		}

		// The handler that will guess language in the document
		final ProfilingHandler langGuesser = new ProfilingHandler();

		// The language guesser is only plugged when the language has to be annotated on the document.
		final boolean guessLanguage = this.serviceConfig.isAddMetadata() && this.serviceConfig.isAnnotateDocumentWithLang();

		/*
		 * Create an xhtmlOutput file in the temp directory, only if the generation of normalised content is enabled. The variable generateHtml is used to keep
		 * track on errors. It stays true until the end of the method only if the content can be generated.
		 */
		boolean generateHtml = this.serviceConfig.isGenerateHtml();
		File xhtmlOutputFile = null;
		if (generateHtml) {
			try {
				xhtmlOutputFile = File.createTempFile("tika", ".xhtml");
			} catch (final IOException ioe) {
				this.logger.warn(Messages.getString(Constants.KEY_WARN_UNABLE_TO_CREATE_TEMP_FILE_1, document.getUri()), ioe);
				generateHtml = false;
			}
		}

		try {
			/*
			 * Create the appropriated handler (or tee handler) depending on the things needed.
			 * --> MediaUnit + Language guesser + Normalised content generator
			 * --> MediaUnit + Normalised content generator
			 * --> MediaUnit + Language guesser
			 * --> MediaUnit only.
			 * 
			 * If an error occurs creating the transformer for normalised content generator, it is just skipped and the generateHtml variable is set to false to
			 * prevent use of an empty content.
			 */
			final ContentHandler muHandler = this.getMUCreatorCHandler(document);
			ContentHandler htmlHandler = null;
			if (generateHtml) {
				try {
					htmlHandler = this.getHtmlCreatorCHandler(xhtmlOutputFile);
				} catch (final TransformerConfigurationException tce) {
					this.logger.warn(Messages.getString(Constants.KEY_WARN_UNABLE_TO_CREATE_TRANSFORMER_1, document.getUri()), tce);
					generateHtml = false;
				}
			}

			final ContentHandler handler;
			if (guessLanguage && (htmlHandler != null)) {
				this.logger.trace("Create a TeeContentHandler for language guesser, MediaUnit creation and XHTML output creation.");
				handler = new TeeContentHandler(muHandler, langGuesser, htmlHandler);
			} else if (htmlHandler != null) {
				this.logger.trace("Create a TeeContentHandler for MediaUnit creation and XHTML output creation.");
				handler = new TeeContentHandler(muHandler, htmlHandler);
			} else if (guessLanguage) {
				this.logger.trace("Create a TeeContentHandler for language guesser and MediaUnit creation.");
				handler = new TeeContentHandler(muHandler, langGuesser);
			} else {
				this.logger.trace("Create a ContentHandler for MediaUnit creation.");
				handler = muHandler;
			}

			// The metadata object to be filled by Tika parser.
			final Metadata metadata = new Metadata();

			// The parsecontext
			final ParseContext context = new ParseContext();

			// The inputstream on the content to parse
			final InputStream stream;
			try {
				stream = new FileInputStream(contentFile);
			} catch (final FileNotFoundException fnfe) {
				final String err = Messages.getString(Constants.KEY_ERROR_CONTENT_FILE_NOT_FOUND_2, contentFile.getAbsolutePath(), document.getUri());
				// Keep the cause, both in the log and in the exception thrown.
				this.logger.error(err, fnfe);
				throw new ContentNotAvailableException(err, Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_SIMPLE), fnfe);
			}

			this.logger.debug("Start parsing " + contentFile.getPath() + " for document " + document.getUri() + ".");
			try {
				parser.parse(stream, handler, metadata, context);
			} catch (final IOException ioe) {
				final String err = Messages.getString(Constants.KEY_ERROR_IOE_ON_CONTENT_2, contentFile.getPath(), document.getUri());
				this.logger.error(err, ioe);
				throw new UnexpectedException(err, Messages.getString(Constants.KEY_ERROR_IOE_ON_CONTENT_SIMPLE), ioe);
			} catch (final SAXException saxe) {
				final String err = Messages.getString(Constants.KEY_ERROR_SAXE_ON_CONTENT_2, contentFile.getPath(), document.getUri());
				this.logger.error(err, saxe);
				throw new UnexpectedException(err, Messages.getString(Constants.KEY_ERROR_ERROR_ON_CONTENT_SIMPLE), saxe);
			} catch (final TikaException te) {
				final String err = Messages.getString(Constants.KEY_ERROR_TIKA_EX_ON_CONTENT_2, contentFile.getPath(), document.getUri());
				this.logger.error(err, te);
				throw new UnexpectedException(err, Messages.getString(Constants.KEY_ERROR_ERROR_ON_CONTENT_SIMPLE), te);
			} finally {
				IOUtils.closeQuietly(stream);
			}
			this.logger.debug("Finished parsing " + contentFile.getPath() + " for document " + document.getUri() + ".");

			/*
			 * If the language identification is enabled and certain enough, add the language to metadata
			 */
			if (guessLanguage && langGuesser.getLanguage().isReasonablyCertain()) {
				metadata.set(org.apache.tika.metadata.DublinCore.LANGUAGE, langGuesser.getLanguage().getLanguage());
			} else if (this.serviceConfig.isAnnotateDocumentWithLang() && (this.serviceConfig.getDefaultLang() != null)) {
				metadata.set(org.apache.tika.metadata.DublinCore.LANGUAGE, this.serviceConfig.getDefaultLang());
			}

			// Save the XHTML file as normalised content, unless it is missing or empty.
			if (generateHtml) {
				if (!xhtmlOutputFile.exists()) {
					this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_OUTPUT_FILE_2, xhtmlOutputFile.getPath(), document.getUri()));
				} else if (FileUtils.sizeOf(xhtmlOutputFile) <= 0) {
					this.logger.warn(Messages.getString(Constants.KEY_WARN_EMPTY_OUTPUT_FILE_2, xhtmlOutputFile.getPath(), document.getUri()));
				} else {
					final FileInputStream fis;
					try {
						fis = new FileInputStream(xhtmlOutputFile);
						this.logger.debug("Save normalised content file: " + xhtmlOutputFile);
						try {
							this.contentManager.writeNormalisedContent(fis, document);
						} catch (final WebLabCheckedException wlce) {
							this.logger.warn(Messages.getString(Constants.KEY_WARN_ERROR_SAVING_NORMALISED_2, xhtmlOutputFile.getPath(), document.getUri()), wlce);
						} finally {
							IOUtils.closeQuietly(fis);
						}
					} catch (final FileNotFoundException fnfe) {
						this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_OUTPUT_FILE_2, xhtmlOutputFile.getPath(), document.getUri()), fnfe);
					}
				}
			}

			// Convert Tika metadata into WebLab RDF if annotations are needed.
			if (this.serviceConfig.isAddMetadata()) {
				return this.fillMapWithMetadata(metadata);
			}

			return Collections.emptyMap();
		} finally {
			// Always remove the temporary XHTML file, even when the parsing failed. deleteQuietly accepts null (no temp file created).
			FileUtils.deleteQuietly(xhtmlOutputFile);
		}
	}

	/**
	 * Writes every predicate/literal pair of <code>toAnnot</code> as a statement about <code>document</code>, inside a new annotation linked to it.
	 * 
	 * When the service URI is configured, the annotation itself is also described (producer and creation date). Namespace prefixes are only declared when
	 * needed. If the commit fails, a warning is logged and the new annotation is removed from the document.
	 * 
	 * @param document
	 *            The Document to be annotated.
	 * @param toAnnot
	 *            The Map of predicates and their literal values. If empty, a warning is logged and nothing is annotated.
	 */
	protected void annotate(final Document document, final Map<String, List<String>> toAnnot) {
		if (toAnnot.isEmpty()) {
			this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_META_1, document.getUri()));
			return;
		}

		final Annotation annotation = AnnotationFactory.createAndLinkAnnotation(document);
		final JenaPoKHelper helper = new JenaPoKHelper(annotation);
		helper.setAutoCommitMode(false);

		// Track the namespaces really used by the predicates written.
		boolean usesDCTerms = false;
		boolean usesWebLabProcessing = false;
		boolean usesUnmappedNamespace = false;

		for (final Entry<String, List<String>> entry : toAnnot.entrySet()) {
			final String predicate = entry.getKey();
			final List<String> literals = entry.getValue();
			if (literals.isEmpty()) {
				continue;
			}

			if (predicate.startsWith(DCTerms.NAMESPACE)) {
				usesDCTerms = true;
			} else if (predicate.startsWith(WebLabProcessing.NAMESPACE)) {
				usesWebLabProcessing = true;
			} else if (predicate.startsWith(this.serviceConfig.getUnmappedPropertiesBaseUri())) {
				usesUnmappedNamespace = true;
			}

			for (final String literal : literals) {
				helper.createLitStat(document.getUri(), predicate, literal);
			}
		}

		// Describing the annotation itself uses both the DCTerms and the WebLabProcessing namespaces.
		final boolean describeAnnotation = this.serviceConfig.getServiceUri() != null;
		if (describeAnnotation) {
			helper.createResStat(annotation.getUri(), WebLabProcessing.IS_PRODUCED_BY, this.serviceConfig.getServiceUri());
			helper.createLitStat(annotation.getUri(), DCTerms.CREATED, DatatypeConverter.printDateTime(Calendar.getInstance()));
		}
		if (describeAnnotation || usesDCTerms) {
			helper.setNSPrefix(DCTerms.PREFERRED_PREFIX, DCTerms.NAMESPACE);
		}
		if (describeAnnotation || usesWebLabProcessing) {
			helper.setNSPrefix(WebLabProcessing.PREFERRED_PREFIX, WebLabProcessing.NAMESPACE);
		}
		if (usesUnmappedNamespace) {
			helper.setNSPrefix(this.serviceConfig.getUnmappedPropertiesPrefix(), this.serviceConfig.getUnmappedPropertiesBaseUri());
		}

		try {
			helper.commit();
		} catch (final Exception commitFailure) {
			this.logger.warn(Messages.getString(Constants.KEY_WARN_ERROR_COMMIT_2, document.getUri(), toAnnot), commitFailure);
			document.getAnnotation().remove(annotation);
		}
	}

	/**
	 * Instantiates the handler decorator class defined in the service configuration and wires it to the document, to the service configuration and to a new
	 * <code>BodyContentHandler</code> created with a write limit of -1.
	 * 
	 * @param document
	 *            The document to be enriched with mediaUnits
	 * @return The configured handler decorator
	 * @throws UnexpectedException
	 *             If the configured handler decorator class cannot be instantiated.
	 */
	private WebLabHandlerDecorator getMUCreatorCHandler(final Document document) throws UnexpectedException {
		final WebLabHandlerDecorator decorator;
		try {
			decorator = this.serviceConfig.getWebLabHandlerDecoratorClass().newInstance();
		} catch (final InstantiationException instantiationFailure) {
			final String message = Messages.getString(Constants.KEY_ERROR_BAD_HANDLER_1, this.serviceConfig.getWebLabHandlerDecoratorClass().getCanonicalName());
			this.logger.error(message, instantiationFailure);
			throw new UnexpectedException(message, message, instantiationFailure);
		} catch (final IllegalAccessException accessFailure) {
			final String message = Messages.getString(Constants.KEY_ERROR_BAD_HANDLER_1, this.serviceConfig.getWebLabHandlerDecoratorClass().getCanonicalName());
			this.logger.error(message, accessFailure);
			throw new UnexpectedException(message, message, accessFailure);
		}

		final BodyContentHandler bodyHandler = new BodyContentHandler(-1);
		decorator.setDocument(document);
		decorator.setTikaConfiguration(this.serviceConfig);
		decorator.setContentHandler(bodyHandler);
		return decorator;
	}

	/**
	 * Builds a SAX handler that serialises the events it receives, as indented XML, into <code>xhtmlFile</code>.
	 * 
	 * @param xhtmlFile
	 *            The file in which the XHTML should be written.
	 * @return A ContentHandler that writes into the file
	 * @throws TransformerConfigurationException
	 *             If the content handler cannot be created.
	 */
	private ContentHandler getHtmlCreatorCHandler(final File xhtmlFile) throws TransformerConfigurationException {
		final TransformerHandler xhtmlWriter = ((SAXTransformerFactory) TransformerFactory.newInstance()).newTransformerHandler();

		// Configure the serialisation before binding the output file.
		final javax.xml.transform.Transformer serializer = xhtmlWriter.getTransformer();
		serializer.setOutputProperty(OutputKeys.METHOD, "xml");
		serializer.setOutputProperty(OutputKeys.INDENT, "yes");

		xhtmlWriter.setResult(new StreamResult(xhtmlFile));
		return xhtmlWriter;
	}

	/**
	 * Replaces each value by its version containing only recommended XML chars, removes the values that are <code>null</code> or blank once cleaned, and
	 * finally removes the predicates left without any value.
	 * 
	 * The emptiness test is done on the cleaned value (and not on the raw one) so that a value becoming blank after the cleaning is removed too.
	 * 
	 * @param toAnnot
	 *            The <code>Map</code> of predicates and values to be cleaned; it is modified in place.
	 */
	private void cleanMap(final Map<String, List<String>> toAnnot) {
		final Set<String> predToRemove = new HashSet<String>();
		for (final Entry<String, List<String>> entry : toAnnot.entrySet()) {
			final List<String> values = entry.getValue();
			if (values == null) {
				// A predicate without any list of values is handled as an empty one.
				predToRemove.add(entry.getKey());
				continue;
			}

			final ListIterator<String> listIt = values.listIterator();
			while (listIt.hasNext()) {
				final String val = listIt.next();
				final String cleaned = (val == null) ? null : XMLStringCleaner.getXMLRecommendedString(val);
				if ((cleaned == null) || cleaned.trim().isEmpty()) {
					listIt.remove();
				} else {
					listIt.set(cleaned);
				}
			}
			if (values.isEmpty()) {
				predToRemove.add(entry.getKey());
			}
		}

		// Removal is done after the iteration to prevent modifying the map while iterating on its entries.
		for (final String keyToRemove : predToRemove) {
			toAnnot.remove(keyToRemove);
		}
	}

	/**
	 * The method converts the metadata extracted by Tika into a Map of predicates with their values that can be annotated.
	 * 
	 * It can map some Tikas properties with DublinCore and DCTerms ones and for any metadata it also create a dirty predicate using the base URI.
	 * 
	 * @param metadata
	 *            The dirty map of metadata extrated by Tika.
	 * @return A map of RDF predicates and their literal values.
	 */
	protected Map<String, List<String>> fillMapWithMetadata(final Metadata metadata) {
		final Map<String, List<String>> toAnnot = new HashMap<String, List<String>>();

		for (final String name : metadata.names()) {

			final String[] values = metadata.getValues(name);

			if (values.length == 0) {
				// Just in case
				continue;
			}

			/*
			 * Start a dirty if else if on each metadata type.
			 * 
			 * Sometimes Tika handle Properties that contains Object instead of Strings. Some properties are not mapped to existing ontologies, in this case, we
			 * a dedicated namespace if needed.
			 */

			// ClimateForcast
			if (name.equalsIgnoreCase(ClimateForcast.CONTACT) || name.equalsIgnoreCase(ClimateForcast.INSTITUTION)) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(ClimateForcast.REFERENCES)) {
				this.addToAnnot(toAnnot, DCTerms.REFERENCES, values);
			} else if (name.equalsIgnoreCase(ClimateForcast.SOURCE)) {
				this.addToAnnot(toAnnot, DublinCore.SOURCE_PROPERTY_NAME, values);
			} // + About 10 properties that will be mapped to Tika ns

			// CreativeCommons
			else if (name.equalsIgnoreCase(CreativeCommons.LICENSE_LOCATION) || name.equalsIgnoreCase(CreativeCommons.LICENSE_URL)) {
				this.addToAnnot(toAnnot, DCTerms.LICENSE, values);
			} // WorkType that will be mapped to Tika ns

			else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.CONTRIBUTOR)) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.COVERAGE)) {
				this.addToAnnot(toAnnot, DublinCore.CREATOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.CREATOR)) {
				this.addToAnnot(toAnnot, DublinCore.CREATOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.DESCRIPTION)) {
				this.addToAnnot(toAnnot, DublinCore.DESCRIPTION_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.FORMAT)) {
				this.addToAnnot(toAnnot, DublinCore.FORMAT_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.IDENTIFIER)) {
				this.addToAnnot(toAnnot, DublinCore.IDENTIFIER_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.LANGUAGE) && this.serviceConfig.isAnnotateDocumentWithLang()) {
				this.addToAnnot(toAnnot, DublinCore.LANGUAGE_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.MODIFIED)) {
				for (final String value : values) {
					this.addToAnnot(toAnnot, DCTerms.MODIFIED, new String[] { this.convertToISO8601Date(value) });
				}
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.PUBLISHER)) {
				this.addToAnnot(toAnnot, DublinCore.PUBLISHER_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.RELATION)) {
				this.addToAnnot(toAnnot, DublinCore.RELATION_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.RIGHTS)) {
				this.addToAnnot(toAnnot, DublinCore.RIGHTS_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.SOURCE)) {
				this.addToAnnot(toAnnot, DublinCore.SOURCE_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.SUBJECT)) {
				this.addToAnnot(toAnnot, DublinCore.SUBJECT_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.TITLE)) {
				this.addToAnnot(toAnnot, DublinCore.TITLE_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.TYPE)) {
				this.addToAnnot(toAnnot, DublinCore.TYPE_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(org.apache.tika.metadata.DublinCore.DATE.getName())) {
				final Date d = metadata.getDate(org.apache.tika.metadata.DublinCore.DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DublinCore.DATE_PROPERTY_NAME, new String[] { this.simpleDateFormat.format(d) });
				}
			} // Ok

			// Geographic, 3 properties none mapped to a standard ontology

			// HttpHeaders
			else if (name.equalsIgnoreCase(HttpHeaders.LAST_MODIFIED.getName())) {
				final Date d = metadata.getDate(HttpHeaders.LAST_MODIFIED);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.MODIFIED, new String[] { this.simpleDateFormat.format(d) });
				}
			} // + 8 properties mapped to Tika ns

			// Message
			else if (name.equalsIgnoreCase(Message.MESSAGE_FROM)) {
				this.addToAnnot(toAnnot, DublinCore.CREATOR_PROPERTY_NAME, values);
			} // + 4 properties mapped to Tika ns

			else if (name.equalsIgnoreCase(MSOffice.AUTHOR)) {
				this.addToAnnot(toAnnot, DublinCore.CREATOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(MSOffice.CHARACTER_COUNT)) {
				this.addToDCExtent(toAnnot, values, "characters");
			} else if (name.equalsIgnoreCase(MSOffice.CHARACTER_COUNT_WITH_SPACES)) {
				this.addToDCExtent(toAnnot, values, "characters (with spaces)");
			} else if (name.equalsIgnoreCase(MSOffice.COMMENTS)) {
				this.addToAnnot(toAnnot, RDFS.COMMENT, values);
			} else if (name.equalsIgnoreCase(MSOffice.COMPANY)) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(MSOffice.KEYWORDS)) {
				this.addToAnnot(toAnnot, DublinCore.DESCRIPTION_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(MSOffice.LAST_AUTHOR)) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(MSOffice.LINE_COUNT)) {
				this.addToDCExtent(toAnnot, values, "lines");
			} else if (name.equalsIgnoreCase(MSOffice.MANAGER)) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(MSOffice.NOTES)) {
				this.addToAnnot(toAnnot, RDFS.COMMENT, values);
			} else if (name.equalsIgnoreCase(MSOffice.PAGE_COUNT)) {
				this.addToDCExtent(toAnnot, values, "pages");
			} else if (name.equalsIgnoreCase(MSOffice.PARAGRAPH_COUNT)) {
				this.addToDCExtent(toAnnot, values, "paragraphs");
			} else if (name.equalsIgnoreCase(MSOffice.SLIDE_COUNT)) {
				this.addToDCExtent(toAnnot, values, "slides");
			} else if (name.equalsIgnoreCase(MSOffice.WORD_COUNT)) {
				this.addToDCExtent(toAnnot, values, "words");
			} else if (name.equalsIgnoreCase(MSOffice.CREATION_DATE.getName())) {
				final Date d = metadata.getDate(MSOffice.CREATION_DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.CREATED, new String[] { this.simpleDateFormat.format(d) });
				}
			} else if (name.equalsIgnoreCase(MSOffice.LAST_SAVED.getName())) {
				final Date d = metadata.getDate(MSOffice.LAST_SAVED);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.MODIFIED, new String[] { this.simpleDateFormat.format(d) });
				}
			} // +12 properties (including a date)

			// PagedText
			else if (name.equalsIgnoreCase(PagedText.N_PAGES.getName())) {
				this.addToDCExtent(toAnnot, values, "pages");
			} // ok

			// TODO Map to the W3C Media Ontology TIFF and XMPDM properties

			// TIFF
			else if (name.equalsIgnoreCase(TIFF.ORIGINAL_DATE.getName())) {
				final Date d = metadata.getDate(TIFF.ORIGINAL_DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.CREATED, new String[] { this.simpleDateFormat.format(d) });
				}
			} // + about 10 other properties

			// XMPDM
			else if (name.equalsIgnoreCase(XMPDM.ARTIST.getName())) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(XMPDM.AUDIO_MOD_DATE.getName())) {
				final Date d = metadata.getDate(XMPDM.AUDIO_MOD_DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.MODIFIED, new String[] { this.simpleDateFormat.format(d) });
				}
			} else if (name.equalsIgnoreCase(XMPDM.COMPOSER.getName())) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(XMPDM.COPYRIGHT.getName())) {
				this.addToAnnot(toAnnot, DublinCore.RIGHTS_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(XMPDM.ENGINEER.getName())) {
				this.addToAnnot(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(XMPDM.GENRE.getName())) {
				this.addToAnnot(toAnnot, DublinCore.DESCRIPTION_PROPERTY_NAME, values);
			} else if (name.equalsIgnoreCase(XMPDM.LOG_COMMENT.getName())) {
				this.addToAnnot(toAnnot, RDFS.COMMENT, values);
			} else if (name.equalsIgnoreCase(XMPDM.METADATA_MOD_DATE.getName())) {
				final Date d = metadata.getDate(XMPDM.METADATA_MOD_DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.MODIFIED, new String[] { this.simpleDateFormat.format(d) });
				}
			} else if (name.equalsIgnoreCase(XMPDM.RELEASE_DATE.getName())) {
				final Date d = metadata.getDate(XMPDM.METADATA_MOD_DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.AVAILABLE, new String[] { this.simpleDateFormat.format(d) });
				}
			} else if (name.equalsIgnoreCase(XMPDM.SHOT_DATE.getName())) {
				final Date d = metadata.getDate(XMPDM.SHOT_DATE);
				if (d != null) {
					this.addToAnnot(toAnnot, DCTerms.CREATED, new String[] { this.simpleDateFormat.format(d) });
				}
			} // + about 10 other properties

			// Tika properties directly mapped to Tika ns

			// Unmapped properties
			else if (this.serviceConfig.isAddUnmappedProperties()) {
				// Some property names cannot be used directly inside a URI.
				// Replace every non-word character with an underscore before
				// appending the name to the base URI.
				final String cleanedName = name.replaceAll("\\W", "_");
				try {
					final String predicate = new URL(this.serviceConfig.getUnmappedPropertiesBaseUri() + cleanedName).toURI().toString();
					this.addToAnnot(toAnnot, predicate, values);
				} catch (final URISyntaxException urise) {
					this.logger.warn(Messages.getString(Constants.KEY_WARN_UNMAPPED_PROPERTY_ERROR_4, name, this.serviceConfig.getUnmappedPropertiesBaseUri(),
							cleanedName, values.toString()), urise);
				} catch (final MalformedURLException murle) {
					this.logger.warn(Messages.getString(Constants.KEY_WARN_UNMAPPED_PROPERTY_ERROR_4, name, this.serviceConfig.getUnmappedPropertiesBaseUri(),
							cleanedName, values.toString()), murle);
				}
			}
		}

		this.cleanMap(toAnnot);

		return toAnnot;
	}

	/**
	 * Registers every value, suffixed with a space and the given unit, under the <code>DCTerms.EXTENT</code> property.
	 * 
	 * @param toAnnot
	 *            The map of property URI to values that is being filled
	 * @param values
	 *            The raw values; one entry is added per value, so an empty array adds nothing
	 * @param unit
	 *            The unit label appended after each value (e.g. <code>pages</code>)
	 */
	private void addToDCExtent(final Map<String, List<String>> toAnnot, final String[] values, final String unit) {
		final String suffix = " " + unit;
		for (int index = 0; index < values.length; index++) {
			final String[] extent = { values[index] + suffix };
			this.addToAnnot(toAnnot, DCTerms.EXTENT, extent);
		}
	}

	/**
	 * Appends the given values to the list stored for a property, creating that list first when the property is not yet a key of the map.
	 * 
	 * @param toAnnot
	 *            The map of property URI to values that is being filled
	 * @param propertyUri
	 *            The key under which the values are stored
	 * @param values
	 *            The values to append, in array order
	 */
	private void addToAnnot(final Map<String, List<String>> toAnnot, final String propertyUri, final String[] values) {
		final List<String> target;
		if (toAnnot.containsKey(propertyUri)) {
			target = toAnnot.get(propertyUri);
		} else {
			target = new LinkedList<String>();
			toAnnot.put(propertyUri, target);
		}
		target.addAll(Arrays.asList(values));
	}

	/**
	 * Parses a date String and re-formats it with <code>this.simpleDateFormat</code> (expected to be the ISO8601 formatter of this service, as the method
	 * name suggests).
	 * 
	 * The layout used for parsing is chosen from the trimmed input, always with the English locale:
	 * <ul>
	 * <li>first character is a digit: <code>yyyy-MM-dd'T'HH:mm:ss</code> (already ISO8601-like);</li>
	 * <li>no comma in the text: the Office one, e.g. <code>Mon Jan 05 16:53:20 CET 2009</code>;</li>
	 * <li>otherwise: <code>EEE, d MMM yyyy HH:mm:ss Z</code>, e.g. <code>Mon, 5 Jan 2009 16:53:20 +0100</code>.</li>
	 * </ul>
	 * 
	 * @param inDateStr
	 *            The input date, in one of the three layouts above. May be <code>null</code> or blank.
	 * @return The re-formatted date, or the empty String when the input is <code>null</code>, blank, or cannot be parsed (a warning is logged in that last
	 *         case).
	 */
	private String convertToISO8601Date(final String inDateStr) {
		if (inDateStr == null) {
			return "";
		}
		final String tmpDateStr = inDateStr.trim();
		if (tmpDateStr.equals("")) {
			return "";
		}

		final SimpleDateFormat sdf;
		if (Character.isDigit(tmpDateStr.charAt(0))) {
			sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
		} else if (!tmpDateStr.contains(",")) {
			// The Office layout carries a 24-hour clock, hence HH. With the 12-hour
			// letter (hh) a time such as 12:30:00 would be read as 00:30:00.
			sdf = new SimpleDateFormat("EEE MMM d HH:mm:ss z yyyy", Locale.ENGLISH);
		} else {
			sdf = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);
		}

		try {
			final Date date = sdf.parse(tmpDateStr);
			return this.simpleDateFormat.format(date);
		} catch (final ParseException pe) {
			// Best effort: an unreadable date is reported and replaced by the empty String.
			this.logger.warn("Unable to read date: '" + tmpDateStr + "'.", pe);
			return "";
		}
	}

	/**
	 * Builds a new list in which the unit is appended, without any separator, to each of the given values.
	 * 
	 * @param values
	 *            the <code>List</code> of values; it is only read, never modified
	 * @param unit
	 *            the text to append to every value
	 * @return a new <code>List</code> containing, in the same order, each value followed by the unit
	 */
	protected static List<String> addUnitOnValues(final List<String> values, final String unit) {
		final List<String> withUnit = new ArrayList<String>();
		for (final ListIterator<String> cursor = values.listIterator(); cursor.hasNext();) {
			final String current = cursor.next();
			withUnit.add(current + unit);
		}
		return withUnit;
	}

}
