/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2011 CASSIDIAN
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.services.normaliser.tika;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.jws.WebService;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.CreativeCommons;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMimeKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.ow2.weblab.content.ContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.exception.WebLabUncheckedException;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.jaxb.XMLStringCleaner;
import org.ow2.weblab.core.extended.ontologies.DCTerms;
import org.ow2.weblab.core.extended.ontologies.DublinCore;
import org.ow2.weblab.core.extended.ontologies.RDFS;
import org.ow2.weblab.core.extended.properties.PropertiesLoader;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.helper.PoKHelperExtended;
import org.ow2.weblab.core.helper.RDFHelperFactory;
import org.ow2.weblab.core.helper.ResourceHelper;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.services.AccessDeniedException;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.ContentNotAvailableException;
import org.ow2.weblab.core.services.InsufficientResourcesException;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.ServiceNotConfiguredException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.UnsupportedRequestException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.springframework.core.io.ClassPathResource;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * The Tika extractor is quite simple since it does not handle the structure
 * of documents (sheets in Excel, paragraphs in Word, etc.). The structure
 * could have been represented as various MediaUnits.
 * 
 * @todo Maybe some properties should be extracted to a configuration file.
 */
@WebService(endpointInterface = "org.ow2.weblab.core.services.Analyser")
public class TikaExtractorService implements Analyser {

	private final static Log logger = LogFactory.getLog(TikaExtractorService.class);

	/**
	 * Properties file
	 */
	final public static String CONFIG_FILE = "tika-service.properties";
	public static final String BASE_URI_PROPERTY_NAME = "baseUri";
	public static final String REMOVE_COTNENT_PROPERTY_NAME = "removeContent";
	public static final String OVERRIDE_METADATA_PROPERTY_NAME = "overrideMetadata";
	public static final String XHTML_FOLDER_PROPERTY_NAME = "xhtmlFolder";
	public static final String XHTML_SAVE = "saveXhtml";

	/**
	 * The <code>BinaryFolderContentManager</code> to use
	 */
	protected ContentManager contentManager = null;

	/**
	 * The base URI to be used for creating predicates when annotating directly
	 * the metadata extracted using Tika. <br/>
	 * Default value is "http://weblab.eads.com/service/format/tika/"
	 */
	private static String baseUri = "http://weblab.eads.com/service/format/tika/";

	/**
	 * The prefix to be used in the generated RDF to represent the Base URI.
	 */
	private final static String BASE_PREFIX = "tika";

	/**
	 * Whether or not remove the temporary content file at the end of the
	 * process. <br/>
	 * Default value is true
	 */
	private static boolean removeContent = true;

	/**
	 * Whether or not metadata already presents on Resource are override by Tika
	 * extracted metadata. <br/>
	 * Default value is false
	 */
	private static boolean overrideMetadata = false;

	/**
	 * Whether or not generate HTML file
	 */
	private final static boolean GENERATE_HTML = false;

	/**
	 * Mime type value when no dublin core format property was find in document
	 */
	private final static String NO_MIME_TYPE_DETECTED_PROPERTY = "NO_MIME_TYPE";

	/**
	 * List of predicates that are representing dates and that need to be
	 * converted into the W3C date standard, i.e. the ISO8601.
	 */
	private static List<String> DATE_PREDS;

	static {
		List<String> tempList = new ArrayList<String>();
		tempList.add(DCTerms.CREATED);
		tempList.add(DCTerms.DATE);
		tempList.add(DCTerms.DATE_ACCEPTED);
		tempList.add(DCTerms.DATE_COPYRIGHTED);
		tempList.add(DCTerms.DATE_SUBMITTED);
		tempList.add(DCTerms.ISSUED);
		tempList.add(DCTerms.MODIFIED);
		tempList.add(DublinCore.DATE_PROPERTY_NAME);
		TikaExtractorService.DATE_PREDS = Collections.unmodifiableList(tempList);
	}

	/**
	 * The default and only constructor.
	 * 
	 * It load the content manager and initializes the list of date predicates.
	 */
	public TikaExtractorService() {
		this.contentManager = ContentManager.getInstance();

		if (this.contentManager == null) {
			throw new WebLabUncheckedException("Unable to load required " + "properties file for content management.");
		}

	}

	/**
	 * Extracts text and metadata from the native content of the document held
	 * by <code>args</code> and annotates the document with the metadata.
	 * 
	 * A first extraction is attempted with the format declared on the
	 * document. When it produces no <code>Text</code> unit, a second extraction
	 * is run with the auto detect parser forced.
	 * 
	 * @param args
	 *            The arguments; their resource must be a <code>Document</code>.
	 * @return A <code>ProcessReturn</code> wrapping the enriched document.
	 * @throws InvalidParameterException
	 *             If <code>args</code> does not contain a <code>Document</code>.
	 * @throws ContentNotAvailableException
	 *             If the native content cannot be retrieved or opened.
	 * @throws UnexpectedException
	 *             If the content cannot be parsed.
	 */
	public ProcessReturn process(ProcessArgs args) throws AccessDeniedException, ContentNotAvailableException, InsufficientResourcesException,
			InvalidParameterException, ServiceNotConfiguredException, UnexpectedException, UnsupportedRequestException {

		final Document document = TikaExtractorService.checkArgs(args);

		logger.info("Process the document " + document.getUri() + ".");

		// Get file from the native content URI and check existence and access.
		final File file;
		try {
			file = this.contentManager.readNativeContent(document);
		} catch (final WebLabCheckedException wlce) {
			// Keep the original failure as cause so that it is not lost for diagnosis.
			throw new ContentNotAvailableException("Tika service is unable to retrieve native content.", "Unable to retrieve content.", wlce);
		}

		// The map containing the metadata that will be annotated.
		final Map<String, List<String>> toAnnot = new HashMap<String, List<String>>();

		// Insert content length in metadata
		final List<String> extent = new ArrayList<String>();
		extent.add(String.valueOf(file.length()) + " bytes");
		toAnnot.put(DCTerms.EXTENT, extent);

		try {
			// The Tika wrapper extracting text and metadata.
			TikaExtractorService.extractTextAndMetadata(document, file, toAnnot, false);

			// If no text unit are extracted from document, try to extract once
			// more with auto detect parser
			if (ResourceUtil.getSelectedSubResources(document, Text.class).size() == 0) {
				logger.info("No texte unit extracted from document." + " Try to extract once more with auto detect parser");
				TikaExtractorService.extractTextAndMetadata(document, file, toAnnot, true);
			}
		} finally {
			// Remove the temporary file if necessary. Done in a finally block so
			// that a failed extraction does not leave the file behind.
			if (removeContent && !file.delete()) {
				logger.warn("Unable to delete temp file." + file.getAbsolutePath());
			}
		}

		// Annotates the document with content of the map.
		TikaExtractorService.annotate(document, toAnnot);

		// Creating the return wrapper.
		final ProcessReturn pr = new ProcessReturn();
		pr.setResource(document);

		logger.info("End of processing " + document.getUri() + " in Tika extractor");

		return pr;
	}

	/**
	 * Annotates <code>document</code> with the predicates and literals
	 * contained in <code>toAnnot</code>.
	 * 
	 * Unless the override mode is enabled, a predicate is only written when the
	 * document has neither a literal nor a resource value for it yet. When the
	 * commit fails, the failure is logged and the newly created annotation is
	 * removed from the document.
	 * 
	 * @param document
	 *            The Document to be annotated.
	 * @param toAnnot
	 *            The Map of predicate and their literal values. When it is
	 *            empty, only a warning is logged.
	 */
	protected static void annotate(Document document, final Map<String, List<String>> toAnnot) {
		if (toAnnot.isEmpty()) {
			logger.warn("No metadata extracted for document: " + document.getUri());
			return;
		}

		final Annotation annot = AnnotationFactory.createAndLinkAnnotation(document);
		final PoKHelperExtended ahe = RDFHelperFactory.getPoKHelperExtended(annot);
		final ResourceHelper rh = RDFHelperFactory.getResourceHelper(document);
		final String docUri = document.getUri();
		ahe.setAutoCommitMode(false);

		for (final Entry<String, List<String>> entry : toAnnot.entrySet()) {
			final String predicate = entry.getKey();
			// A property can be written if override mode is enabled or if the
			// document does not have any value for this predicate yet.
			final boolean writable = overrideMetadata
					|| (rh.getLitsOnPredSubj(docUri, predicate).size() == 0 && rh.getRessOnPredSubj(docUri, predicate).size() == 0);
			if (writable) {
				for (final String val : entry.getValue()) {
					ahe.createLitStat(docUri, predicate, val);
				}
			}
		}

		ahe.setNSPrefix(TikaExtractorService.BASE_PREFIX, baseUri);
		try {
			ahe.commit();
		} catch (final Exception e) {
			// Log the cause as well: without it the failure cannot be diagnosed.
			logger.error("An error happened during the commiting of annotation " + "changes. Remove the whole annotation.", e);
			logger.info("Failing metadata were: " + toAnnot);
			document.getAnnotation().remove(annot);
		}
	}

	/**
	 * Parses <code>file</code> with Tika. The extracted text is handed to a
	 * <code>MediaUnitContentHandler</code> built on <code>document</code> and
	 * the extracted metadata are added to <code>toAnnot</code>, which is then
	 * cleaned.
	 * 
	 * The parser registered for the Dublin Core format declared on the
	 * document is used when there is one; otherwise, or when
	 * <code>forceAutoDetectParser</code> is set, the auto detect parser is
	 * used.
	 * 
	 * @param document
	 *            The document receiving the extracted content.
	 * @param file
	 *            The content file to parse.
	 * @param toAnnot
	 *            The map of predicates and values to enrich.
	 * @param forceAutoDetectParser
	 *            Whether the format declared on the document must be ignored.
	 * @throws UnexpectedException
	 *             If the content cannot be read or parsed.
	 * @throws ContentNotAvailableException
	 *             If <code>file</code> cannot be opened.
	 */
	public static void extractTextAndMetadata(Document document, final File file, Map<String, List<String>> toAnnot, boolean forceAutoDetectParser)
			throws UnexpectedException, ContentNotAvailableException {
		TikaConfig tikaConfig;
		try {
			tikaConfig = getTikaConfig();
			logger.info("Custom tika configuration loaded successfully...");
		} catch (AccessDeniedException e) {
			logger.warn("Unable to load custom tika configuration, load default...");
			logger.debug("Cause of the custom tika configuration failure.", e);
			tikaConfig = TikaConfig.getDefaultConfig();
		}

		/*
		 * Try to get mimeType in media unit if forceAutoDetectParser condition
		 * is false
		 */
		final ResourceHelper h = RDFHelperFactory.getResourceHelper(document);
		final List<String> formatAnnots = h.getLitsOnPredSubj(document.getUri(), DublinCore.FORMAT_PROPERTY_NAME);
		final String mimeType;
		if (!formatAnnots.isEmpty() && !forceAutoDetectParser) {
			mimeType = formatAnnots.get(0);
		} else {
			mimeType = TikaExtractorService.NO_MIME_TYPE_DETECTED_PROPERTY;
		}

		logger.info("Mime type detected in Resource: " + mimeType);

		// The parsers are keyed by MediaType, so the declared format has to be
		// parsed before the lookup; looking up the raw String never matches.
		final Map<MediaType, Parser> parserList = tikaConfig.getParsers();
		Parser parser = null;
		if (mimeType != null && !TikaExtractorService.NO_MIME_TYPE_DETECTED_PROPERTY.equals(mimeType)) {
			final MediaType type = MediaType.parse(mimeType);
			if (type != null && parserList.containsKey(type)) {
				parser = tikaConfig.getParser(type);
			}
		}
		if (parser == null) {
			parser = new AutoDetectParser(tikaConfig);
		}

		final Metadata metadata = new Metadata();

		final InputStream stream;
		try {
			stream = new FileInputStream(file);
		} catch (final FileNotFoundException fnfe) {
			throw new ContentNotAvailableException("Unable to open stream on content file.", fnfe.getMessage(), fnfe);
		}

		// FIXME The optional XHTML export (driven by the saveXhtml and
		// xhtmlFolder properties and relying on isHtml and
		// getXmlContentHandler) is disabled until it is updated to the new
		// content management system.

		try {
			parser.parse(stream, getMediaUnitContentHandler(document), metadata, new ParseContext());
			TikaExtractorService.fillMapWithMetadata(toAnnot, metadata);
		} catch (IOException e) {
			logger.error("Document stream could not be read.", e);
			throw new UnexpectedException("Document stream could not be read.", e.getMessage(), e);
		} catch (SAXException e) {
			logger.error("SAX events could not be processed.", e);
			throw new UnexpectedException("SAX events could not be processed.", e.getMessage(), e);
		} catch (TikaException e) {
			logger.error("Document could not be parsed.", e);
			throw new UnexpectedException("Document could not be parsed.", e.getMessage(), e);
		} finally {
			// Always release the file handle, including on failure; an open
			// handle can also prevent the later deletion of the file.
			try {
				stream.close();
			} catch (final IOException ioe) {
				logger.warn("Unable to close the stream on content file " + file.getAbsolutePath(), ioe);
			}
		}

		TikaExtractorService.cleanMap(toAnnot);
	}

	/**
	 * Detects the media type of <code>stream</code> with the MIME repository of
	 * <code>tikaConfig</code> and logs it.
	 * 
	 * @param tikaConfig
	 *            The configuration providing the MIME repository.
	 * @param stream
	 *            The content to inspect; it is wrapped in a buffered stream.
	 * @return <code>true</code> if the detected type is one of text/html,
	 *         application/xhtml+xml, application/vnd.wap.xhtml+xml or
	 *         application/x-asp.
	 * @throws IOException
	 *             If the detection fails to read the stream.
	 */
	private static boolean isHtml(TikaConfig tikaConfig, InputStream stream) throws IOException {
		final MediaType detected = tikaConfig.getMimeRepository().detect(new BufferedInputStream(stream), new Metadata());
		final String detectedName = detected.toString();
		logger.info("Mime Type detected in Resource: " + detectedName);
		final List<String> htmlTypes = Arrays.asList("text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml", "application/x-asp");
		return htmlTypes.contains(detectedName);
	}

	/**
	 * Builds the handler receiving the SAX events of the parsed document.
	 * 
	 * The body handler is a <code>BodyContentHandler</code> created with -1 as
	 * argument, unless <code>GENERATE_HTML</code> is enabled, in which case an
	 * indented HTML serialiser writing to <code>System.out</code> is used (with
	 * a fall back on the <code>BodyContentHandler</code> if it cannot be
	 * created). HTML handler is not used for the moment.
	 * 
	 * @param document
	 *            The document handed to the <code>MediaUnitContentHandler</code>.
	 * @return The handler wrapping the body handler and the document.
	 */
	private static MediaUnitContentHandler getMediaUnitContentHandler(Document document) throws TransformerFactoryConfigurationError {
		ContentHandler bodyHandler = null;
		if (TikaExtractorService.GENERATE_HTML) {
			final SAXTransformerFactory factory = (SAXTransformerFactory) TransformerFactory.newInstance();
			try {
				final TransformerHandler htmlHandler = factory.newTransformerHandler();
				htmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
				htmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
				htmlHandler.setResult(new StreamResult(System.out));
				bodyHandler = htmlHandler;
			} catch (TransformerConfigurationException e1) {
				bodyHandler = null;
			}
		}
		if (bodyHandler == null) {
			bodyHandler = new BodyContentHandler(-1);
		}
		return new MediaUnitContentHandler(bodyHandler, document);
	}

	/**
	 * Creates a SAX handler serialising the events it receives as indented XML
	 * into <code>xhtmlFile</code>.
	 * 
	 * @param xhtmlFile
	 *            The file receiving the serialised XML.
	 * @return The handler to give to a parser.
	 * @throws TransformerConfigurationException
	 *             If the transformer handler cannot be created.
	 */
	private static ContentHandler getXmlContentHandler(File xhtmlFile) throws TransformerConfigurationException {
		final TransformerHandler xmlWriter = ((SAXTransformerFactory) TransformerFactory.newInstance()).newTransformerHandler();
		xmlWriter.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
		xmlWriter.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
		xmlWriter.setResult(new StreamResult(xhtmlFile));
		return xmlWriter;
	}

	/**
	 * Modifies the <code>Map</code> in parameter: converts the values of the
	 * date predicates into W3C ISO8601 standard format, replaces every value
	 * by its XML recommended form, and removes blank values and the
	 * predicates left without any value.
	 * 
	 * A value is dropped when it is <code>null</code>, blank, or blank once
	 * cleaned, so that no empty literal is annotated.
	 * 
	 * @param toAnnot
	 *            The <code>Map</code> of predicates and values to be cleaned.
	 *            Its lists must support removal and replacement of elements.
	 */
	protected static void cleanMap(Map<String, List<String>> toAnnot) {
		for (final String datePred : TikaExtractorService.DATE_PREDS) {
			final List<String> rawDates = toAnnot.get(datePred);
			if (rawDates != null) {
				final List<String> cleanedDates = new ArrayList<String>(rawDates.size());
				for (final String date : rawDates) {
					// Unreadable dates come back as "" and are removed below.
					cleanedDates.add(TikaExtractorService.convertToISO8601Date(date));
				}
				toAnnot.put(datePred, cleanedDates);
			}
		}

		final Set<String> predToRemove = new HashSet<String>();
		for (final Entry<String, List<String>> entry : toAnnot.entrySet()) {
			final List<String> vals = entry.getValue();
			if (vals != null) {
				final ListIterator<String> listIt = vals.listIterator();
				while (listIt.hasNext()) {
					final String raw = listIt.next();
					if (raw == null || raw.trim().equals("")) {
						listIt.remove();
						continue;
					}
					// The emptiness test is done on the cleaned value too,
					// since cleaning may leave nothing to annotate.
					final String cleaned = XMLStringCleaner.getXMLRecommendedString(raw);
					if (cleaned == null || cleaned.trim().equals("")) {
						listIt.remove();
					} else {
						listIt.set(cleaned);
					}
				}
			}
			if (vals == null || vals.isEmpty()) {
				predToRemove.add(entry.getKey());
			}
		}

		// Removal is deferred to avoid modifying the map while iterating on it.
		toAnnot.keySet().removeAll(predToRemove);
	}

	/**
	 * The method converts the metadata extracted by Tika into a Map of
	 * predicates with their values that can be annotated.
	 * 
	 * Known Tika properties are mapped on DublinCore, DCTerms or RDFS
	 * predicates. Depending on the property, the values are either appended to
	 * the values already present for the predicate or replace them. Any other
	 * metadata gets a predicate built from the base URI and its sanitised
	 * name. All values are trimmed.
	 * 
	 * @param toAnnot
	 *            The map of predicate values to fill; it may already contain
	 *            entries.
	 * @param metadata
	 *            The dirty map of metadata extracted by Tika.
	 * @return <code>toAnnot</code>, the map of RDF predicates and their literal
	 *         values.
	 */
	protected static Map<String, List<String>> fillMapWithMetadata(Map<String, List<String>> toAnnot, final Metadata metadata) {
		for (final String name : metadata.names()) {
			final List<String> values = TikaExtractorService.readTrimmedValues(metadata, name);

			if (name.equals(org.apache.tika.metadata.DublinCore.LANGUAGE)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.LANGUAGE_PROPERTY_NAME, values);
			} else if (name.contains(DublinCore.PUBLISHER_PROPERTY_NAME)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.PUBLISHER_PROPERTY_NAME, values);
			} else if (name.equals(MSOffice.AUTHOR) || name.equals(org.apache.tika.metadata.DublinCore.CREATOR)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.CREATOR_PROPERTY_NAME, values);
			} else if (name.equals(HttpHeaders.CONTENT_LOCATION) || name.equals(HttpHeaders.LOCATION)
					|| name.equals(org.apache.tika.metadata.DublinCore.SOURCE)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.SOURCE_PROPERTY_NAME, values);
			} else if (name.equals(HttpHeaders.CONTENT_TYPE) || name.equals(org.apache.tika.metadata.DublinCore.FORMAT)
					|| name.equals(TikaMimeKeys.MIME_TYPE_MAGIC)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.FORMAT_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.CONTRIBUTOR) || name.equals(MSOffice.LAST_AUTHOR) || name.equals(MSOffice.COMPANY)
					|| name.equals(MSOffice.MANAGER)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.DATE) || name.equals(MSOffice.LAST_PRINTED)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.DATE_PROPERTY_NAME, values);
			} else if (name.equals(HttpHeaders.LAST_MODIFIED) || name.equals(org.apache.tika.metadata.DublinCore.MODIFIED) || name.equals(MSOffice.LAST_SAVED)) {
				TikaExtractorService.mergeValues(toAnnot, DCTerms.MODIFIED, values);
			} else if (name.equals(MSOffice.CHARACTER_COUNT) || name.equals(MSOffice.CHARACTER_COUNT_WITH_SPACES) || name.equals(MSOffice.PAGE_COUNT)
					|| name.equals(MSOffice.WORD_COUNT) || name.equals(MSOffice.PARAGRAPH_COUNT)) {
				// Counts are all stored as extents, so a unit tells them apart.
				final String unit;
				if (name.equals(MSOffice.CHARACTER_COUNT)) {
					unit = " characters";
				} else if (name.equals(MSOffice.CHARACTER_COUNT_WITH_SPACES)) {
					unit = " characters (with spaces)";
				} else if (name.equals(MSOffice.PAGE_COUNT)) {
					unit = " pages";
				} else if (name.equals(MSOffice.WORD_COUNT)) {
					unit = " words";
				} else {
					unit = " paragraphs";
				}
				TikaExtractorService.mergeValues(toAnnot, DCTerms.EXTENT, TikaExtractorService.addUnitOnValues(values, unit));
			} else if (name.equals(MSOffice.KEYWORDS) || name.equals(org.apache.tika.metadata.DublinCore.SUBJECT)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.SUBJECT_PROPERTY_NAME, values);
			} else if (name.equals(CreativeCommons.LICENSE_LOCATION) || name.equals(CreativeCommons.LICENSE_URL)) {
				TikaExtractorService.mergeValues(toAnnot, DCTerms.LICENSE, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.DESCRIPTION) || name.equals(MSOffice.NOTES) || name.equals(MSOffice.CATEGORY)) {
				TikaExtractorService.mergeValues(toAnnot, DublinCore.DESCRIPTION_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.IDENTIFIER)) {
				toAnnot.put(DublinCore.IDENTIFIER_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.PUBLISHER)) {
				toAnnot.put(DublinCore.PUBLISHER_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.RELATION)) {
				toAnnot.put(DublinCore.RELATION_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.RIGHTS)) {
				toAnnot.put(DublinCore.RIGHTS_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.TITLE)) {
				toAnnot.put(DublinCore.TITLE_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.TYPE)) {
				toAnnot.put(DublinCore.TYPE_PROPERTY_NAME, values);
			} else if (name.equals(MSOffice.CREATION_DATE)) {
				toAnnot.put(DCTerms.CREATED, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.COVERAGE)) {
				toAnnot.put(DublinCore.COVERAGE_PROPERTY_NAME, values);
			} else if (name.equals(MSOffice.COMMENTS)) {
				toAnnot.put(RDFS.COMMENT, values);
			} else {
				// No dedicated mapping: build a predicate from the base URI and
				// the name stripped of the characters unwanted in a URI. The
				// degree sign is written as \u00B0 so that the source does not
				// depend on the file encoding.
				final String cleanedName = name.replace('(', '_').replace(' ', '_').replace(')', '_').replace("N\u00B0", "N_").replace("n\u00B0", "n.")
						.replace('$', '_').replace('/', '_').replace('\\', '_').replace('#', '_').replace('\'', '_').replace('.', '_').replace(',', '_')
						.replace('?', '_').replace('!', '_').replace('@', '_');
				final String candidate = baseUri + cleanedName;
				try {
					final String predicate = new URL(candidate).toURI().toString();
					toAnnot.put(predicate, new ArrayList<String>(values));
				} catch (final URISyntaxException urise) {
					logger.warn("Unable to transform the property '" + name + "' into a predicate (" + candidate + ")", urise);
				} catch (final MalformedURLException murle) {
					logger.warn("Unable to transform the property '" + name + "' into a predicate (" + candidate + ")", murle);
				}
			}
		}

		return toAnnot;
	}

	/**
	 * Reads the non null values of the metadata <code>name</code>, trimmed.
	 */
	private static List<String> readTrimmedValues(final Metadata metadata, final String name) {
		final List<String> values = new ArrayList<String>();
		if (metadata.isMultiValued(name)) {
			for (final String value : metadata.getValues(name)) {
				if (value != null) {
					values.add(value.trim());
				}
			}
		} else {
			final String single = metadata.get(name);
			if (single != null) {
				values.add(single.trim());
			}
		}
		return values;
	}

	/**
	 * Appends <code>values</code> to the list already mapped to
	 * <code>predicate</code>, or maps <code>values</code> to it if there is none.
	 */
	private static void mergeValues(final Map<String, List<String>> toAnnot, final String predicate, final List<String> values) {
		final List<String> existing = toAnnot.get(predicate);
		if (existing == null) {
			toAnnot.put(predicate, values);
		} else {
			existing.addAll(values);
		}
	}

	/**
	 * Validates the arguments of the process method.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> of the process method.
	 * @return The <code>Document</code> that must be contained by
	 *         <code>args</code>.
	 * @throws InvalidParameterException
	 *             If <code>args</code> is <code>null</code>, or if its resource
	 *             is <code>null</code> or is not a <code>Document</code>.
	 */
	protected static Document checkArgs(final ProcessArgs args) throws InvalidParameterException {
		if (args == null) {
			throw new InvalidParameterException("Invalid parameter from tika service.", "ProcessArgs was null.");
		}
		final Resource res = args.getResource();
		if (res == null) {
			throw new InvalidParameterException("Invalid parameter from tika service.", "Resource of ProcessArgs was null.");
		}
		if (!(res instanceof Document)) {
			// The message names the type that is really expected by this check.
			throw new InvalidParameterException("Invalid parameter from tika service.", "Resource of ProcessArgs was not a Document, but a "
					+ res.getClass().getName() + ".");
		}
		return (Document) res;
	}

	/**
	 * Converts a date extracted by Tika into an ISO8601 calendar date
	 * (<code>yyyy-MM-dd</code>), formatted in the default time zone.
	 * 
	 * @param inDateStr
	 *            The input date. Values starting with a digit are read as
	 *            ISO8601, either <code>yyyy-MM-dd'T'HH:mm:ss</code> or the
	 *            date only <code>yyyy-MM-dd</code>. Other values without
	 *            comma are read in the Office format, e.g.:
	 *            <code>Mon Jan 05 16:53:20 CET 2009</code>, and values with a
	 *            comma as <code>EEE, d MMM yyyy HH:mm:ss Z</code>.
	 * @return The date in ISO8601 format, or the empty String if
	 *         <code>inDateStr</code> is <code>null</code>, blank or unreadable
	 *         (the latter is logged as a warning).
	 */
	protected static String convertToISO8601Date(String inDateStr) {
		if (inDateStr == null) {
			return "";
		}
		final String trimmed = inDateStr.trim();
		if (trimmed.equals("")) {
			return "";
		}

		final String[] patterns;
		if (Character.isDigit(trimmed.charAt(0))) {
			patterns = new String[] { "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd" };
		} else if (!trimmed.contains(",")) {
			// HH (0-23) and not hh (1-12): the hours are on 24 hours, and with
			// hh a time such as 12:30 is read as 00:30, which can shift the day.
			patterns = new String[] { "EEE MMM d HH:mm:ss z yyyy" };
		} else {
			patterns = new String[] { "EEE, d MMM yyyy HH:mm:ss Z" };
		}

		Date date = null;
		ParseException firstFailure = null;
		for (final String pattern : patterns) {
			try {
				// A new formatter each time: SimpleDateFormat is not thread safe.
				date = new SimpleDateFormat(pattern, Locale.ENGLISH).parse(trimmed);
				break;
			} catch (final ParseException pe) {
				if (firstFailure == null) {
					firstFailure = pe;
				}
			}
		}

		if (date == null) {
			logger.warn("Unable to read date: '" + trimmed + "'.", firstFailure);
			return "";
		}
		// Fixed locale so that the output does not depend on the default
		// locale of the platform.
		return new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH).format(date);
	}

	/**
	 * Builds a new list in which <code>unit</code> is appended to each value.
	 * The list in parameter is left untouched.
	 * 
	 * @param values
	 *            the <code>List</code> of values
	 * @param unit
	 *            the suffix appended to every value
	 * @return a new <code>List</code> of the values followed by the unit, in
	 *         the same order
	 */
	protected static List<String> addUnitOnValues(List<String> values, String unit) {
		final List<String> suffixed = new ArrayList<String>(values.size());
		final ListIterator<String> cursor = values.listIterator();
		while (cursor.hasNext()) {
			suffixed.add(cursor.next() + unit);
		}
		return suffixed;
	}

	/**
	 * Loads the Tika configuration from the <code>tika-config.xml</code> class
	 * path resource.
	 * 
	 * The resource is read through a stream and not as a <code>File</code>, so
	 * that it can also be loaded when it is packaged inside a jar.
	 * 
	 * @return The Tika configuration described by the resource.
	 * @throws AccessDeniedException
	 *             If the resource cannot be read or is not a valid Tika
	 *             configuration.
	 */
	protected static TikaConfig getTikaConfig() throws AccessDeniedException {
		InputStream configStream = null;
		try {
			configStream = new ClassPathResource("tika-config.xml").getInputStream();
			return new TikaConfig(configStream);
		} catch (TikaException e) {
			throw new AccessDeniedException("Unable to load default Tika Config.", e.getMessage(), e);
		} catch (IOException e) {
			throw new AccessDeniedException("Unable to load default Tika Config.", e.getMessage(), e);
		} catch (SAXException e) {
			throw new AccessDeniedException("Unable to load default Tika Config.", e.getMessage(), e);
		} finally {
			if (configStream != null) {
				try {
					configStream.close();
				} catch (final IOException ioe) {
					// The configuration outcome is already decided; a failing
					// close is only worth a trace.
					logger.debug("Unable to close the stream on tika-config.xml.", ioe);
				}
			}
		}
	}

	/**
	 * Overrides the default base URI, content removal flag and metadata
	 * override flag with the values found in the properties file of the
	 * service. A property that is missing or blank leaves the current value
	 * untouched.
	 * 
	 * NOTE(review): nothing in this class calls this method, so the defaults
	 * stay in effect unless a caller invokes it; to be confirmed.
	 */
	protected void loadTikaServiceProps() {
		final Map<String, String> props = PropertiesLoader.loadProperties(TikaExtractorService.CONFIG_FILE);

		// A property is applied when it is NOT empty; the previous test was
		// inverted and only accepted empty values.
		final String configuredBaseUri = props.get(BASE_URI_PROPERTY_NAME);
		if (configuredBaseUri != null && !configuredBaseUri.trim().isEmpty()) {
			baseUri = configuredBaseUri.trim();
		}

		final String configuredRemoveContent = props.get(REMOVE_COTNENT_PROPERTY_NAME);
		if (configuredRemoveContent != null && !configuredRemoveContent.trim().isEmpty()) {
			removeContent = Boolean.parseBoolean(configuredRemoveContent.trim());
		}

		final String configuredOverride = props.get(OVERRIDE_METADATA_PROPERTY_NAME);
		if (configuredOverride != null && !configuredOverride.trim().isEmpty()) {
			overrideMetadata = Boolean.parseBoolean(configuredOverride.trim());
		}
	}

}
