/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.services.normaliser.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import javax.jws.WebService;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.CreativeCommons;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMimeKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.ow2.weblab.content.binary.BinaryFolderContentManager;
import org.springframework.core.io.ClassPathResource;
import org.weblab_project.core.exception.WebLabCheckedException;
import org.weblab_project.core.exception.WebLabUncheckedException;
import org.weblab_project.core.factory.AnnotationFactory;
import org.weblab_project.core.helper.PoKHelperExtended;
import org.weblab_project.core.helper.RDFHelperFactory;
import org.weblab_project.core.helper.ResourceHelper;
import org.weblab_project.core.model.Annotation;
import org.weblab_project.core.model.ComposedUnit;
import org.weblab_project.core.model.Resource;
import org.weblab_project.core.model.text.Text;
import org.weblab_project.core.ontologies.DCTerms;
import org.weblab_project.core.ontologies.DublinCore;
import org.weblab_project.core.ontologies.RDFS;
import org.weblab_project.core.util.ComposedUnitUtil;
import org.weblab_project.services.analyser.Analyser;
import org.weblab_project.services.analyser.ProcessException;
import org.weblab_project.services.analyser.types.ProcessArgs;
import org.weblab_project.services.analyser.types.ProcessReturn;
import org.weblab_project.services.exception.WebLabException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Analyser service that normalises a document using Apache Tika.
 * 
 * The native content of the received <code>ComposedUnit</code> is parsed by
 * Tika through a <code>MediaUnitContentHandler</code> and the metadata
 * reported by Tika are annotated on the <code>ComposedUnit</code>.
 * 
 * Tika extractor is quite simple since it does not deal with the structure of
 * documents (sheets in Excel, paragraphs in Word, etc.). The structure might
 * have been represented as various MediaUnits.
 * 
 * @todo Maybe some properties shall be extracted to a configuration file.
 */
@WebService(endpointInterface = "org.weblab_project.services.analyser.Analyser")
public class TikaExtractorService implements Analyser {

	private final static Log logger = LogFactory.getLog(TikaExtractorService.class);

	/**
	 * The <code>BinaryFolderContentManager</code> to use
	 */
	private final BinaryFolderContentManager contentManager;

	/**
	 * The base URI to be used for creating predicates when annotating directly
	 * the metadata extracted using Tika.
	 */
	private static final String BASE_URI = "http://weblab.eads.com/service/format/tika/";

	/**
	 * The prefix to be used in the generated RDF to represent the Base URI.
	 */
	private static final String BASE_PREFIX = "tika";

	/**
	 * Whether or not remove the temporary content file at the end of the
	 * process.
	 */
	private final static boolean REMOVE_CONTENT = true;

	/**
	 * Whether or not generate HTML file
	 */
	private final static boolean GENERATE_HTML = false;

	/**
	 * Mime type value used when no Dublin Core format property was found on
	 * the document.
	 */
	private final static String NO_MIME_TYPE_DETECTED_PROPERTY = "NO_MIME_TYPE";

	/**
	 * Error identifier used for every content, parsing or configuration
	 * failure.
	 */
	private static final String CONTENT_ERROR_ID = "E3";

	/**
	 * Pattern of dates that are already in ISO8601 format, with a time part.
	 */
	private static final String ISO8601_DATE_TIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ss";

	/**
	 * Pattern of the dates that are annotated. It is also accepted as input so
	 * that converting an already converted date gives the same date back.
	 */
	private static final String ISO8601_DATE_PATTERN = "yyyy-MM-dd";

	/**
	 * Pattern of Office dates, e.g. <code>Mon Jan 05 16:53:20 CET 2009</code>.
	 * Hours are on a 24 hours clock (<code>HH</code>).
	 */
	private static final String OFFICE_DATE_PATTERN = "EEE MMM d HH:mm:ss z yyyy";

	/**
	 * List of predicates that are representing dates and that need to be
	 * converted into the W3C date standard, i.e. the ISO8601.
	 */
	private static final List<String> DATE_PREDS = Collections
			.unmodifiableList(Arrays.asList(DCTerms.CREATED, DCTerms.DATE,
					DCTerms.DATE_ACCEPTED, DCTerms.DATE_COPYRIGHTED,
					DCTerms.DATE_SUBMITTED, DCTerms.ISSUED, DCTerms.MODIFIED,
					DublinCore.DATE_PROPERTY_NAME));

	/**
	 * The default and only constructor.
	 * 
	 * It loads the content manager.
	 * 
	 * @throws WebLabUncheckedException
	 *             If the content manager instance is <code>null</code>.
	 */
	public TikaExtractorService() {
		this.contentManager = BinaryFolderContentManager.getInstance();

		if (this.contentManager == null) {
			throw new WebLabUncheckedException("Unable to load required "
					+ "properties file for content management.");
		}
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see
	 * org.weblab_project.services.analyser.Analyser#process(org.weblab_project
	 * .services.analyser.types.ProcessArgs)
	 */
	public ProcessReturn process(ProcessArgs args) throws ProcessException {

		ComposedUnit cu = TikaExtractorService.checkArgs(args);

		logger.info(
				"Process the document " + cu.getUri() + ".");

		// Get file from the native content URI and check existence and access.
		File file;
		try {
			file = this.contentManager.getNativeFileFromResource(cu);
		} catch (final WebLabCheckedException wlce) {
			throw TikaExtractorService.newProcessException(
					"Unable to retrieve content.", "Content not available",
					wlce);
		}

		// The map containing the metadata that will be annotated.
		Map<String, List<String>> toAnnot = new HashMap<String, List<String>>();

		// Insert content length in metadata
		List<String> values = new ArrayList<String>();
		values.add(String.valueOf(file.length()) + " bytes");
		toAnnot.put(DCTerms.EXTENT, values);

		// The Tika wrapper extracting text and metadata.
		TikaExtractorService.extractTextAndMetadata(cu, file, toAnnot, false);

		// If no text unit are extracted from document, try to extract once
		// more with auto detect parser
		if (ComposedUnitUtil.getSelectedSubMediaUnits(cu, Text.class).size() == 0) {
			TikaExtractorService.extractTextAndMetadata(cu, file, toAnnot, true);
		}

		// Remove temporary file if necessary
		if (TikaExtractorService.REMOVE_CONTENT) {
			if (!file.delete()) {
				logger.warn(
						"Unable to delete temp file.");
			}
		}

		// Annotates the document with content of the map.
		TikaExtractorService.annotate(cu, toAnnot);

		// Creating the return wrapper.
		ProcessReturn pr = new ProcessReturn();
		pr.setResource(cu);

		logger.info(
				"End of processing " + cu.getUri() + " in Tika extractor");

		return pr;
	}

	/**
	 * Annotates <code>cu</code> with the predicates and literals contained in
	 * <code>toAnnot</code>. If the commit of the statements fails, the whole
	 * annotation is removed from <code>cu</code>.
	 * 
	 * @param cu
	 *            The Composed Unit to be annotated.
	 * @param toAnnot
	 *            The Map of predicate and their literal values.
	 */
	protected static void annotate(ComposedUnit cu,
			final Map<String, List<String>> toAnnot) {
		if (toAnnot.isEmpty()) {
			logger.warn(
					"No metadata extracted for document: " + cu.getUri());
			return;
		}

		Annotation annot = AnnotationFactory.createAndLinkAnnotation(cu);
		PoKHelperExtended ahe = RDFHelperFactory.getPoKHelperExtended(annot);
		ahe.setAutoCommitMode(false);
		for (Entry<String, List<String>> entry : toAnnot.entrySet()) {
			for (final String val : entry.getValue()) {
				ahe.createLitStat(cu.getUri(), entry.getKey(), val);
			}
		}
		ahe.setNSPrefix(TikaExtractorService.BASE_PREFIX,
				TikaExtractorService.BASE_URI);
		try {
			ahe.commit();
		} catch (final Exception e) {
			// Keep the cause in the log, otherwise the failure cannot be
			// diagnosed.
			logger.error(
					"An error happened during the commiting of annotation "
							+ "changes. Remove the whole annotation.", e);
			logger.info(
					"Failing metadata were: " + toAnnot);
			cu.getAnnotation().remove(annot);
		}
	}

	/**
	 * Parses <code>file</code> with Tika. The SAX events are sent to a
	 * <code>MediaUnitContentHandler</code> built on <code>cu</code> and the
	 * metadata reported by Tika are added to <code>toAnnot</code>, which is
	 * then cleaned using {@link #cleanMap(Map)}.
	 * 
	 * The parser is selected from the first Dublin Core format annotated on
	 * <code>cu</code> when the Tika configuration has a parser for it. In any
	 * other case an <code>AutoDetectParser</code> is used.
	 * 
	 * @param cu
	 *            The Composed Unit whose content is parsed.
	 * @param file
	 *            The file containing the native content of <code>cu</code>.
	 * @param toAnnot
	 *            The map of predicates and values to be completed.
	 * @param forceAutoDetectParser
	 *            Whether to ignore the format annotated on <code>cu</code> and
	 *            use the <code>AutoDetectParser</code>.
	 * @throws ProcessException
	 *             If the file cannot be opened, read or parsed.
	 */
	public static void extractTextAndMetadata(ComposedUnit cu, final File file,
			Map<String, List<String>> toAnnot, boolean forceAutoDetectParser) throws ProcessException {
		TikaConfig tikaConfig;
		try {
			tikaConfig = getTikaConfig();
			logger.info("Custom tika configuration loaded successfully...");
		} catch (ProcessException e) {
			logger.warn("Unable to load custom tika configuration, load default...");
			logger.debug("Cause of the custom tika configuration failure.", e);
			tikaConfig = TikaConfig.getDefaultConfig();
		}

		/*
		 * Try to get mimeType in media unit if forceAutoDetectParser condition
		 * is false
		 */
		ResourceHelper h = RDFHelperFactory.getResourceHelper(cu);
		List<String> formatAnnots = h.getLitsOnPredSubj(cu.getUri(), DublinCore.FORMAT_PROPERTY_NAME);
		String mimeType;
		if (formatAnnots.size() > 0 && !forceAutoDetectParser) {
			mimeType = formatAnnots.get(0);
		} else {
			mimeType = TikaExtractorService.NO_MIME_TYPE_DETECTED_PROPERTY;
		}

		logger.info("Mime type detected in Resource: " + mimeType);

		Map<String, Parser> parserList = tikaConfig.getParsers();
		Parser parser;
		if (mimeType.equals(TikaExtractorService.NO_MIME_TYPE_DETECTED_PROPERTY) || !parserList.containsKey(mimeType)) {
			parser = new AutoDetectParser(tikaConfig);
		} else {
			parser = tikaConfig.getParser(mimeType);
		}

		Metadata metadata = new Metadata();

		final InputStream stream;
		try {
			stream = new FileInputStream(file);
		} catch (final FileNotFoundException fnfe) {
			throw TikaExtractorService.newProcessException(
					"Unable to open stream on content file.",
					"Content not available.", fnfe);
		}

		// From here the stream is open: whatever happens it has to be closed.
		try {
			ContentHandler bodyHandler = TikaExtractorService.createBodyHandler();
			MediaUnitContentHandler mediaUnitHandler = new MediaUnitContentHandler(
					bodyHandler, cu);
			try {
				parser.parse(stream, mediaUnitHandler, metadata, new ParseContext());
			} catch (IOException e) {
				logger.error("Document stream could not be read.", e);
				throw TikaExtractorService.newProcessException(
						"Document stream could not be read.",
						"Content not available.", e);
			} catch (SAXException e) {
				logger.error("SAX events could not be processed.", e);
				throw TikaExtractorService.newProcessException(
						"SAX events could not be processed.", "SAX Error.", e);
			} catch (TikaException e) {
				logger.error("Document could not be parsed.", e);
				throw TikaExtractorService.newProcessException(
						"Document could not be parsed.", "Parse error.", e);
			}
		} finally {
			try {
				stream.close();
			} catch (final IOException ioe) {
				// The content has already been read (or the parsing already
				// failed): a close failure must not hide that outcome.
				logger.warn("Unable to close stream on content file.", ioe);
			}
		}

		TikaExtractorService.fillMapWithMetadata(toAnnot, metadata);
		TikaExtractorService.cleanMap(toAnnot);
	}

	/**
	 * Creates the handler receiving the body of the parsed document.
	 * 
	 * HTML handler is not used for the moment: unless
	 * <code>GENERATE_HTML</code> is switched on, a plain
	 * <code>BodyContentHandler</code> is returned.
	 * 
	 * @return The handler to be wrapped by the
	 *         <code>MediaUnitContentHandler</code>.
	 */
	private static ContentHandler createBodyHandler() {
		if (!TikaExtractorService.GENERATE_HTML) {
			return new BodyContentHandler();
		}
		try {
			SAXTransformerFactory factory = (SAXTransformerFactory) TransformerFactory
					.newInstance();
			TransformerHandler htmlHandler = factory.newTransformerHandler();
			htmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
			htmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
			htmlHandler.setResult(new StreamResult(System.out));
			return htmlHandler;
		} catch (TransformerConfigurationException tce) {
			logger.warn("Unable to create the HTML handler, use the plain one.", tce);
			return new BodyContentHandler();
		}
	}

	/**
	 * Builds the <code>ProcessException</code> thrown for content, parsing and
	 * configuration failures.
	 * 
	 * @param message
	 *            The message of the exception.
	 * @param errorMessage
	 *            The error message of the wrapped <code>WebLabException</code>.
	 * @param cause
	 *            The original failure.
	 * @return The exception to be thrown.
	 */
	private static ProcessException newProcessException(final String message,
			final String errorMessage, final Exception cause) {
		WebLabException wle = new WebLabException();
		wle.setErrorId(TikaExtractorService.CONTENT_ERROR_ID);
		wle.setErrorMessage(errorMessage);
		return new ProcessException(message, wle, cause);
	}

	/**
	 * Modify the <code>Map</code> in parameter. Convert dates into W3C ISO8601
	 * standard format and remove empty properties.
	 * 
	 * Calling the method again on an already cleaned map leaves it unchanged,
	 * which matters since a same map may go through two extractions.
	 * 
	 * @param toAnnot
	 *            The <code>Map</code> of predicates and values to be cleaned
	 *            from <code>null</code> or blank values and empty lists, and
	 *            whose dates are converted into W3C ISO8601 standard format.
	 */
	protected static void cleanMap(Map<String, List<String>> toAnnot) {
		for (final String datePred : TikaExtractorService.DATE_PREDS) {
			final List<String> rawDates = toAnnot.get(datePred);
			if (rawDates == null) {
				continue;
			}
			List<String> cleanedDates = new ArrayList<String>(rawDates.size());
			for (final String date : rawDates) {
				// Unreadable dates become empty and are removed below.
				cleanedDates.add(TikaExtractorService.convertToISO8601Date(date));
			}
			toAnnot.put(datePred, cleanedDates);
		}

		// Keys cannot be removed while iterating on the entry set: they are
		// collected first.
		Set<String> predToRemove = new HashSet<String>();
		for (Entry<String, List<String>> entry : toAnnot.entrySet()) {
			ListIterator<String> listIt = entry.getValue().listIterator();
			while (listIt.hasNext()) {
				final String val = listIt.next();
				if (val == null || val.trim().equals("")) {
					listIt.remove();
				}
			}
			if (entry.getValue().isEmpty()) {
				predToRemove.add(entry.getKey());
			}
		}

		for (final String keyToRemove : predToRemove) {
			toAnnot.remove(keyToRemove);
		}
	}

	/**
	 * The method converts the metadata extracted by Tika into a Map of
	 * predicates with their values that can be annotated.
	 * 
	 * It can map some Tika properties with DublinCore and DCTerms ones and for
	 * any metadata it also creates a dirty predicate using the base URI.
	 * 
	 * Values are trimmed and <code>null</code> values are ignored. For the
	 * predicates fed by several Tika properties, the values are added to the
	 * existing ones, skipping those that are already present. For the other
	 * predicates, the existing values are replaced.
	 * 
	 * @param toAnnot
	 *            The map of predicate values to be completed. It may already
	 *            contain values.
	 * @param metadata
	 *            The dirty map of metadata extracted by Tika.
	 * @return <code>toAnnot</code>, the map of RDF predicates and their
	 *         literal values.
	 */
	protected static Map<String, List<String>> fillMapWithMetadata(
			Map<String, List<String>> toAnnot, final Metadata metadata) {
		for (final String name : metadata.names()) {
			final List<String> values = TikaExtractorService.readValues(metadata, name);
			final String extentUnit = TikaExtractorService.extentUnitFor(name);

			if (name.equals(MSOffice.AUTHOR)
					|| name.equals(org.apache.tika.metadata.DublinCore.CREATOR)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.CREATOR_PROPERTY_NAME, values);
			} else if (name.equals(HttpHeaders.CONTENT_LOCATION)
					|| name.equals(HttpHeaders.LOCATION)
					|| name.equals(org.apache.tika.metadata.DublinCore.SOURCE)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.SOURCE_PROPERTY_NAME, values);
			} else if (name.equals(HttpHeaders.CONTENT_TYPE)
					|| name.equals(org.apache.tika.metadata.DublinCore.FORMAT)
					|| name.equals(TikaMimeKeys.MIME_TYPE_MAGIC)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.FORMAT_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.CONTRIBUTOR)
					|| name.equals(MSOffice.LAST_AUTHOR)
					|| name.equals(MSOffice.COMPANY)
					|| name.equals(MSOffice.MANAGER)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.CONTRIBUTOR_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.DATE)
					|| name.equals(MSOffice.LAST_PRINTED)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.DATE_PROPERTY_NAME, values);
			} else if (name.equals(HttpHeaders.LAST_MODIFIED)
					|| name.equals(org.apache.tika.metadata.DublinCore.MODIFIED)
					|| name.equals(MSOffice.LAST_SAVED)) {
				TikaExtractorService.mergeValues(toAnnot, DCTerms.MODIFIED,
						values);
			} else if (extentUnit != null) {
				// Counts are annotated as extents, with their unit.
				TikaExtractorService.mergeValues(toAnnot, DCTerms.EXTENT,
						TikaExtractorService.addUnitOnValues(values, extentUnit));
			} else if (name.equals(MSOffice.KEYWORDS)
					|| name.equals(org.apache.tika.metadata.DublinCore.SUBJECT)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.SUBJECT_PROPERTY_NAME, values);
			} else if (name.equals(CreativeCommons.LICENSE_LOCATION)
					|| name.equals(CreativeCommons.LICENSE_URL)) {
				TikaExtractorService.mergeValues(toAnnot, DCTerms.LICENSE,
						values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.DESCRIPTION)
					|| name.equals(MSOffice.NOTES)
					|| name.equals(MSOffice.CATEGORY)) {
				TikaExtractorService.mergeValues(toAnnot,
						DublinCore.DESCRIPTION_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.IDENTIFIER)) {
				toAnnot.put(DublinCore.IDENTIFIER_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.PUBLISHER)) {
				toAnnot.put(DublinCore.PUBLISHER_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.RELATION)) {
				toAnnot.put(DublinCore.RELATION_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.RIGHTS)) {
				toAnnot.put(DublinCore.RIGHTS_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.TITLE)) {
				toAnnot.put(DublinCore.TITLE_PROPERTY_NAME, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.TYPE)) {
				toAnnot.put(DublinCore.TYPE_PROPERTY_NAME, values);
			} else if (name.equals(MSOffice.CREATION_DATE)) {
				toAnnot.put(DCTerms.CREATED, values);
			} else if (name.equals(org.apache.tika.metadata.DublinCore.COVERAGE)) {
				toAnnot.put(DublinCore.COVERAGE_PROPERTY_NAME, values);
			} else if (name.equals(MSOffice.COMMENTS)) {
				toAnnot.put(RDFS.COMMENT, values);
			}

			// Whatever the mapping above, the raw metadata is also annotated
			// using a predicate built on the base URI. A copy of the values is
			// used since the list may already be referenced by the map.
			final String predicate = TikaExtractorService.toTikaPredicate(name);
			if (predicate != null) {
				toAnnot.put(predicate, new ArrayList<String>(values));
			}
		}

		return toAnnot;
	}

	/**
	 * Reads the values of a Tika metadata, trimmed and without
	 * <code>null</code>.
	 * 
	 * @param metadata
	 *            The metadata extracted by Tika.
	 * @param name
	 *            The name of the metadata to read.
	 * @return A new modifiable list, empty if the metadata has no value.
	 */
	private static List<String> readValues(final Metadata metadata,
			final String name) {
		final List<String> values = new ArrayList<String>();
		if (metadata.isMultiValued(name)) {
			for (final String value : metadata.getValues(name)) {
				if (value != null) {
					values.add(value.trim());
				}
			}
		} else {
			final String value = metadata.get(name);
			if (value != null) {
				values.add(value.trim());
			}
		}
		return values;
	}

	/**
	 * Adds <code>values</code> to those of <code>predicate</code>, creating the
	 * entry if needed and skipping the values that are already present (which
	 * happens when a document is extracted twice or when two Tika properties
	 * carry the same value).
	 * 
	 * @param toAnnot
	 *            The map to be completed.
	 * @param predicate
	 *            The predicate receiving the values.
	 * @param values
	 *            The values to add.
	 */
	private static void mergeValues(final Map<String, List<String>> toAnnot,
			final String predicate, final List<String> values) {
		List<String> existing = toAnnot.get(predicate);
		if (existing == null) {
			existing = new ArrayList<String>(values.size());
			toAnnot.put(predicate, existing);
		}
		for (final String value : values) {
			if (!existing.contains(value)) {
				existing.add(value);
			}
		}
	}

	/**
	 * @param name
	 *            The name of a Tika metadata.
	 * @return The unit (with its leading space) to append to the values of an
	 *         Office count metadata, or <code>null</code> if <code>name</code>
	 *         is not a count.
	 */
	private static String extentUnitFor(final String name) {
		if (name.equals(MSOffice.CHARACTER_COUNT)) {
			return " characters";
		}
		if (name.equals(MSOffice.CHARACTER_COUNT_WITH_SPACES)) {
			return " characters (with spaces)";
		}
		if (name.equals(MSOffice.PAGE_COUNT)) {
			return " pages";
		}
		if (name.equals(MSOffice.WORD_COUNT)) {
			return " words";
		}
		if (name.equals(MSOffice.PARAGRAPH_COUNT)) {
			return " paragraphs";
		}
		return null;
	}

	/**
	 * Builds the predicate used to annotate a raw Tika metadata: the base URI
	 * followed by the name of the metadata, in which the characters listed
	 * below are replaced.
	 * 
	 * @param name
	 *            The name of the Tika metadata.
	 * @return The predicate, or <code>null</code> (after logging a warning) if
	 *         no valid URI can be built from <code>name</code>.
	 */
	private static String toTikaPredicate(final String name) {
		final String cleanedName = name.replace('(', '_').replace(' ', '_').replace(
				')', '_').replace("N°", "N_").replace("n°", "n.")
				.replace('$', '_').replace('/', '_').replace('\\', '_')
				.replace('#', '_').replace('\'', '_').replace('.', '_')
				.replace(',', '_').replace('?', '_').replace('!', '_')
				.replace('@', '_');
		final String candidate = BASE_URI + cleanedName;
		try {
			return new URL(candidate).toURI().toString();
		} catch (final URISyntaxException urise) {
			logger.warn(
					"Unable to transform the property '" + name
							+ "' into a predicate (" + candidate + ")",
					urise);
		} catch (final MalformedURLException murle) {
			logger.warn(
					"Unable to transform the property '" + name
							+ "' into a predicate (" + candidate + ")",
					murle);
		}
		return null;
	}

	/**
	 * Checks the arguments received by the process method.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> of the process method.
	 * @return The <code>ComposedUnit</code> that must be contained by
	 *         <code>args</code>.
	 * @throws ProcessException
	 *             If <code>args</code> or its <code>resource</code> is
	 *             <code>null</code>, or if <code>resource</code> is not a
	 *             <code>ComposedUnit</code>.
	 */
	protected static ComposedUnit checkArgs(final ProcessArgs args)
			throws ProcessException {
		WebLabException wle = new WebLabException();
		wle.setErrorId("E1");
		wle.setErrorMessage("Invalid parameter.");

		if (args == null) {
			throw new ProcessException("ProcessArgs was null.", wle);
		}
		final Resource res = args.getResource();
		if (res == null) {
			throw new ProcessException("Resource of ProcessArgs was null.", wle);
		}
		if (!(res instanceof ComposedUnit)) {
			throw new ProcessException("Resource of ProcessArgs was not a Com"
					+ "posedUnit, but a " + res.getClass().getName() + ".", wle);
		}
		return (ComposedUnit) res;
	}

	/**
	 * Converts a date into its ISO8601 day representation
	 * (<code>yyyy-MM-dd</code>), using the default time zone.
	 * 
	 * @param inDateStr
	 *            The input date that might be in two different formats. The
	 *            Office one e.g.: <code>Mon Jan 05 16:53:20 CET 2009</code> or
	 *            already in ISO8601 format, with or without a time part
	 *            (<code>2009-01-05T16:53:20</code> or <code>2009-01-05</code>).
	 *            Any other date is logged as a warning and replaced by the
	 *            empty String.
	 * @return The date in ISO8601 format, or the empty String if
	 *         <code>inDateStr</code> is <code>null</code>, blank or
	 *         unreadable.
	 */
	protected static String convertToISO8601Date(String inDateStr) {
		if (inDateStr == null) {
			return "";
		}
		final String trimmed = inDateStr.trim();
		if (trimmed.equals("")) {
			return "";
		}

		// Dates starting with a digit are expected to be ISO8601 ones, the
		// others to be Office ones.
		final String[] patterns;
		if (Character.isDigit(trimmed.charAt(0))) {
			patterns = new String[] {
					TikaExtractorService.ISO8601_DATE_TIME_PATTERN,
					TikaExtractorService.ISO8601_DATE_PATTERN };
		} else {
			patterns = new String[] { TikaExtractorService.OFFICE_DATE_PATTERN };
		}

		Date date = null;
		ParseException lastFailure = null;
		for (int i = 0; i < patterns.length && date == null; i++) {
			try {
				// SimpleDateFormat is not thread safe: a new one is created
				// for each parsing.
				date = new SimpleDateFormat(patterns[i], Locale.ENGLISH)
						.parse(trimmed);
			} catch (final ParseException pe) {
				lastFailure = pe;
			}
		}
		if (date == null) {
			logger.warn(
					"Unable to read date: '" + trimmed + "'.", lastFailure);
			return "";
		}

		// The locale is fixed so that the digits and the calendar do not
		// depend on the platform.
		return new SimpleDateFormat(TikaExtractorService.ISO8601_DATE_PATTERN,
				Locale.ENGLISH).format(date);
	}

	/**
	 * Adding unit on each values of the list
	 * 
	 * @param values
	 *            the <code>List</code> of values
	 * @param unit
	 *            the unit to add
	 * @return a new <code>List</code> containing the values with unit
	 */
	protected static List<String> addUnitOnValues(List<String> values, String unit) {
		List<String> result = new ArrayList<String>(values.size());
		for (String val : values) {
			result.add(val + unit);
		}
		return result;
	}

	/**
	 * Loads the custom Tika configuration from the
	 * <code>tika-config.xml</code> classpath resource.
	 * 
	 * @return The custom Tika configuration.
	 * @throws ProcessException
	 *             If the resource cannot be found, read or parsed.
	 */
	protected static TikaConfig getTikaConfig() throws ProcessException {
		try {
			// NOTE(review): getFile() presumably requires the resource to be a
			// plain file of the file system; to be confirmed if the
			// configuration is ever packaged inside a jar.
			return new TikaConfig(
					new ClassPathResource("tika-config.xml").getFile());
		} catch (TikaException e) {
			throw TikaExtractorService.configLoadingFailure(e);
		} catch (IOException e) {
			throw TikaExtractorService.configLoadingFailure(e);
		} catch (SAXException e) {
			throw TikaExtractorService.configLoadingFailure(e);
		}
	}

	/**
	 * @param cause
	 *            The failure raised when loading the custom configuration.
	 * @return The <code>ProcessException</code> reporting that the custom Tika
	 *         configuration cannot be loaded.
	 */
	private static ProcessException configLoadingFailure(final Exception cause) {
		return TikaExtractorService.newProcessException(
				"Unable to load custom Tika Config.",
				"Unable to load custom Tika Config.", cause);
	}
}
