/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2011 CASSIDIAN
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.services.normaliser.tika;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import javax.imageio.ImageIO;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.language.ProfilingWriter;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.ow2.weblab.content.ContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.exception.WebLabUncheckedException;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.factory.MediaUnitFactory;
import org.ow2.weblab.core.extended.ontologies.DublinCore;
import org.ow2.weblab.core.helper.PoKHelper;
import org.ow2.weblab.core.helper.RDFHelperFactory;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.Image;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.model.processing.WProcessingAnnotator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class MediaUnitContentHandler extends ContentHandlerDecorator {

	private final static Log logger = LogFactory.getLog(MediaUnitContentHandler.class);
	private static final int MIN_IMAGE_WIDTH = 100;
	private static final int MIN_IMAGE_HEIGHT = 100;
	private Annotation docLangAnnot;
	private Document document;

	/**
	 * to supports an incremental language detection
	 ***/
	private ProfilingWriter pWriter;
	private ProfilingWriter pTableWriter;

	private Text textInProcess;
	private Text tableInProcess;

	private String muHtmlElement;

	/**
	 * List of elements that require new line in extracted text
	 */
	private static List<String> NEW_LINE_ELEMENTS;
	private static List<String> TAB_LIST_ELEMENTS;

	static {
		List<String> tempList = new ArrayList<String>();
		tempList.add("ol");
		tempList.add("dt");
		tempList.add("dl");
		tempList.add("li");
		MediaUnitContentHandler.NEW_LINE_ELEMENTS = Collections.unmodifiableList(tempList);

		List<String> tempList2 = new ArrayList<String>();
		tempList2.add("li");
		tempList2.add("dd");
		MediaUnitContentHandler.TAB_LIST_ELEMENTS = Collections.unmodifiableList(tempList2);
	}

	private static List<String> TITLE_LIST_ELEMENTS;
	static {
		List<String> tempList3 = new ArrayList<String>();
		tempList3.add("h1");
		tempList3.add("h2");
		tempList3.add("h3");
		tempList3.add("h4");
		tempList3.add("h5");
		tempList3.add("h6");
		MediaUnitContentHandler.TITLE_LIST_ELEMENTS = Collections.unmodifiableList(tempList3);
	}

	private static List<String> BLOC_LIST_ELEMENTS;
	static {
		List<String> tempList4 = new ArrayList<String>();
		// tempList4.add("span");
		tempList4.add("div");
		tempList4.add("p");
		MediaUnitContentHandler.BLOC_LIST_ELEMENTS = Collections.unmodifiableList(tempList4);
	}

	public MediaUnitContentHandler(ContentHandler handler, Document document) {
		super(handler);

		this.document = document;
		this.muHtmlElement = "";
		this.textInProcess = null;
		this.tableInProcess = null;
	}

	private void openTextMediaUnit() {
		this.textInProcess = MediaUnitFactory.createAndLinkMediaUnit(this.document, Text.class);
		this.textInProcess.setContent("");
		this.pWriter = new ProfilingWriter();
	}

	private void OpenTableMediaUnit() {
		this.tableInProcess = MediaUnitFactory.createAndLinkMediaUnit(this.document, Text.class);
		this.tableInProcess.setContent("");

		this.pTableWriter = new ProfilingWriter();
	}

	private void closeTextMediaUnit() {
		String content = textInProcess.getContent().replaceAll("\t", "").replaceAll("\n", "").replaceAll("\r", "").trim();
		if (content.isEmpty() || content.length() < 4)
			document.getMediaUnit().remove(textInProcess);
		else if (textInProcess.getContent().length() > 200)
			annotate(this.textInProcess, pWriter.getLanguage().getLanguage());
		this.textInProcess = null;
		this.pWriter = null;

	}

	private void CloseTableMediaUnit() {
		String content = tableInProcess.getContent().replaceAll("\t", "").replaceAll("\n", "").replaceAll("\r", "").trim();
		if (content.isEmpty())
			document.getMediaUnit().remove(tableInProcess);
		else if (tableInProcess.getContent().length() > 200)
			this.annotate(this.tableInProcess, pTableWriter.getLanguage().getLanguage());

		this.tableInProcess = null;
		this.pTableWriter = null;
	}

	private void AddImageMediaUnit(Attributes atts) {
		String src = atts.getValue("src");
		String contentURI = "";
		BufferedImage image = null;

		// TODO

		if (src.endsWith(".jpg") || src.endsWith(".png") || src.endsWith(".bmp"))
			try {
				if (src.startsWith("http://")) {
					URL url = new URL(src);
					// BufferedReader br = new BufferedReader(new
					// InputStreamReader(new URL(src).openStream()));
					// InputStream inputStream = url.openStream();

					// Read from a URL
					image = ImageIO.read(url);
					contentURI = src;
				} else {
					// Read from an input stream
					// image = ImageIO.read(new BufferedInputStream(new
					// FileInputStream(src)));

				}
				if (image != null) {
					if (image.getWidth() > MIN_IMAGE_WIDTH && image.getHeight() > MIN_IMAGE_HEIGHT) {
						Image ImageMU = MediaUnitFactory.createAndLinkMediaUnit(this.document, Image.class);
						ImageMU.setUri(ImageMU.getUri());
						String baseUri = "http://weblab.eads.com/service/format/tika/";

						if (contentURI.isEmpty()) {
							File file = new File(src);
							contentURI = "weblab://images/content/" + file.getName();
							ContentManager contentManager = ContentManager.getInstance();
							if (contentManager == null)
								throw new WebLabUncheckedException("Unable to load required " + "properties file for content management.");
							try {
								contentManager.saveNativeContent(new FileInputStream(file), ImageMU);
							} catch (WebLabCheckedException e1) {
								logger.warn("Unable to copy image.", e1);
							}

						}

						for (int i = 0; i < atts.getLength(); i++) {
							if ("alt".equals(atts.getLocalName(i)) || "title".equals(atts.getLocalName(i))) {
								Annotation muImageAnnot = AnnotationFactory.createAndLinkAnnotation(ImageMU);
								PoKHelper pokH = RDFHelperFactory.getPoKHelper(muImageAnnot);
								pokH.setAutoCommitMode(false);
								if ("alt".equals(atts.getLocalName(i)))
									pokH.createLitStat(ImageMU.getUri(), baseUri + "alt", atts.getValue(i).replace("<br/>", ""));
								else
									pokH.createLitStat(ImageMU.getUri(), DublinCore.TITLE_PROPERTY_NAME, atts.getValue(i).replace("<br/>", ""));

								pokH.commit();
								break;
							}

						}

						try {
							Annotation muImageAnnot = AnnotationFactory.createAndLinkAnnotation(ImageMU);
							PoKHelper pokH = RDFHelperFactory.getPoKHelper(muImageAnnot);
							pokH.setAutoCommitMode(false);
							pokH.createLitStat(ImageMU.getUri(), new URL(baseUri + "width").toURI().toString(), String.valueOf(image.getWidth()));
							pokH.createLitStat(ImageMU.getUri(), new URL(baseUri + "height").toURI().toString(), String.valueOf(image.getHeight()));
							pokH.commit();
						} catch (MalformedURLException e1) {
							logger.warn("Malformed URL : " + baseUri + "width" + ")");
						} catch (URISyntaxException e1) {
							logger.warn("Unable to transform the property 'width" + "' into a predicate (" + baseUri + "width" + ")");
						}

						// TODO ??
						// ImageMU.setContent(normalisedContent);
					}
				}
			} catch (MalformedURLException e) {
				logger.info("Unable to read image. " + src + " can not be transformed to URI.");
			} catch (IOException e) {
				logger.info("Unable to convert to image : " + src);
			}
	}

	public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
		super.startElement(uri, localName, name, atts);

		if (name.equals("body")) {
			this.muHtmlElement = name;
		}
		/**
		 * If the current element is not under a table element
		 */
		if (this.tableInProcess == null) {
			/**
			 * If a title element is detected
			 */
			if (TITLE_LIST_ELEMENTS.contains(name)) {
				if (this.textInProcess == null) {
					this.openTextMediaUnit();
					muHtmlElement = name;
				} else {
					if (muHtmlElement.equals(name) || muHtmlElement.equals("body")) {
						this.closeTextMediaUnit();
						muHtmlElement = name;
						this.openTextMediaUnit();

					} else
						this.textInProcess.setContent(this.textInProcess.getContent() + System.getProperty("line.separator"));
				}
			}

			/**
			 * If a bloc element is detected
			 */
			if (BLOC_LIST_ELEMENTS.contains(name)) {
				if (this.textInProcess == null) {
					this.openTextMediaUnit();
					muHtmlElement = name;
				} else {
					/**
					 * If the current element is under a title element or
					 * another bloc element
					 */
					if (TITLE_LIST_ELEMENTS.contains(muHtmlElement) || BLOC_LIST_ELEMENTS.contains(muHtmlElement))
						this.textInProcess.setContent(this.textInProcess.getContent() + System.getProperty("line.separator"));
					else {
						this.closeTextMediaUnit();
						muHtmlElement = name;
						this.openTextMediaUnit();

					}
				}
			}

			/**
			 * If a table element is detected
			 */
			if (name.equals("table")) {
				this.OpenTableMediaUnit();
				// if(!TITLE_LIST_ELEMENTS.contains(muHtmlElement)&&
				// this.textInProcess != null)
				if (this.textInProcess != null)
					this.closeTextMediaUnit();
			}
		}

		/**
		 * If an image element is detected
		 */
		if (name.equals("img")) {
			AddImageMediaUnit(atts);
			if (this.tableInProcess == null && this.textInProcess != null) {
				if (BLOC_LIST_ELEMENTS.contains(muHtmlElement))
					this.textInProcess.setContent(this.textInProcess.getContent() + System.getProperty("line.separator"));
				else
					this.closeTextMediaUnit();
			}
		}

		if (this.textInProcess != null) {
			/**
			 * Insert line separator if the new element is in the new line
			 * element list
			 */
			if (MediaUnitContentHandler.NEW_LINE_ELEMENTS.contains(name)) {
				this.textInProcess.setContent(this.textInProcess.getContent() + System.getProperty("line.separator"));
			}
			/**
			 * Insert tab if the new element is a list item (and '-' character?)
			 */
			if (MediaUnitContentHandler.TAB_LIST_ELEMENTS.contains(name)) {
				this.textInProcess.setContent(this.textInProcess.getContent() + "\t");
			}
			if (name.equals("a") || name.equals("span"))
				this.textInProcess.setContent(this.textInProcess.getContent() + " ");
		}

	}

	public void endElement(String uri, String localName, String name) throws SAXException {

		super.endElement(uri, localName, name);

		if (this.tableInProcess != null) {
			/**
			 * Insert new line character if the new element is a table row
			 */
			if (name.equals("tr"))
				this.tableInProcess.setContent(this.tableInProcess.getContent() + "\n");
			/**
			 * Insert tab character if the new element is a table cell Trim text
			 * if not empty
			 */
			else if (name.equals("td"))
				this.tableInProcess.setContent(this.tableInProcess.getContent() + "\t");
			else if (name.equals("table")) {

				this.CloseTableMediaUnit();
			}
		}

		else if (this.textInProcess != null) {
			if (name.equals("br"))
				this.tableInProcess.setContent(this.tableInProcess.getContent() + "\n");

			// if ((name.equals(muHtmlElement)&& nbSimilarMuHtmlElement==0 &&
			// !TITLE_LIST_ELEMENTS.contains(muHtmlElement))||name.equals("body"))
			if ((name.equals(muHtmlElement) && !TITLE_LIST_ELEMENTS.contains(muHtmlElement)) || name.equals("body")) {
				this.closeTextMediaUnit();
				this.muHtmlElement = "body";
			} else if (BLOC_LIST_ELEMENTS.contains(name) || NEW_LINE_ELEMENTS.contains(name) || TITLE_LIST_ELEMENTS.contains(name))
				this.textInProcess.setContent(this.textInProcess.getContent() + System.getProperty("line.separator"));
			else
				this.textInProcess.setContent(this.textInProcess.getContent() + " ");
		}

	}

	public void characters(char[] ch, int start, int length) throws SAXException {
		super.characters(ch, start, length);

		if (muHtmlElement != "") {
			int end = start + length;
			StringBuilder sb = new StringBuilder();
			for (int i = start; i < end; i++) {
				sb.append(ch[i]);
			}
			String content = sb.toString().replaceAll("\t", "").replaceAll("\n", "").replaceAll("\r", "").trim();

			if (content.length() != 0) {
				content = sb.toString().replaceAll("\t", " ").replaceAll("\n", " ").replaceAll("\r", " ").replaceAll("  ", " ");
				if (this.textInProcess == null && this.tableInProcess == null)
					this.openTextMediaUnit();

				if (this.tableInProcess != null) {
					/**
					 * If inserted text is in "table" element, make sure there
					 * is no spaces or new lines characters
					 */
					// this.tableInProcess.setContent(this.tableInProcess.getContent()+
					// sb.toString().trim());
					this.tableInProcess.setContent(this.tableInProcess.getContent() + content);
					try {
						this.pTableWriter.append(sb.toString());
					} catch (IOException e) {
						logger.error(e.getMessage());
					}
				} else {
					// this.textInProcess.setContent(this.textInProcess.getContent()+
					// sb.toString());
					this.textInProcess.setContent(this.textInProcess.getContent() + content);
					try {
						this.pWriter.append(sb.toString());
					} catch (IOException e) {
						logger.error(e.getMessage());
					}
				}
			}
		}
	}

	/**
	 * @param mu
	 *            The resource to be annotated
	 * @param language
	 *            The language to annotate using dc:language property statements
	 *            on res.
	 */

	// private void annotate(Resource mu, final String language) {
	// Annotation annot = AnnotationFactory.createAndLinkAnnotation(mu);
	// PoKHelper pokH = RDFHelperFactory.getPoKHelper(annot);
	// pokH.setAutoCommitMode(false);
	// pokH.createLitStat(mu.getUri(), DublinCore.LANGUAGE_PROPERTY_NAME,
	// language);
	// pokH.commit();
	// }
	private void annotate(Resource mu, final String language) {
		if (docLangAnnot == null)
			docLangAnnot = AnnotationFactory.createAndLinkAnnotation(document);
		PoKHelper pokH = RDFHelperFactory.getPoKHelper(docLangAnnot);
		pokH.setAutoCommitMode(false);
		pokH.createLitStat(document.getUri(), DublinCore.LANGUAGE_PROPERTY_NAME, language);
		pokH.commit();

		Annotation muLangAnnot = AnnotationFactory.createAndLinkAnnotation(mu);
		pokH = RDFHelperFactory.getPoKHelper(muLangAnnot);
		pokH.setAutoCommitMode(false);
		pokH.createLitStat(mu.getUri(), DublinCore.LANGUAGE_PROPERTY_NAME, language);
		pokH.commit();
	}

}
