/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.services.normaliser.tika;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.tika.sax.ContentHandlerDecorator;
import org.ow2.weblab.core.extended.factory.MediaUnitFactory;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.Text;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * SAX content handler decorator that forwards every event to the wrapped
 * handler and, in addition, accumulates the character data received after the
 * <code>body</code> element has started into {@link Text} units obtained from
 * <code>MediaUnitFactory.createAndLinkMediaUnit</code> on the supplied
 * {@link Document}.
 * <p>
 * A new text unit is started on every <code>div</code> start tag and closed on
 * the matching end tag; character data arriving while no unit is open starts a
 * new one. Simple layout hints are inserted in the text: a line separator
 * before paragraph-like elements, a tab and dash before list items, a tab after
 * each table cell and a new line after each table row. Character chunks
 * received inside a table are trimmed before being appended.
 * <p>
 * Elements are matched on the qualified name (<code>name</code> argument) of
 * the SAX events.
 */
public class MediaUnitContentHandler extends ContentHandlerDecorator {

	/**
	 * Elements that require a new line in the extracted text
	 */
	private static final List<String> NEW_LINE_ELEMENTS;

	/**
	 * Elements that are rendered as list items in the extracted text
	 */
	private static final List<String> ITEM_LIST_ELEMENTS;

	static {
		List<String> newLineElements = new ArrayList<String>();
		newLineElements.add("p");
		newLineElements.add("table");
		newLineElements.add("dd");
		newLineElements.add("dt");
		newLineElements.add("li");
		NEW_LINE_ELEMENTS = Collections.unmodifiableList(newLineElements);

		List<String> itemListElements = new ArrayList<String>();
		itemListElements.add("li");
		ITEM_LIST_ELEMENTS = Collections.unmodifiableList(itemListElements);
	}

	/**
	 * Document that receives the created text units
	 */
	private final Document document;

	/**
	 * Text unit currently being filled, or <code>null</code> when none is open
	 */
	private Text textInProcess;

	/**
	 * Set once the start of the <code>body</code> element has been seen
	 */
	private boolean isInBody;

	/**
	 * Number of currently open <code>table</code> elements. A counter (rather
	 * than a flag) keeps the "inside a table" state correct when tables are
	 * nested.
	 */
	private int tableDepth;

	/**
	 * @param handler
	 *            the handler to which every SAX event is forwarded
	 * @param document
	 *            the document on which text units are created
	 */
	public MediaUnitContentHandler(ContentHandler handler, Document document) {
		super(handler);
		this.document = document;
		this.isInBody = false;
		this.tableDepth = 0;
	}

	/**
	 * Forwards the event, then updates the extraction state: marks the body as
	 * entered, opens a new text unit on <code>div</code>, tracks table nesting
	 * and, when a text unit is open, inserts the layout prefix associated with
	 * the element.
	 */
	public void startElement(String uri, String localName, String name,
			Attributes atts) throws SAXException {
		super.startElement(uri, localName, name, atts);

		if ("body".equals(name)) {
			this.isInBody = true;
		}

		if ("div".equals(name)) {
			this.textInProcess = MediaUnitFactory.createAndLinkMediaUnit(
					this.document, Text.class);
			this.textInProcess.setContent("");
		}

		// Table nesting is tracked whether or not a text unit is open, so that
		// it stays symmetric with the bookkeeping done in endElement.
		if ("table".equals(name)) {
			this.tableDepth++;
		}

		if (this.textInProcess != null) {
			// Line separator before elements of the new line element list
			if (NEW_LINE_ELEMENTS.contains(name)) {
				appendToCurrentText(System.getProperty("line.separator"));
			}
			// Tab and '-' character before list items
			if (ITEM_LIST_ELEMENTS.contains(name)) {
				appendToCurrentText("\t- ");
			}
		}
	}

	/**
	 * Forwards the event, then closes the current text unit on
	 * <code>div</code>, appends a new line after a table row or a tab after a
	 * table cell when a text unit is open, and leaves one table nesting level
	 * on <code>table</code>.
	 */
	public void endElement(String uri, String localName, String name)
			throws SAXException {
		super.endElement(uri, localName, name);

		if (this.textInProcess != null) {
			if ("div".equals(name)) {
				this.textInProcess = null;
			} else if ("tr".equals(name)) {
				// New line at the end of a table row
				appendToCurrentText("\n");
			} else if ("td".equals(name)) {
				// Tab at the end of a table cell
				appendToCurrentText("\t");
			}
		}

		// Guard against unbalanced events so the depth never becomes negative
		if ("table".equals(name) && this.tableDepth > 0) {
			this.tableDepth--;
		}
	}

	/**
	 * Forwards the event, then, once the body has started, appends the
	 * received characters to the current text unit, creating one if none is
	 * open. Chunks received inside a table are trimmed first.
	 */
	public void characters(char[] ch, int start, int length)
			throws SAXException {
		super.characters(ch, start, length);

		if (!this.isInBody) {
			return;
		}

		if (this.textInProcess == null) {
			this.textInProcess = MediaUnitFactory.createAndLinkMediaUnit(
					this.document, Text.class);
			this.textInProcess.setContent("");
		}

		String chunk = new String(ch, start, length);
		// Inside a table, drop surrounding spaces and new lines: cell and row
		// separators are inserted explicitly by endElement.
		if (this.tableDepth > 0) {
			chunk = chunk.trim();
		}
		appendToCurrentText(chunk);
	}

	/**
	 * Appends the given string to the content of the current text unit. Must
	 * only be called while a text unit is open.
	 */
	private void appendToCurrentText(String addition) {
		this.textInProcess.setContent(this.textInProcess.getContent()
				+ addition);
	}
}
