package org.apache.tika.parser.html;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

import org.ow2.weblab.core.extended.properties.PropertiesLoader;
import org.ow2.weblab.services.normaliser.tika.TikaExtractorService;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import de.l3s.boilerpipe.extractors.KeepEverythingExtractor;

/**
 * HTML document parser that runs Tika's {@link HtmlParser} through a
 * boilerpipe content handler.
 * <p>
 * The boilerpipe extractor to use is read from the
 * {@code html-parser.properties} file (key {@code boilerpipeExtractor}) on
 * every call to {@code parse}. Recognised values are {@code Default},
 * {@code Article} and {@code KeepEverything}; any other value, or a missing
 * key, selects the plain Tika {@link HtmlParser} without boilerpipe filtering.
 *
 * @author lkhelif
 */
public class CustomBoilerpipeHtmlParser implements Parser {

    private static final Log logger = LogFactory.getLog(CustomBoilerpipeHtmlParser.class);

    /** Name of the properties file holding the parser configuration. */
    private static final String HTML_PARSER_PROPERTIES_FILE_NAME = "html-parser.properties";
    /** Property key selecting the boilerpipe extractor. */
    private static final String BOILERPIPE_EXTRACTOR = "boilerpipeExtractor";
    private static final String BOILERPIPE_ARTICLE_EXTRACTOR = "Article";
    private static final String BOILERPIPE_DEFAULT_EXTRACTOR = "Default";
    private static final String BOILERPIPE_KEEP_EVERYTHING_EXTRACTOR = "KeepEverything";

    /**
     * Parses the HTML stream with the extractor configured in the properties
     * file.
     *
     * @param stream   the HTML content to parse
     * @param handler  receiver of the SAX events produced by the parse
     * @param metadata document metadata, passed through to the underlying parser
     * @param context  parse context, passed through to the underlying parser
     * @throws TikaException if loading the configuration or parsing fails; any
     *                       exception raised inside this method (including
     *                       {@code IOException} and {@code SAXException}) is
     *                       wrapped as the cause
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        try {
            final Map<String, String> props = PropertiesLoader.loadProperties(HTML_PARSER_PROPERTIES_FILE_NAME);

            // Read the setting once. A missing map or key yields null, which
            // matches none of the constants below and therefore selects the
            // plain Tika parser instead of raising a NullPointerException.
            final String extractor = (props == null) ? null : props.get(BOILERPIPE_EXTRACTOR);

            if (BOILERPIPE_DEFAULT_EXTRACTOR.equals(extractor)) {
                parseUsingDefaultExtractor(stream, metadata, handler, context);
            } else if (BOILERPIPE_ARTICLE_EXTRACTOR.equals(extractor)) {
                parseUsingArticleExtractor(stream, metadata, handler, context);
            } else if (BOILERPIPE_KEEP_EVERYTHING_EXTRACTOR.equals(extractor)) {
                parseUsingKeepEverythingExtractor(stream, metadata, handler, context);
            } else {
                parseUsingTikaParser(stream, metadata, handler, context);
            }
        } catch (Exception e) {
            // Log the cause as well, so the failure can be diagnosed from the log.
            logger.error("Document could not be parsed.", e);
            throw new TikaException("Failed to parse HTML document", e);
        }
    }

    // NOTE(review): each fallback below hands the SAME stream and handler to
    // the next parser after a failed attempt. The failed attempt may already
    // have consumed part of the stream and may already have sent events to the
    // handler - confirm whether the input needs to be buffered/reset before
    // retrying.

    /**
     * Parses with the boilerpipe {@code KeepEverythingExtractor}; on any
     * failure falls back to the plain Tika parser.
     */
    private void parseUsingKeepEverythingExtractor(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException {
        HtmlParser parser = new HtmlParser();
        logger.info("Using boilerpipe extractor filter : " + BOILERPIPE_KEEP_EVERYTHING_EXTRACTOR);

        try {
            BoilerpipeContentHandler boilerpipeContentHandler = new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
            boilerpipeContentHandler.setIncludeMarkup(true);
            parser.parse(stream, boilerpipeContentHandler, metadata, context);
        } catch (Exception e) {
            logger.error("Extractor failed to extract content... Try to use another extractor.", e);
            parseUsingTikaParser(stream, metadata, handler, context);
        }
    }

    /**
     * Parses with the boilerpipe {@code ArticleExtractor}; on any failure
     * falls back to the default-extractor path.
     */
    private void parseUsingArticleExtractor(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException {
        HtmlParser parser = new HtmlParser();
        logger.info("Using boilerpipe extractor filter : " + BOILERPIPE_ARTICLE_EXTRACTOR);

        try {
            BoilerpipeContentHandler boilerpipeContentHandler = new BoilerpipeContentHandler(handler, ArticleExtractor.INSTANCE);
            boilerpipeContentHandler.setIncludeMarkup(true);
            parser.parse(stream, boilerpipeContentHandler, metadata, context);
        } catch (Exception e) {
            logger.error("Extractor failed to extract content... Try to use another extractor.", e);
            parseUsingDefaultExtractor(stream, metadata, handler, context);
        }
    }

    /**
     * Parses with a {@code BoilerpipeContentHandler} constructed without an
     * explicit extractor; on any failure falls back to the keep-everything
     * path.
     */
    private void parseUsingDefaultExtractor(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException {
        HtmlParser parser = new HtmlParser();
        logger.info("Using boilerpipe extractor filter : " + BOILERPIPE_DEFAULT_EXTRACTOR);

        try {
            BoilerpipeContentHandler boilerpipeContentHandler = new BoilerpipeContentHandler(handler);
            boilerpipeContentHandler.setIncludeMarkup(true);
            parser.parse(stream, boilerpipeContentHandler, metadata, context);
        } catch (Exception e) {
            logger.error("Extractor failed to extract content... Try to use another extractor.", e);
            parseUsingKeepEverythingExtractor(stream, metadata, handler, context);
        }
    }

    /**
     * Parses with the plain Tika {@link HtmlParser}, sending events directly
     * to the caller's handler. This is the last step of the fallback chain:
     * its exceptions propagate to the caller.
     */
    private void parseUsingTikaParser(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException {
        HtmlParser parser = new HtmlParser();
        logger.info("Using HTML Tika parser.");
        parser.parse(stream, handler, metadata, context);
    }

    /**
     * Convenience overload that parses with a fresh, empty
     * {@link ParseContext}.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

    /**
     * Returns the media types supported by the underlying {@link HtmlParser},
     * to which every parse is ultimately delegated.
     *
     * @param context the parse context, passed through to {@link HtmlParser}
     * @return the media types reported by {@link HtmlParser} (previously this
     *         method returned {@code null})
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return new HtmlParser().getSupportedTypes(context);
    }

}

