/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.service.cleaner;

import java.util.HashSet;
import java.util.ListIterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.jws.WebService;

import org.apache.commons.logging.LogFactory;
import org.weblab_project.core.exception.WebLabUncheckedException;
import org.weblab_project.core.helper.BeanHelper;
import org.weblab_project.core.helper.PoKHelperExtended;
import org.weblab_project.core.helper.RDFHelperFactory;
import org.weblab_project.core.model.Annotation;
import org.weblab_project.core.model.ComposedUnit;
import org.weblab_project.core.model.LowLevelDescriptor;
import org.weblab_project.core.model.MediaUnit;
import org.weblab_project.core.model.Resource;
import org.weblab_project.core.model.Segment;
import org.weblab_project.core.model.structure.Cell;
import org.weblab_project.core.model.structure.Line;
import org.weblab_project.core.model.structure.Table;
import org.weblab_project.services.analyser.Analyser;
import org.weblab_project.services.analyser.ProcessException;
import org.weblab_project.services.analyser.types.ProcessArgs;
import org.weblab_project.services.analyser.types.ProcessReturn;
import org.weblab_project.services.exception.WebLabException;

/**
 * Component that processes any <code>Resource</code> to remove some statements from its <code>Annotation</code>s.<br />
 * 
 * The cleaning method is recursively called on any sub-<code>Resource</code> of the given <code>Resource</code>.<br />
 * 
 * When an <code>Annotation</code> does not contain any statement after the cleaning, that <code>Annotation</code> is removed; likewise, any <code>Segment</code> left without
 * <code>Annotation</code> after the cleaning is removed.<br />
 * 
 * This service is easy to configure and use.
 * <ol>
 * <li>The only configuration file you need to change is AnnotationCleaner.xml. It is a Spring Bean container XML file. It features 2 main beans and 1 other. At least one of the two main beans is
 * compulsory but they can be used together.
 * <ul>
 * <li>The first main bean is: <code>exactSet</code>. It's a {@link Set} that contains {@link String}s. Every RDF statement whose predicate is exactly equal to one of these {@link String}s is removed.</li>
 * <li>The second main bean is: <code>regexSet</code>. It's a {@link Set} that contains regex {@link String}s. Every RDF statement whose predicate matches one of these regular expressions is
 * removed.</li>
 * <li>The last bean is: <code>removeType</code>. It's a {@link Boolean}. The default value is false. It has been added to remove a specific statement that may be included in an annotation: the one
 * declaring the {@link Resource} (the one the {@link Annotation} is about) as a Resource. This statement is only removed if nothing else is left in the {@link Annotation}.</li>
 * </ul>
 * </li>
 * </ol>
 * 
 * @see Pattern
 */
@WebService(endpointInterface = "org.weblab_project.services.analyser.Analyser")
public class Cleaner implements Analyser {

	/**
	 * Exact predicate URIs; every statement using one of them as predicate is removed.
	 */
	private final Set<String> exactPropsToRemove = new HashSet<String>();

	/**
	 * Compiled regular expressions; every statement whose predicate entirely matches one of them is removed.
	 */
	private final Set<Pattern> regexPropsToRemove = new HashSet<Pattern>();

	/**
	 * Whether an <code>Annotation</code> whose <b>only</b> remaining statements define the RDF type of the annotated <code>Resource</code> has to be removed too.
	 */
	private boolean removeRDFTypeOnURI = false;

	/**
	 * Set to <code>true</code> once {@link #init()} has loaded the configuration.
	 */
	private boolean initialised = false;

	private static final String REGEX_SET = "regexSet";
	private static final String EXACT_SET = "exactSet";
	private static final String REMOVE_TYPE = "removeType";

	private static final String BEAN = "AnnotationCleaner.xml";
	private static final String RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";

	/**
	 * Cleans the <code>Resource</code> contained by <code>args</code> and returns it.<br />
	 * 
	 * The configuration is loaded on the first call. The <code>Resource</code> is modified in place; it is left untouched when neither exact predicates nor regular expressions are configured.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> holding the <code>Resource</code> to be cleaned.
	 * @return A <code>ProcessReturn</code> containing the (same instance of the) cleaned <code>Resource</code>.
	 * @throws ProcessException
	 *             If <code>args</code> or its <code>Resource</code> is <code>null</code>.
	 * @see org.weblab_project.services.analyser.Analyser#process(org.weblab_project.services.analyser.types.ProcessArgs)
	 */
	public ProcessReturn process(ProcessArgs args) throws ProcessException {
		final Resource res = this.checkArgs(args);

		this.init();

		if (!this.exactPropsToRemove.isEmpty() || !this.regexPropsToRemove.isEmpty()) {
			// Useless to crawl the resource if nothing has to be done
			this.clean(res);
		}

		final ProcessReturn ret = new ProcessReturn();
		ret.setResource(res);
		return ret;
	}

	/**
	 * Cleans any <code>Resource</code>.<br />
	 * 
	 * Processes its <code>Annotation</code>s to clean them using the exact and regex predicate <code>Set</code>s. When an <code>Annotation</code> does not contain any statement after the cleaning
	 * (or only type statements on <code>res</code>, if the service is configured to drop those), the <code>Annotation</code> is removed from <code>res</code>.<br />
	 * 
	 * This method is called recursively on any <code>Annotation</code> or <code>LowLevelDescriptor</code> contained by <code>res</code>.<br />
	 * 
	 * If <code>res</code> is a <code>MediaUnit</code> then {@link #cleanMediaUnit(MediaUnit)} is called on it.
	 * 
	 * @param res
	 *            The <code>Resource</code> to be cleaned. A <code>null</code> value is silently ignored.
	 */
	private void clean(Resource res) {
		if (res == null) {
			// Nothing to clean (e.g. a Cell that holds no inner MediaUnit).
			return;
		}

		if (!res.getAnnotation().isEmpty()) {
			final ListIterator<Annotation> li = res.getAnnotation().listIterator();
			final PoKHelperExtended ahe = RDFHelperFactory.getPoKHelperExtended(new Annotation());
			// Changes are only written back through the explicit commit() below.
			ahe.setAutoCommitMode(false);
			while (li.hasNext()) {
				ahe.setPoK(li.next());
				this.removeConfiguredStatements(ahe);

				// If nothing (or nothing but the type of the annotated resource)
				// remains, drop the whole annotation, else commit changes.
				final int size = ahe.getPreds().size();
				if (size == 0 || this.holdsOnlyTypeOf(ahe, res, size)) {
					li.remove();
				} else {
					ahe.commit();
				}
			}
		}

		for (Annotation annot : res.getAnnotation()) {
			// Clean annot over annot recursively
			this.clean(annot);
		}

		for (LowLevelDescriptor lld : res.getDescriptor()) {
			// Clean annot over descriptors recursively
			this.clean(lld);
		}

		if (res instanceof MediaUnit) {
			this.cleanMediaUnit((MediaUnit) res);
		}
	}

	/**
	 * Removes from the piece of knowledge currently loaded in <code>ahe</code> every statement whose predicate is configured for removal, either exactly or through a regular expression.
	 * 
	 * @param ahe
	 *            The helper already set on the <code>Annotation</code> to be cleaned.
	 */
	private void removeConfiguredStatements(final PoKHelperExtended ahe) {
		for (final String property : this.exactPropsToRemove) {
			ahe.removeStatsWithPred(property);
		}

		if (this.regexPropsToRemove.isEmpty()) {
			return;
		}

		// Iterate over a snapshot: statements are removed inside the loop and the
		// collection returned by the helper may be backed by the model being modified.
		final Set<String> remainingPreds = new HashSet<String>(ahe.getPreds());
		for (final String property : remainingPreds) {
			for (final Pattern p : this.regexPropsToRemove) {
				final Matcher m = p.matcher(property);
				if (m.matches()) {
					ahe.removeStatsWithPred(property);
					// Already removed, no need to test the other patterns.
					break;
				}
			}
		}
	}

	/**
	 * Tests whether the piece of knowledge loaded in <code>ahe</code> contains nothing but statements defining the RDF type of <code>res</code>.<br />
	 * 
	 * Always answers <code>false</code> when the service is not configured to remove such type-only <code>Annotation</code>s.
	 * 
	 * @param ahe
	 *            The helper already set on the cleaned <code>Annotation</code>.
	 * @param res
	 *            The <code>Resource</code> holding the <code>Annotation</code>.
	 * @param size
	 *            The size of the predicates collection of <code>ahe</code> after the cleaning.
	 * @return <code>true</code> if the <code>Annotation</code> only types <code>res</code> and has to be removed.
	 */
	private boolean holdsOnlyTypeOf(final PoKHelperExtended ahe, final Resource res, final int size) {
		if (!this.removeRDFTypeOnURI) {
			return false;
		}

		// Any other predicate means that the annotation still carries real content;
		// comparing counts alone could match by coincidence and drop that content.
		for (final String pred : ahe.getPreds()) {
			if (!Cleaner.RDF_TYPE.equals(pred)) {
				return false;
			}
		}

		final int typeValuesOnRes = ahe.getRessOnPredSubj(res.getUri(), Cleaner.RDF_TYPE).size() + ahe.getAnonRessOnPredSubj(res.getUri(), Cleaner.RDF_TYPE).size()
				+ ahe.getLitsOnPredSubj(res.getUri(), Cleaner.RDF_TYPE).size();
		return size == typeValuesOnRes;
	}

	/**
	 * Cleans any <code>MediaUnit</code>.<br />
	 * 
	 * Processes its <code>Segment</code>s to clean them using {@link #clean(Resource)}. When a <code>Segment</code> does not contain any <code>Annotation</code> after the cleaning, the
	 * <code>Segment</code> is removed from <code>mu</code>.<br />
	 * 
	 * If <code>mu</code> contains inner <code>MediaUnit</code>s (true for <code>ComposedUnit</code>, <code>Table</code>, <code>Line</code> or <code>Cell</code>) those <code>MediaUnit</code>s are
	 * cleaned too, using {@link #clean(Resource)}.
	 * 
	 * @param mu
	 *            The <code>MediaUnit</code> to be cleaned.
	 */
	private void cleanMediaUnit(MediaUnit mu) {
		final ListIterator<Segment> li = mu.getSegment().listIterator();
		while (li.hasNext()) {
			final Segment seg = li.next();
			this.clean(seg);
			if (seg.getAnnotation().isEmpty()) {
				li.remove();
			}
		}

		if (mu instanceof ComposedUnit) {
			for (MediaUnit innerMu : ((ComposedUnit) mu).getMediaUnit()) {
				this.clean(innerMu);
			}
		} else if (mu instanceof Table) {
			for (Line line : ((Table) mu).getLine()) {
				this.clean(line);
			}
		} else if (mu instanceof Line) {
			for (Cell cell : ((Line) mu).getCell()) {
				this.clean(cell);
			}
		} else if (mu instanceof Cell) {
			// clean(Resource) ignores a null inner unit.
			this.clean(((Cell) mu).getMediaUnit());
		}
	}

	/**
	 * Validates the arguments received by {@link #process(ProcessArgs)}.
	 * 
	 * @param args
	 *            The <code>ProcessArgs</code> of {@link #process(ProcessArgs)} method.
	 * @throws ProcessException
	 *             If <code>args</code> or its <code>Resource</code> is <code>null</code>.
	 * @return The <code>Resource</code> in <code>args</code>.
	 */
	private Resource checkArgs(final ProcessArgs args) throws ProcessException {
		if (args == null) {
			throw this.invalidParameter("ProcessArgs was null.");
		}
		final Resource res = args.getResource();
		if (res == null) {
			throw this.invalidParameter("Resource of ProcessArgs was null.");
		}
		return res;
	}

	/**
	 * Builds the exception reported for an invalid input parameter (error id <code>E1</code>).
	 * 
	 * @param message
	 *            The message of the exception.
	 * @return The <code>ProcessException</code> to be thrown.
	 */
	private ProcessException invalidParameter(final String message) {
		final WebLabException wle = new WebLabException();
		wle.setErrorId("E1");
		wle.setErrorMessage("Invalid parameter.");
		return new ProcessException(message, wle);
	}

	/**
	 * Initialises the service by retrieving the two sets of strings and the boolean from the BEAN file.<br />
	 * 
	 * Only the first call does the work; the method is synchronized and guarded by a flag. A bean that cannot be retrieved is logged and replaced by an empty set (or <code>false</code>); an invalid
	 * regular expression is logged and skipped.
	 */
	private synchronized void init() {
		if (this.initialised) {
			return;
		}

		final BeanHelper bh = BeanHelper.getInstance().getSpecificInstance(Cleaner.BEAN, true);

		// Extract exact set
		for (final Object obj : this.loadSet(bh, Cleaner.EXACT_SET, "Unable to retrieve exact set.")) {
			if (obj != null) {
				this.exactPropsToRemove.add(obj.toString().trim());
			}
		}

		// Extract regex set
		for (final Object obj : this.loadSet(bh, Cleaner.REGEX_SET, "Unable to retrieve regex set.")) {
			if (obj == null) {
				continue;
			}
			final Pattern p;
			try {
				p = Pattern.compile(obj.toString().trim());
			} catch (final PatternSyntaxException pse) {
				LogFactory.getLog(this.getClass()).warn("Pattern '" + obj.toString() + "' is invalid.", pse);
				continue;
			}
			this.regexPropsToRemove.add(p);
		}

		// rdf type boolean
		boolean b;
		try {
			final Boolean flag = bh.getBean(Cleaner.REMOVE_TYPE, Boolean.class);
			b = flag != null && flag.booleanValue();
		} catch (final WebLabUncheckedException wlue) {
			LogFactory.getLog(this.getClass()).warn("Unable to retrieve boolean.", wlue);
			b = false;
		}
		this.removeRDFTypeOnURI = b;

		this.initialised = true;
	}

	/**
	 * Retrieves a <code>Set</code> bean from the configuration.
	 * 
	 * @param bh
	 *            The helper giving access to the beans of the BEAN file.
	 * @param beanName
	 *            The name of the <code>Set</code> bean to be retrieved.
	 * @param failureMessage
	 *            The warning to be logged if the bean cannot be retrieved.
	 * @return The configured <code>Set</code>, or an empty one if it cannot be retrieved or is <code>null</code>.
	 */
	private Set<?> loadSet(final BeanHelper bh, final String beanName, final String failureMessage) {
		Set<?> loaded;
		try {
			loaded = bh.getBean(beanName, Set.class);
		} catch (final WebLabUncheckedException wlue) {
			LogFactory.getLog(this.getClass()).warn(failureMessage, wlue);
			loaded = null;
		}
		if (loaded == null) {
			return new HashSet<String>();
		}
		return loaded;
	}
}
