/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.crawler;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ow2.weblab.content.ContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.exception.WebLabUncheckedException;
import org.ow2.weblab.core.extended.factory.ResourceFactory;
import org.ow2.weblab.core.model.ComposedResource;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.processing.WProcessingAnnotator;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.purl.dc.terms.DCTermsAnnotator;

/**
 * Use this component to crawl a folder. This is a basic component: no thread,
 * no complex timings, no data comparison. A real crawler could use multiple
 * instances of this component.
 * 
 * Usage: call {@link #startCrawl()} to list the files of the folder, then call
 * {@link #getCrawledDocuments(int, int)} (possibly several times with
 * increasing offsets) to retrieve the listed files as WebLab documents.
 * 
 * Accesses to the internal list of crawled files made by this class are
 * synchronised on a private lock.
 * 
 * @author EADS DS
 */
public class FolderCrawler {

	/** Identifier used as first argument when creating resources through the ResourceFactory. */
	protected static final String CRAWLER_ID = "crawlerFolder";

	/** Identifier reserved for content URIs; not used by the current implementation. */
	protected static final String CRAWLER_CONTENT_ID = "crawlerFolderContent";

	/**
	 * Date format kept for subclasses. SimpleDateFormat is not thread-safe:
	 * callers sharing this instance between threads must synchronise on it.
	 */
	protected static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");

	private static final Log LOG = LogFactory.getLog(FolderCrawler.class);

	/** Default folder filter: accepts every directory. */
	private static final FileFilter FOLDER_FILTER = new FileFilter() {

		public boolean accept(final File file) {
			return file.isDirectory();
		}
	};

	/** Default file filter: accepts every regular file. */
	private static final FileFilter FILE_FILTER = new FileFilter() {

		public boolean accept(final File file) {
			return file.isFile();
		}
	};

	protected final ContentManager contentManager;

	protected final File folder;

	protected final FileFilter fileFilter;

	protected final FileFilter folderFilter;

	protected final int bufferSize = 10000;

	protected final boolean recursiveMode;

	/** Files found by startCrawl; always accessed while holding {@link #lock}. */
	private final List<File> crawledFiles = new ArrayList<File>();

	private final byte[] lock = new byte[0];

	// NOTE: the former "exposedAs" configuration (exposedRoot / exposedAsUri
	// properties) and the mime type detection have been removed from this
	// component; this part should be revised before being reintroduced.

	/**
	 * Creates a crawler on the given folder.
	 * 
	 * @param contentManager
	 *            The content manager
	 * @param folder
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @param folderFilter
	 *            A filter on the folder
	 * @throws WebLabCheckedException
	 *             If the content manager is <code>null</code> or if the folder
	 *             is <code>null</code>, does not exist, is a file or is not
	 *             readable.
	 */
	public FolderCrawler(final ContentManager contentManager, final File folder, final FileFilter fileFilter, final boolean recursiveMode,
			final FileFilter folderFilter) throws WebLabCheckedException {
		super();
		if (contentManager == null) {
			throw new WebLabCheckedException("Content manager must be well instanciated.");
		}
		if (folder == null) {
			// Fail with the declared exception rather than a NullPointerException below.
			throw new WebLabCheckedException("Folder to crawl must not be null.");
		}
		if (!folder.exists() || folder.isFile() || !folder.canRead()) {
			throw new WebLabCheckedException("Folder to crawl '" + folder.getAbsolutePath() + "' is unvalid.");
		}
		this.contentManager = contentManager;
		this.folder = folder;
		this.recursiveMode = recursiveMode;
		this.fileFilter = fileFilter;
		this.folderFilter = folderFilter;
	}

	/**
	 * Creates a crawler on the given folder, accepting every sub-directory
	 * when the recursive mode is enabled.
	 * 
	 * @param contentManager
	 *            The content manager
	 * @param folder
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @throws WebLabCheckedException
	 *             If the content manager is <code>null</code> or if the folder
	 *             is not a valid readable directory.
	 */
	public FolderCrawler(final ContentManager contentManager, final File folder, final FileFilter fileFilter, final boolean recursiveMode)
			throws WebLabCheckedException {
		this(contentManager, folder, fileFilter, recursiveMode, FolderCrawler.FOLDER_FILTER);
	}

	/**
	 * Creates a non recursive crawler on the given folder, using the content
	 * manager returned by <code>ContentManager.getInstance()</code>.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @throws WebLabCheckedException
	 *             If the content manager is <code>null</code> or if the folder
	 *             is not a valid readable directory.
	 */
	public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter) throws WebLabCheckedException {
		this(folderToCrawl, fileFilter, false, FolderCrawler.FOLDER_FILTER);
	}

	/**
	 * Creates a crawler on the given folder, using the content manager
	 * returned by <code>ContentManager.getInstance()</code> and accepting
	 * every sub-directory when the recursive mode is enabled.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @throws WebLabCheckedException
	 *             If the content manager is <code>null</code> or if the folder
	 *             is not a valid readable directory.
	 */
	public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter, final boolean recursiveMode)
			throws WebLabCheckedException {
		this(ContentManager.getInstance(), new File(folderToCrawl), fileFilter, recursiveMode, FolderCrawler.FOLDER_FILTER);
	}

	/**
	 * Creates a crawler on the given folder, using the content manager
	 * returned by <code>ContentManager.getInstance()</code>.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @param folderFilter
	 *            The folder filter to be used
	 * @throws WebLabCheckedException
	 *             If the content manager is <code>null</code> or if the folder
	 *             is not a valid readable directory.
	 */
	public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter, final boolean recursiveMode,
			final FileFilter folderFilter) throws WebLabCheckedException {
		this(ContentManager.getInstance(), new File(folderToCrawl), fileFilter, recursiveMode, folderFilter);
	}

	/**
	 * Creates a non recursive crawler on the given folder, accepting every
	 * regular file it contains.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @throws WebLabCheckedException
	 *             If the content manager is <code>null</code> or if the folder
	 *             is not a valid readable directory.
	 */
	public FolderCrawler(final String folderToCrawl) throws WebLabCheckedException {
		this(folderToCrawl, FolderCrawler.FILE_FILTER);
	}

	/**
	 * @return The number of files currently in the list of crawled files.
	 */
	public int getNbFiles() {
		synchronized (this.lock) {
			return this.crawledFiles.size();
		}
	}

	/**
	 * Crawls the folder using the file filter and fills the crawled files
	 * list. Files that are already in the list are not added twice.
	 * 
	 * @throws WebLabUncheckedException
	 *             If the folder or the file filter is <code>null</code>.
	 */
	public void startCrawl() {
		if (this.folder == null || this.fileFilter == null) {
			throw new WebLabUncheckedException("Folder to crawl and file filter " + "should have been defined previously.");
		}
		synchronized (this.lock) {
			this.listAndAddFiles(this.folder);
			// The list is read under the lock, like every other access to it.
			FolderCrawler.LOG.info(this.crawledFiles.size() + " crawled files in FolderCrawler: " + this.toString());
			if (FolderCrawler.LOG.isDebugEnabled()) {
				FolderCrawler.LOG.debug("Crawled files: " + this.crawledFiles);
			}
		}
	}

	/**
	 * Adds to the crawled files list the files of the given folder accepted by
	 * the file filter and, in recursive mode, does the same for each
	 * sub-folder accepted by the folder filter. Nothing is done if the
	 * parameter is not a directory.
	 * 
	 * @param newFolder
	 *            The folder to be crawled
	 */
	protected void listAndAddFiles(final File newFolder) {
		if (!newFolder.isDirectory()) {
			return;
		}
		FolderCrawler.LOG.debug("Add content of folder: " + newFolder.getAbsolutePath());

		// File.listFiles returns null when the folder cannot be listed (I/O
		// error, folder removed or made unreadable since the test above).
		final File[] files = newFolder.listFiles(this.fileFilter);
		if (files == null) {
			FolderCrawler.LOG.warn("Unable to list content of folder: " + newFolder.getAbsolutePath());
			return;
		}

		final boolean trace = FolderCrawler.LOG.isTraceEnabled();
		for (final File file : files) {
			if (!this.crawledFiles.contains(file)) {
				if (trace) {
					FolderCrawler.LOG.trace("Add file: " + file.getAbsolutePath());
				}
				this.crawledFiles.add(file);
			}
		}

		if (this.recursiveMode) {
			final File[] subFolders = newFolder.listFiles(this.folderFilter);
			if (subFolders == null) {
				FolderCrawler.LOG.warn("Unable to list sub-folders of folder: " + newFolder.getAbsolutePath());
				return;
			}
			for (final File dir : subFolders) {
				this.listAndAddFiles(dir);
			}
		}
	}

	/**
	 * Builds a collection of documents from the crawled files list, starting
	 * at <code>offset</code> and containing at most <code>limit</code>
	 * documents. Files that no longer exist, are no longer regular files or
	 * are no longer readable are removed from the crawled files list and
	 * skipped.
	 * 
	 * @param offset
	 *            the starting point in the collection. If negative, 0 is used.
	 * @param limit
	 *            the maximum number of documents. If negative or zero,
	 *            Integer.MAX_VALUE is used.
	 * @return A resource collection; it is empty if no file has been crawled
	 *         or if the offset is beyond the end of the crawled files list.
	 * @throws WebLabUncheckedException
	 *             If a file cannot be opened or if the content manager fails
	 *             to save its content.
	 */
	public ComposedResource getCrawledDocuments(final int offset, final int limit) {
		int theOffset = offset;
		int theLimit = limit;

		synchronized (this.lock) {
			final long time = System.currentTimeMillis();
			final ComposedResource col = ResourceFactory.createResource(FolderCrawler.CRAWLER_ID, "tempCollection-" + time, ComposedResource.class);

			if (this.crawledFiles.isEmpty()) {
				FolderCrawler.LOG.warn("Either you haven't done a startCrawl before or folder (" + this.folder + ") was empty.");
				return col;
			}

			if (theOffset >= this.crawledFiles.size()) {
				FolderCrawler.LOG.warn("Every files have already been crawled.");
				return col;
			}

			if (theOffset < 0) {
				FolderCrawler.LOG.warn("Offset was negative, 0 used instead.");
				theOffset = 0;
			}

			if (theLimit <= 0) {
				FolderCrawler.LOG.info("Limit was null or negative. Integer.MAX_VALUE will be used.");
				theLimit = Integer.MAX_VALUE;
			}

			int cpt = theOffset;
			while (cpt < this.crawledFiles.size() && cpt - theOffset < theLimit) {
				final File file = this.crawledFiles.get(cpt);
				if (!FolderCrawler.isCrawlable(file)) {
					/*
					 * If the file changed of status between startCrawl and
					 * getCrawledDocuments, we remove it from the list and
					 * continue the loop. The index is not incremented since
					 * the next file took the place of the removed one.
					 */
					this.crawledFiles.remove(cpt);
					FolderCrawler.LOG.warn("File (" + file + ") is not crawlable");
					continue;
				}
				col.getResource().add(this.createDocument(file, cpt));
				cpt++;
			}
			FolderCrawler.LOG.info((this.crawledFiles.size() - cpt) + " files remaining in foldercrawler " + this.toString());
			return col;
		}
	}

	/**
	 * @param file
	 *            The file to test
	 * @return <code>true</code> if the file exists, is a regular file and is
	 *         readable.
	 */
	private static boolean isCrawlable(final File file) {
		return file.exists() && file.isFile() && file.canRead();
	}

	/**
	 * Creates the document of a crawled file: saves the content of the file
	 * through the content manager and annotates the document.
	 * 
	 * @param file
	 *            The file to load
	 * @param index
	 *            The index of the file in the crawled files list, used to
	 *            build the identifier of the document
	 * @return The annotated document
	 * @throws WebLabUncheckedException
	 *             If the file cannot be opened or if the content manager fails
	 *             to save its content.
	 */
	private Document createDocument(final File file, final int index) {
		final Document document = ResourceFactory.createResource(FolderCrawler.CRAWLER_ID, "file" + index, Document.class);

		FolderCrawler.LOG.debug("Loading file: " + file.getAbsolutePath());
		final FileInputStream stream;
		try {
			stream = new FileInputStream(file);
		} catch (final FileNotFoundException e) {
			throw new WebLabUncheckedException("Cannot create an InputStream on file ["+file+"].",e);
		}
		try {
			// The native content annotation is added by the ContentManager.
			this.contentManager.saveNativeContent(stream, document);
		} catch (final WebLabCheckedException wlce) {
			throw new WebLabUncheckedException("Unexpected error with content manager.", wlce);
		} finally {
			// Always release the file handle, otherwise one descriptor is
			// leaked for each crawled file.
			try {
				stream.close();
			} catch (final IOException ioe) {
				FolderCrawler.LOG.warn("Unable to close stream on file: " + file.getAbsolutePath(), ioe);
			}
		}

		this.annotate(document, file);
		return document;
	}

	/**
	 * Writes on the document the WebLab processing, Dublin Core and Dublin
	 * Core Terms annotations describing the file.
	 * 
	 * @param document
	 *            The document to annotate
	 * @param file
	 *            The file the document has been created from
	 */
	private void annotate(final Document document, final File file) {
		String path;
		try {
			path = file.getCanonicalPath();
		} catch (final IOException ioe) {
			FolderCrawler.LOG.warn("Unable to get canonical path of file: " + file.getAbsolutePath() + "; absolute path will be used instead.");
			path = file.getAbsolutePath();
		}

		// Add WebLab Processing Annotations
		final WProcessingAnnotator wpa = new WProcessingAnnotator(document);
		wpa.writeGatheringDate(new Date());
		wpa.writeOriginalFileName(file.getName());
		wpa.writeOriginalFileSize(file.length());

		// Add Dublin Core Annotation
		final DublinCoreAnnotator dca = new DublinCoreAnnotator(document);
		dca.writeSource(path);

		// Add Dublin Core Terms Annotations
		final DCTermsAnnotator dcta = new DCTermsAnnotator(document);
		dcta.writeExtent(file.length());
		dcta.writeModified(new Date(file.lastModified()));

		// TODO: the "exposedAs" annotation and the mime type (Dublin Core
		// format) annotation are no longer written here; to be revised.
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		return "Folder to crawl: '" + this.folder.getAbsolutePath() + "'.";
	}
}
