/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2012 Cassidian, an EADS company
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.service.language;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import de.spieleck.app.cngram.NGram;
import de.spieleck.app.cngram.NGramMetric;
import de.spieleck.app.cngram.NGramProfile;
import de.spieleck.app.cngram.NGramProfileImpl;

/**
 * This is a patched version of NGramProfiles class from CNGram project.
 * 
 * It has been added to enable the use of custom language models. With the default version, it is only possible to load language models that are present
 * in the jar, next to the NGramProfiles
 * class. We only copy-pasted the code and added the required behaviour through a new constructor and an init method working on a folder (containing language models). We also
 * removed some warnings by changing the access
 * rights on some fields and qualifying accesses with this. We also handle generics when possible. And last but not least, we changed the code of the original loading
 * method to use the same ClassLoader as
 * before (i.e. the original NGramProfiles one).
 * 
 * "Manage a set of profiles and determine "most similar" ones to a given profile. Allows access to the complete results of previous last ranking. Note this
 * uses a competetive ranking approach, which is memory efficient, time efficient for not too many languages and provides contextual scoring of ngrams."
 * 
 * @author Inspired from frank nestel
 * @author EADS IPCC Team
 * @see de.spieleck.app.cngram.NGramProfiles
 */
public class NGramProfilesPatched {


	/** Name reported when no language applies: out-of-range index in {@link #getProfileName(int)} and the extra "remainder" slot of a rank result. */
	public final static String NOLANGNAME = "--";


	/** Character returned by {@link #charAt(CharSequence, int)} when the requested position is past the end of the sequence. */
	public final static char END_CHAR = (char) 0;


	/**
	 * Signed two-decimal number format. It is not used inside this class.
	 * NOTE(review): java.text.DecimalFormat instances are documented as not synchronized, so sharing this constant between threads needs external locking.
	 */
	public final static DecimalFormat DF = new DecimalFormat("+0.00;-0.00");


	/** Amount subtracted from every n-gram count in {@link #getRanker()}; n-grams whose count is not above this value are ignored there. */
	public final static double LOWSTATSAFETY = 0.5; // was 0.5;



	/** Loaded language profiles, in loading order. The list instance is also the lock used by {@link #getRanker()}. */
	List<NGramProfile> profiles = null;


	/** Union of the n-grams of every loaded profile. */
	private final HashSet<NGram> allNGrams = new HashSet<NGram>(10000);


	/** Length of the longest n-gram seen while loading profiles; stays at -1 until an n-gram has been loaded. */
	private int maxLen = -1;


	/** Root of the lookup trie over reversed n-grams; built lazily by {@link #getRanker()} and reset to null by the init methods. */
	Trie myTrie = null;


	/** Score table indexed as vals[n-gram id][profile index]; filled by {@link #getRanker()} together with {@link #myTrie}. */
	float[][] vals;


	/** Selects the formula used by modeTrans. Only the int constructor assigns it; otherwise it keeps the default value 0. */
	private int mode;


	/**
	 * Creates the profiles through {@link #NGramProfilesPatched(int)} with mode 1.
	 *
	 * @throws IOException
	 *             If the delegated constructor fails to read the profiles.
	 */
	public NGramProfilesPatched() throws IOException {
		this(1);
	}


	public NGramProfilesPatched(final int mode) throws IOException {
		final InputStream ip = NGramProfile.class.getResourceAsStream("profiles.lst");
		final BufferedReader br = new BufferedReader(new InputStreamReader(ip));
		this.mode = mode;
		this.init(br);
	}


	/**
	 * Loads the language profiles whose names are listed, one per line, by the given reader. The profiles themselves are still read from resources located
	 * next to {@link NGramProfile}.
	 *
	 * The mode field is not assigned by this constructor, so it keeps its default value 0. The reader is not closed here.
	 *
	 * @param br
	 *            Reader over the list of profile names.
	 * @throws IOException
	 *             If the list or one of the profiles cannot be read.
	 */
	public NGramProfilesPatched(final BufferedReader br) throws IOException {
		this.init(br);
	}


	/**
	 * Reads a list of profile names (one per line) and loads each matching profile resource located next to {@link NGramProfile}.
	 *
	 * Lines starting with '#' are comments; blank lines are skipped. Every n-gram of every profile is added to the global n-gram set, and the cached trie
	 * is dropped so that it gets rebuilt on the next call to getRanker.
	 *
	 * @param br
	 *            Reader over the list of profile names. It is not closed by this method.
	 * @throws IOException
	 *             If the list cannot be read, or if a listed profile resource is missing or unreadable.
	 */
	private void init(final BufferedReader br) throws IOException {
		this.profiles = new ArrayList<NGramProfile>();
		String line;
		while ((line = br.readLine()) != null) {
			// An empty line used to crash on charAt(0); blank lines and comments are simply ignored.
			if ((line.trim().length() == 0) || (line.charAt(0) == '#')) {
				continue;
			}
			final String resource = line + "." + NGramProfile.NGRAM_PROFILE_EXTENSION;
			final InputStream is = NGramProfile.class.getResourceAsStream(resource);
			if (is == null) {
				throw new IOException("Unable to find language profile resource " + resource);
			}
			final NGramProfileImpl np = new NGramProfileImpl(line);
			try {
				np.load(is);
			} finally {
				// The stream was opened here, so it is released here whatever load does with it.
				is.close();
			}
			this.profiles.add(np);
			final Iterator<?> iter = np.getSorted();
			while (iter.hasNext()) {
				final NGram ng = (NGram) iter.next();
				if (ng.length() > this.maxLen) {
					this.maxLen = ng.length();
				}
				this.allNGrams.add(ng);
			}
		}
		this.myTrie = null;
	}


	/**
	 * Creates a new, independent ranker working on the loaded profiles.
	 *
	 * The trie and the score table shared by all rankers are built lazily on the first call (under a lock on the profile list). Each returned ranker holds
	 * its own score arrays.
	 *
	 * @return A fresh ranker, already reset.
	 */
	public Ranker getRanker() {
		synchronized (this.profiles) {
			if (this.myTrie == null) {
				this.buildScoreTable();
			}
		}

		return new Ranker() {


			/** Initial score of the extra "no language" slot (last index of rscore). */
			protected static final double DEFAULT_MAX_RSCORE = 0.5;


			/** Running score per profile for the text seen since the last flush; one extra trailing slot that is never incremented. */
			private final double[] score = new double[NGramProfilesPatched.this.profiles.size() + 1];


			/** Accumulated, flushed score per profile; the trailing slot holds the "no language" share. */
			private final double[] rscore = new double[NGramProfilesPatched.this.profiles.size() + 1];


			/** True when score has been folded into rscore and no n-gram has matched since. */
			private boolean flushed = false;

			{
				this.reset();
			}


			/**
			 * Flushes pending scores and returns the share of each profile in the total accumulated score (the "no language" slot takes part in the total).
			 */
			@Override
			public RankResult getRankResult() {
				this.flush();
				final double[] pscore = new double[NGramProfilesPatched.this.profiles.size()];
				double sum = 0.0;
				// "<=" on purpose: the total includes the trailing "no language" slot.
				for (int i = 0; i <= NGramProfilesPatched.this.profiles.size(); i++) {
					sum += this.rscore[i];
				}
				for (int i = 0; i < NGramProfilesPatched.this.profiles.size(); i++) {
					pscore[i] = this.rscore[i] / sum;
				}
				return new SimpleRankResult(pscore, true);
			}


			/** Clears both score arrays and restores the initial "no language" score. */
			@Override
			public void reset() {
				Arrays.fill(this.rscore, 0.0);
				Arrays.fill(this.score, 0.0);
				this.rscore[this.score.length - 1] = DEFAULT_MAX_RSCORE;
			}


			/**
			 * Folds the running scores into the accumulated ones: every score above half of the current maximum contributes, scaled so that the maximum
			 * contributes 1.0. Running scores are then halved rather than cleared.
			 */
			@Override
			public void flush() {
				if (!this.flushed) {
					this.flushed = true;
					double maxValue = -1.0;
					for (final double element : this.score) {
						maxValue = Math.max(maxValue, element);
					}
					final double limit = maxValue / 2.0;
					final double f = 1.0 / (maxValue - limit);
					for (int i = 0; i < this.score.length; i++) {
						final double delta = this.score[i] - limit;
						if (delta > 0.0) {
							this.rscore[i] += delta * f;
						}
						// We do not reset to zero, this makes classification contextual
						this.score[i] /= 2.0;
					}
				}
			}


			/**
			 * Accounts for every known n-gram ending at the given position: the text is walked backwards from pos through the trie of reversed n-grams.
			 * Characters are lower-cased and separators are mapped to a blank; the position before the start of the text counts as a blank.
			 */
			@Override
			public void account(final CharSequence seq, final int pos) {
				Trie currentNode = NGramProfilesPatched.this.myTrie;
				int p2 = pos;
				while (currentNode != null) {
					char ch;
					if (p2 == -1) {
						ch = ' ';
					} else {
						ch = Character.toLowerCase(seq.charAt(p2));
						if (NGramProfilesPatched.isSeparator(ch)) {
							ch = ' ';
						}
					}
					final Trie t2 = currentNode.subtree(ch);
					if (t2 == null) {
						break;
					}
					if (t2.id >= 0) {
						// A complete n-gram ends here: add its weight for every profile.
						this.flushed = false;
						for (int i = 0; i < NGramProfilesPatched.this.profiles.size(); i++) {
							this.score[i] += NGramProfilesPatched.this.vals[t2.id][i];
						}
					}
					if (p2-- == -1) {
						break;
					}
					currentNode = t2.center;
				}
				// Flush at separators once some profile has gathered a running score above 1.0.
				final char startChar = seq.charAt(pos);
				final boolean startSep = NGramProfilesPatched.isSeparator(startChar);
				double max = 0.0;
				for (final double element : this.score) {
					max = Math.max(max, element);
				}
				if (startSep && (max > 1.0)) {
					this.flush();
				}
			}


			/** Accounts for every position of the given text. */
			@Override
			public void account(final CharSequence seq) {
				for (int i = 0; i < seq.length(); i++) {
					this.account(seq, i);
				}
			}


			/** Accounts for the whole content of the reader, line by line. The reader is not closed. */
			@Override
			public void account(final Reader reader) throws IOException {
				BufferedReader br;
				if (reader instanceof BufferedReader) {
					br = (BufferedReader) reader;
				} else {
					br = new BufferedReader(reader);
				}
				String line;
				while ((line = br.readLine()) != null) {
					this.account(line);
				}
			}
		};
	}


	/**
	 * Builds the trie over the reversed n-grams and the matching score table (one row per n-gram, one column per profile).
	 *
	 * Must be called while holding the lock on the profile list. The trie field is assigned last, so that a failure during the computation leaves the
	 * cache empty and the build is attempted again on the next call.
	 */
	private void buildScoreTable() {
		final int profileCount = this.profiles.size();
		// Create a reverse reference of all strings which makes it easy to create reverse Trie's
		final String[] ngs = new String[this.allNGrams.size()];
		int j = 0;
		for (final NGram ng : this.allNGrams) {
			ngs[j++] = NGramProfilesPatched.reverse(ng);
		}
		Arrays.sort(ngs);
		// Create Strings in correct order but sorted from reverse end.
		final String[] ng1 = new String[ngs.length];
		for (int i = 0; i < ngs.length; i++) {
			ng1[i] = NGramProfilesPatched.reverse(ngs[i]);
		}
		final float[][] table = new float[ngs.length][profileCount];
		for (int k = 0; k < profileCount; k++) {
			final NGramProfile ngp = this.profiles.get(k);
			// Per n-gram length: sum of the corrected counts and number of contributing n-grams of this profile.
			final double[] norm = new double[this.maxLen + 1];
			final int[] count = new int[this.maxLen + 1];
			for (int i = 0; i < ngs.length; i++) {
				final NGram ng = ngp.get(ng1[i]);
				if ((ng != null) && (ng.getCount() > NGramProfilesPatched.LOWSTATSAFETY)) {
					final int ngl = ng.length();
					final double raw1 = ng.getCount() - NGramProfilesPatched.LOWSTATSAFETY;
					count[ngl]++;
					norm[ngl] += raw1;
					table[i][k] = (float) raw1;
				}
			}
			for (int i = 1; i <= this.maxLen; i++) {
				// For a length without any n-gram this yields NaN, but such an entry is never read below.
				norm[i] *= (1.0 + count[i]) / count[i];
				norm[i] += 1.0;
			}
			for (int i = 0; i < ngs.length; i++) {
				final NGram ng = ngp.get(ng1[i]);
				if ((ng != null) && (ng.getCount() > 0)) {
					final int ngl = ng.length();
					final double trans = table[i][k] / norm[ngl];
					table[i][k] = (float) trans;
				}
			}
		}
		// Horizontal additive zero sum + nonlinear weighting
		for (int i = 0; i < ngs.length; i++) {
			double sum = 0.0;
			for (int k = 0; k < profileCount; k++) {
				sum += table[i][k];
			}
			final double av = sum / profileCount;
			/*
			 * Assumed minimum amount of score for significance. XXX Heuristics for the following constant: Higher means faster and less noise.
			 * Lower means better adaption to mixed language text
			 */
			final double n = (this.modeTrans(av, ng1[i].length()) / av / 100.0) * (-Math.log(av));
			for (int k = 0; k < profileCount; k++) {
				table[i][k] = (float) ((table[i][k] - av) * n);
			}
		}
		this.vals = table;
		this.myTrie = NGramProfilesPatched.createTrie(ngs, 0, 0, ngs.length);
	}


	/**
	 * Applies the transformation selected by the mode field to an average score.
	 *
	 * Modes 1, 7, 8, 9 and 10 use the exponent 1/(l+1), modes 2 to 6 use 1/l. Any mode outside 1..10 returns x unchanged, as do modes 1 and 10 when l is 1.
	 *
	 * @param x
	 *            The value to transform.
	 * @param l
	 *            The length of the n-gram the value belongs to.
	 * @return The transformed value.
	 */
	private double modeTrans(final double x, final int l) {
		final int selected = this.mode;
		if ((selected < 1) || (selected > 10)) {
			return x;
		}
		final boolean firstFamily = (selected == 1) || (selected == 10);
		if (firstFamily && (l == 1)) {
			return x;
		}
		final double exponent;
		if (firstFamily || (selected >= 7)) {
			exponent = 1.0 / (l + 1);
		} else {
			exponent = 1.0 / l;
		}
		switch (selected) {
			case 1:
			case 2:
			case 10:
				return Math.pow(x / exponent, exponent);
			case 3:
				return Math.pow(x, exponent);
			case 4:
				return Math.pow(x * exponent, exponent);
			case 5:
			case 7:
				return Math.pow(x, exponent) / exponent;
			default:
				// Remaining modes: 6, 8 and 9.
				return Math.pow(x, exponent) / Math.sqrt(exponent);
		}
	}


	/**
	 * Gives the name of the profile stored at the given index.
	 *
	 * @param i
	 *            Index of the profile.
	 * @return The profile name, or {@link #NOLANGNAME} when the index is negative or not below the number of profiles.
	 */
	public String getProfileName(final int i) {
		final boolean known = (i >= 0) && (i < this.profiles.size());
		return known ? this.profiles.get(i).getName() : NGramProfilesPatched.NOLANGNAME;
	}


	/**
	 * Tells whether a character separates words: anything up to the blank character, any whitespace, any digit, or one of <code>.!?:,;</code>.
	 *
	 * @param ch
	 *            The character to test.
	 * @return true if the character is a separator.
	 */
	static boolean isSeparator(final char ch) {
		if ((ch <= ' ') || Character.isWhitespace(ch)) {
			return true;
		}
		if (Character.isDigit(ch)) {
			return true;
		}
		return ".!?:,;".indexOf(ch) != -1;
	}


	/**
	 * Reverses a sequence char by char.
	 *
	 * The chars are appended from the last to the first, which is linear, whereas inserting at index 0 shifted the whole buffer at every step.
	 * StringBuilder.reverse() is deliberately not used: it keeps surrogate pairs in order, while the trie is walked strictly char by char.
	 *
	 * @param seq
	 *            The sequence to reverse.
	 * @return A new String containing the chars of seq in reverse order.
	 */
	private static String reverse(final CharSequence seq) {
		final int length = seq.length();
		final StringBuilder sb = new StringBuilder(length);
		for (int i = length - 1; i >= 0; i--) {
			sb.append(seq.charAt(i));
		}
		return sb.toString();
	}


	/**
	 * Recursively builds a ternary search trie over the slice [start, end) of a sorted array, looking at the character at index pos.
	 *
	 * @param array
	 *            The sorted strings to index.
	 * @param pos
	 *            The character position handled by the created node.
	 * @param start
	 *            First index of the slice (inclusive).
	 * @param end
	 *            Last index of the slice (exclusive).
	 * @return The node covering the slice, or null if the slice is empty.
	 */
	private static Trie createTrie(final String[] array, final int pos, final int start, final int end) {
		if (end <= start) {
			return null;
		}
		final int middle = (start + end) / 2;
		final char pivot = array[middle].charAt(pos);
		final Trie node = new Trie();
		node.split = pivot;

		// Widen [lower, upper) around the middle to cover every string having the pivot at pos.
		int upper = middle;
		while ((upper < end) && (NGramProfilesPatched.charAt(array[upper], pos) == pivot)) {
			upper++;
		}
		int lower = middle;
		while ((lower > start) && (NGramProfilesPatched.charAt(array[lower - 1], pos) == pivot)) {
			lower--;
		}

		// A string ending exactly at pos becomes the id of this node and is excluded from the center subtree.
		final boolean endsHere = array[lower].length() == (pos + 1);
		if (endsHere) {
			node.id = lower;
		}
		final int centerStart = endsHere ? lower + 1 : lower;

		node.center = NGramProfilesPatched.createTrie(array, pos + 1, centerStart, upper);
		node.left = NGramProfilesPatched.createTrie(array, pos, start, lower);
		node.right = NGramProfilesPatched.createTrie(array, pos, upper, end);
		return node;
	}


	/**
	 * Reads a character without failing past the end of the sequence.
	 *
	 * @param cs
	 *            The sequence to read.
	 * @param pos
	 *            The position to read.
	 * @return The character at pos, or {@link #END_CHAR} when pos is not below the length of the sequence.
	 */
	public final static char charAt(final CharSequence cs, final int pos) {
		return (pos >= cs.length()) ? NGramProfilesPatched.END_CHAR : cs.charAt(pos);
	}


	/**
	 * Node of a ternary search trie: siblings are ordered by their split character (left is smaller, right is greater), center leads to the next
	 * character position.
	 */
	private final static class Trie {


		/** Character handled by this node. */
		char split;


		/** Sibling subtree for smaller characters. */
		Trie left;


		/** Sibling subtree for greater characters. */
		Trie right;


		/** Subtree for the next character position. */
		Trie center;


		/** Identifier attached to this node, or -1 when there is none. */
		int id = -1;


		public Trie() {
			super();
		}


		/**
		 * Searches, among this node and its left/right siblings, the node whose split character is c.
		 *
		 * @param c
		 *            The character to look for.
		 * @return The matching node, or null if there is none.
		 */
		public Trie subtree(final char c) {
			Trie node = this;
			while (node != null) {
				if (node.split == c) {
					return node;
				}
				node = (c > node.split) ? node.right : node.left;
			}
			return null;
		}
	}


	/**
	 * Compares a profile with every loaded profile using the given metric.
	 *
	 * Note that this method returns a complete match result of its own, for the sake of thread safety.
	 *
	 * @param metric
	 *            The metric used to compute the difference between two profiles.
	 * @param profile
	 *            The profile to compare with the loaded ones.
	 * @return The loaded profiles and their difference to profile, smallest difference first.
	 */
	public RankResult rank(final NGramMetric metric, final NGramProfile profile) {
		final int profileCount = this.profiles.size();
		final double[] scores = new double[profileCount];
		for (int i = 0; i < profileCount; i++) {
			scores[i] = metric.diff(profile, this.profiles.get(i));
		}
		return new SimpleRankResult(scores, false);
	}


	/**
	 * Rank result holding a sorted copy of the scores together with the matching profiles.
	 */
	private class SimpleRankResult implements RankResult {


		/** Scores, sorted. */
		private final double[] scores;


		/** Profiles, in the same order as the scores. */
		private final NGramProfile[] profs;


		/** 1.0 minus the sum of all the scores. */
		private double remain;


		/**
		 * Copies and sorts the scores with an insertion sort; the caller's array is left untouched.
		 *
		 * @param scorex
		 *            One score per loaded profile, in profile order.
		 * @param inverse
		 *            If false the smallest score comes first, if true the greatest score comes first.
		 */
		public SimpleRankResult(final double[] scorex, final boolean inverse) {
			this.scores = new double[scorex.length];
			System.arraycopy(scorex, 0, this.scores, 0, scorex.length);
			this.profs = new NGramProfile[this.scores.length];
			this.remain = 1.0;
			for (int i = 0; i < this.scores.length; i++) {
				final NGramProfile prof = NGramProfilesPatched.this.profiles.get(i);
				final double m = this.scores[i];
				this.remain -= m;
				// Shift already sorted entries to the right until the slot of the current one is found.
				int j = i;
				while ((--j >= 0) && (inverse ^ (m < this.scores[j]))) {
					this.scores[j + 1] = this.scores[j];
					this.profs[j + 1] = this.profs[j];
				}
				this.scores[j + 1] = m;
				this.profs[j + 1] = prof;
			}
		}


		@Override
		public NGramProfilesPatched getProfiles() {
			return NGramProfilesPatched.this;
		}


		/**
		 * @param pos
		 *            Rank to read. A negative value counts from the end; the value {@link #getLength()} designates the remainder.
		 * @return The score at the given rank.
		 */
		@Override
		public double getScore(final int pos) {
			if (pos == this.getLength()) {
				return this.remain;
			}
			int p = pos;
			if (p < 0) {
				p += this.getLength();
			}
			return this.scores[p];
		}


		/**
		 * @param pos
		 *            Rank to read. A negative value counts from the end; the value {@link #getLength()} designates the "no language" entry.
		 * @return The name of the profile at the given rank.
		 */
		@Override
		public String getName(final int pos) {
			if (pos == this.getLength()) {
				return NGramProfilesPatched.NOLANGNAME;
			}
			int p = pos;
			if (p < 0) {
				p += this.getLength();
			}
			return this.profs[p].getName();
		}


		@Override
		public int getLength() {
			return this.profs.length;
		}


		/**
		 * Lists every name and score in rank order. The closing bracket is always written, including when the result is empty.
		 */
		@Override
		public String toString() {
			final StringBuilder sb = new StringBuilder();
			sb.append("SimpleRankResult: [");
			final int length = this.getLength();
			for (int p = 0; p < length; p++) {
				if (p > 0) {
					sb.append(" --|-- ");
				}
				sb.append(this.getName(p));
				sb.append(" - ");
				sb.append(this.getScore(p));
			}
			sb.append("]");
			return sb.toString();
		}
	}


	/**
	 * @return The number of loaded language profiles.
	 */
	public int getProfileCount() {
		return this.profiles.size();
	}


	/**
	 * @return The set of n-grams gathered from every loaded profile. This is the internal set itself, not a copy, so changes made by the caller affect
	 *         this object.
	 */
	public Set<NGram> getAllNGrams() {
		return this.allNGrams;
	}


	/**
	 * Result of a ranking: a list of profile names with their scores.
	 */
	public interface RankResult {


		/**
		 * @return The profiles object this result was computed from.
		 */
		NGramProfilesPatched getProfiles();


		/**
		 * @return The number of ranked profiles.
		 */
		int getLength();


		/**
		 * @param pos
		 *            The rank to read.
		 * @return The score at the given rank.
		 */
		double getScore(int pos);


		/**
		 * @param pos
		 *            The rank to read.
		 * @return The name of the profile at the given rank.
		 */
		String getName(int pos);
	}


	/**
	 * Accumulates scores over pieces of text and produces a rank result.
	 */
	public interface Ranker {


		/**
		 * @return The ranking computed from the text accounted so far.
		 */
		RankResult getRankResult();


		/**
		 * Puts the ranker back in its initial state.
		 */
		void reset();


		/**
		 * Folds the pending scores into the accumulated ones.
		 */
		void flush();


		/**
		 * Accounts for a single position of a text.
		 *
		 * @param seq
		 *            The text.
		 * @param pos
		 *            The position in the text.
		 */
		void account(CharSequence seq, int pos);


		/**
		 * Accounts for a whole text.
		 *
		 * @param seq
		 *            The text.
		 */
		void account(CharSequence seq);


		/**
		 * Accounts for the content of a reader.
		 *
		 * @param reader
		 *            The reader to consume.
		 * @throws IOException
		 *             If the reader cannot be read.
		 */
		void account(Reader reader) throws IOException;
	}


	/**
	 * Loads every language profile file found in the given folder, see {@link #init(File)}.
	 *
	 * The mode field is not assigned by this constructor, so it keeps its default value 0.
	 * NOTE(review): init(File) is protected and not final, so a subclass override would run before the subclass is fully constructed.
	 *
	 * @param folder
	 *            The folder containing the language profile files.
	 * @throws IOException
	 *             If no profile is found in the folder or if a profile cannot be read.
	 */
	public NGramProfilesPatched(final File folder) throws IOException {
		this.init(folder);
	}


	/**
	 * Loads every file of the folder whose name ends with the language profile extension. The profile name is the file name without that extension.
	 *
	 * Every n-gram of every profile is added to the global n-gram set, and the cached trie is dropped so that it gets rebuilt on the next call to
	 * getRanker. Each file is closed as soon as it has been loaded.
	 *
	 * @param folder
	 *            The folder containing the language profile files.
	 * @throws IOException
	 *             If the folder cannot be listed, contains no profile file, or if a profile cannot be read.
	 */
	protected void init(final File folder) throws IOException {
		final String suffix = "." + NGramProfile.NGRAM_PROFILE_EXTENSION;
		this.profiles = new ArrayList<NGramProfile>();
		final File[] files = folder.listFiles(new FilenameFilter() {


			@Override
			public boolean accept(final File dir, final String name) {
				return name.endsWith(suffix);
			}

		});
		if ((files == null) || (files.length == 0)) {
			throw new IOException("No profile found in directory " + folder.getAbsolutePath());
		}
		for (final File file : files) {
			final String name = file.getName();
			// Only strip the trailing extension; replacing every occurrence would also alter names containing it elsewhere.
			final String language = name.substring(0, name.length() - suffix.length());
			final NGramProfileImpl np = new NGramProfileImpl(language);
			final InputStream stream = new FileInputStream(file);
			try {
				np.load(stream);
			} finally {
				// The stream was opened here, so it is released here to avoid leaking one file handle per profile.
				stream.close();
			}
			this.profiles.add(np);
			for (final Iterator<?> iterator = np.getSorted(); iterator.hasNext();) {
				final NGram ng = (NGram) iterator.next();
				if (ng.length() > this.maxLen) {
					this.maxLen = ng.length();
				}
				this.allNGrams.add(ng);
			}
		}
		this.myTrie = null;
	}


	/**
	 * Describes this object with the number of loaded profiles and their names. The closing bracket is always written, including when no profile is
	 * loaded.
	 */
	@Override
	public String toString() {
		final int profileCount = this.getProfileCount();
		final StringBuilder sb = new StringBuilder();
		sb.append("NGramProfilesPatched has " + profileCount + " language profiles: [");
		for (int p = 0; p < profileCount; p++) {
			if (p > 0) {
				sb.append(", ");
			}
			sb.append(this.getProfileName(p));
		}
		sb.append("]");
		return sb.toString();
	}


}
