package org.ow2.weblab.service.language;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import de.spieleck.app.cngram.NGram;
import de.spieleck.app.cngram.NGramMetric;
import de.spieleck.app.cngram.NGramProfile;
import de.spieleck.app.cngram.NGramProfileImpl;
import de.spieleck.app.cngram.NGramProfiles;

/**
 * This is a patched version of the NGramProfiles class from the CNGram project.
 * 
 * It has been added to enable the use of custom language models. With the default version, it is only possible to load language models that are present in the jar, next to the NGramProfiles
 * class. We only copy-pasted the code and added the right behaviour through a constructor and an init method working on a folder (containing language models). We also removed some warnings by changing the access
 * rights of some fields and by qualifying accesses with this. We also handle generics when possible. And last but not least, we changed the code of the original loading method to use the same ClassLoader as
 * before (i.e. the original NGramProfiles one).
 * 
 * "Manage a set of profiles and determine "most similar" ones to a given profile. Allows access to the complete results of previous last ranking. Note this uses a competitive ranking approach, which is memory efficient, time efficient for not too many languages and provides contextual scoring of ngrams."
 * 
 * @author Inspired from frank nestel
 * @author EADS IPCC Team
 * @see NGramProfiles
 */
public class NGramProfilesPatched {
	/** Name returned when no profile applies (out-of-range index, or the "remainder" slot of a result). */
	public final static String NOLANGNAME = "--";

	/** Sentinel returned by {@link #charAt(CharSequence, int)} when the position is past the end of the sequence. */
	public final static char END_CHAR = (char) 0;

	/**
	 * Formatter for signed, two-decimal values. Not used inside this class.
	 * Note: {@link DecimalFormat} is not thread-safe, callers sharing it must synchronise.
	 */
	public final static DecimalFormat DF = new DecimalFormat("+0.00;-0.00");

	/** Amount subtracted from every raw n-gram count; n-grams whose count is not above it are ignored. */
	public final static double LOWSTATSAFETY = 0.5; // was 0.5;

	/** Loaded profiles; the index in this list is the profile index used in {@link #vals} and in score arrays. */
	List<NGramProfile> profiles = null;

	/** Union of the n-grams of every loaded profile. */
	private HashSet<NGram> allNGrams = new HashSet<NGram>(10000);

	/** Number of n-grams seen while loading (duplicates across profiles included). Only written, never read here. */
	private int firstNGrams;

	/** Length of the longest n-gram loaded, or -1 when nothing has been loaded. */
	private int maxLen = -1;

	/**
	 * Root of the search trie over the reversed n-grams, lazily built by {@link #getRanker()}.
	 * Volatile because it is the publication point of the lazily built tables: {@link #vals} is always
	 * written before this field, so a thread reading a non-null trie also sees the matching values.
	 */
	volatile Trie myTrie = null;

	/** Per n-gram (first index, the trie id) and per profile (second index) score contribution. */
	float[][] vals;

	/** Selects the weighting formula used by {@link #modeTrans(double, int)}; unknown values mean "identity". */
	private int mode;

	/**
	 * Loads the profiles listed in the <code>profiles.lst</code> resource, using mode 1.
	 * 
	 * @throws IOException
	 *             If the list or one of the profiles cannot be read.
	 */
	public NGramProfilesPatched() throws IOException {
		this(1);
	}

	/**
	 * Loads the profiles listed in the <code>profiles.lst</code> resource located next to {@link NGramProfile}.
	 * 
	 * @param mode
	 *            The weighting mode, see {@link #modeTrans(double, int)}.
	 * @throws IOException
	 *             If the list resource is missing, or if the list or one of the profiles cannot be read.
	 */
	public NGramProfilesPatched(int mode) throws IOException {
		InputStream ip = NGramProfile.class.getResourceAsStream("profiles.lst");
		if (ip == null) {
			throw new IOException("Unable to find resource 'profiles.lst' next to " + NGramProfile.class.getName() + ".");
		}
		BufferedReader br = new BufferedReader(new InputStreamReader(ip));
		this.mode = mode;
		try {
			init(br);
		} finally {
			// The stream was opened here, so it is released here.
			br.close();
		}
	}

	/**
	 * Loads the profiles whose names are listed (one per line) by the reader. The reader is not closed.
	 * Note that this constructor leaves the mode to its default value (0), which
	 * {@link #modeTrans(double, int)} treats as the identity.
	 * 
	 * @param br
	 *            The reader providing one profile name per line.
	 * @throws IOException
	 *             If the reader or one of the profiles cannot be read.
	 */
	public NGramProfilesPatched(BufferedReader br) throws IOException {
		init(br);
	}

	/**
	 * Reads profile names, one per line, and loads each of them from the resource
	 * <code>name.extension</code> resolved against {@link NGramProfile}. Empty lines and lines
	 * starting with '#' are skipped.
	 * 
	 * @param br
	 *            The reader providing the profile names. It is not closed by this method.
	 * @throws IOException
	 *             If a listed profile resource is missing or cannot be read.
	 */
	private void init(BufferedReader br) throws IOException {
		this.profiles = new ArrayList<NGramProfile>();
		this.firstNGrams = 0;
		String line;
		while ((line = br.readLine()) != null) {
			// The length test prevents charAt(0) from failing on blank lines.
			if (line.length() == 0 || line.charAt(0) == '#')
				continue;
			String resource = line + "." + NGramProfile.NGRAM_PROFILE_EXTENSION;
			InputStream is = NGramProfile.class.getResourceAsStream(resource);
			if (is == null) {
				throw new IOException("Unable to find language profile resource '" + resource + "'.");
			}
			NGramProfileImpl np = new NGramProfileImpl(line);
			try {
				np.load(is);
			} finally {
				is.close();
			}
			addProfile(np);
		}
		this.myTrie = null;
	}

	/**
	 * Registers a loaded profile: appends it to {@link #profiles}, adds its n-grams to the global set
	 * and updates the maximum n-gram length.
	 * 
	 * @param np
	 *            The already loaded profile.
	 */
	private void addProfile(NGramProfileImpl np) {
		this.profiles.add(np);
		Iterator<?> iter = np.getSorted();
		while (iter.hasNext()) {
			NGram ng = (NGram) iter.next();
			if (ng.length() > this.maxLen)
				this.maxLen = ng.length();
			this.firstNGrams++;
			this.allNGrams.add(ng);
		}
	}

	/**
	 * Creates a new, independent ranker over the loaded profiles. The shared trie and score table are
	 * built on first use (guarded by a lock on the profile list) and reused by every ranker.
	 * 
	 * @return A freshly reset ranker.
	 */
	public Ranker getRanker() {
		if (this.myTrie == null) {
			synchronized (this.profiles) {
				if (this.myTrie == null) {
					buildScoringTables();
				}
			}
		}
		return new Ranker() {
			// One slot per profile plus a last extra slot that never receives n-gram contributions.
			private double score[] = new double[NGramProfilesPatched.this.profiles.size() + 1];
			private double rscore[] = new double[NGramProfilesPatched.this.profiles.size() + 1];
			private boolean flushed = false;

			{
				reset();
			}

			/**
			 * Flushes pending scores and returns the accumulated scores of the profiles, each divided by
			 * the sum over all slots (extra slot included).
			 */
			public RankResult getRankResult() {
				flush();
				final int profileCount = NGramProfilesPatched.this.profiles.size();
				double pscore[] = new double[profileCount];
				double sum = 0.0;
				// "<=" on purpose: the extra slot takes part in the normalisation.
				for (int i = 0; i <= profileCount; i++) {
					sum += this.rscore[i];
				}
				for (int i = 0; i < profileCount; i++) {
					pscore[i] = this.rscore[i] / sum;
				}
				return new SimpleRankResult(pscore, true);
			}

			/** Clears every score; the extra slot of the accumulated scores restarts at 0.5. */
			public void reset() {
				Arrays.fill(this.score, 0.0);
				Arrays.fill(this.rscore, 0.0);
				this.rscore[this.score.length - 1] = 0.5; // 0.2 is too low;
			}

			/**
			 * Transfers the running scores into the accumulated ones: every slot above half of the
			 * maximum gets its excess, scaled so that the maximum adds 1.0. Running scores are then
			 * halved. Does nothing when nothing has been accounted since the last flush.
			 */
			public void flush() {
				if (!this.flushed) {
					this.flushed = true;
					double maxValue = -1.0;
					for (int i = 0; i < this.score.length; i++) {
						maxValue = Math.max(maxValue, this.score[i]);
					}
					double limit = maxValue / 2.0;
					double f = 1.0 / (maxValue - limit);
					for (int i = 0; i < this.score.length; i++) {
						double delta = this.score[i] - limit;
						if (delta > 0.0)
							this.rscore[i] += delta * f;
						// We do not reset to zero, this makes classification contextual
						this.score[i] /= 2.0;
					}
				}
			}

			/**
			 * Accounts every known n-gram ending at position pos of seq. The trie is walked backwards
			 * from pos; characters are lower-cased, separators are mapped to a blank and the position
			 * just before the start of the sequence is treated as a blank. When the character at pos is
			 * a separator and the best running score exceeds 1.0, the scores are flushed.
			 */
			public void account(CharSequence seq, int pos) {
				Trie currentNode = NGramProfilesPatched.this.myTrie;
				int p2 = pos;
				while (currentNode != null) {
					char ch;
					if (p2 == -1) {
						ch = ' ';
					} else {
						ch = Character.toLowerCase(seq.charAt(p2));
						if (isSeparator(ch))
							ch = ' ';
					}
					Trie t2 = currentNode.subtree(ch);
					if (t2 == null)
						break;
					if (t2.id >= 0) {
						// A complete n-gram ends here: add its contribution to every profile.
						this.flushed = false;
						for (int i = 0; i < NGramProfilesPatched.this.profiles.size(); i++) {
							this.score[i] += NGramProfilesPatched.this.vals[t2.id][i];
						}
					}
					if (p2-- == -1)
						break;
					currentNode = t2.center;
				}
				char startChar = seq.charAt(pos);
				boolean startSep = isSeparator(startChar);
				double max = 0.0;
				for (int i = 0; i < this.score.length; i++) {
					max = Math.max(max, this.score[i]);
				}
				if (startSep && max > 1.0) {
					flush();
				}
			}

			/** Accounts every position of the sequence, in order. */
			public void account(CharSequence seq) {
				for (int i = 0; i < seq.length(); i++)
					account(seq, i);
			}

			/** Accounts the content of the reader line by line. The reader is not closed. */
			public void account(Reader reader) throws IOException {
				BufferedReader br;
				if (reader instanceof BufferedReader)
					br = (BufferedReader) reader;
				else
					br = new BufferedReader(reader);
				String line;
				while ((line = br.readLine()) != null) {
					account(line);
				}
			}
		};
	}

	/**
	 * Builds the trie over the reversed n-grams and the matching score table, then publishes them.
	 * Must be called while holding the lock on {@link #profiles}.
	 */
	private void buildScoringTables() {
		final int profileCount = this.profiles.size();
		// Create a reverse reference of all strings
		// which makes it easy to create reverse Trie's
		String[] ngs = new String[this.allNGrams.size()];
		int j = 0;
		for (NGram ng : this.allNGrams) {
			ngs[j++] = reverse(ng);
		}
		Arrays.sort(ngs);
		// Create Strings in correct order but sorted from reverse end.
		String[] ng1 = new String[ngs.length];
		for (int i = 0; i < ngs.length; i++) {
			ng1[i] = reverse(ngs[i]);
		}
		Trie trie = createTrie(ngs, 0, 0, ngs.length);
		float[][] values = new float[ngs.length][profileCount];
		for (int k = 0; k < profileCount; k++) {
			NGramProfile ngp = this.profiles.get(k);
			double norm[] = new double[this.maxLen + 1];
			int count[] = new int[this.maxLen + 1];
			// Raw values: count minus the safety margin, summed per n-gram length.
			for (int i = 0; i < ngs.length; i++) {
				NGram ng = ngp.get(ng1[i]);
				if (ng != null && ng.getCount() > LOWSTATSAFETY) {
					int ngl = ng.length();
					double raw1 = ng.getCount() - LOWSTATSAFETY;
					count[ngl]++;
					norm[ngl] += raw1;
					values[i][k] = (float) raw1;
				}
			}
			for (int i = 1; i <= this.maxLen; i++) {
				// Lengths absent from this profile keep a zero sum (avoids 0 * Infinity = NaN).
				if (count[i] > 0)
					norm[i] *= (1.0 + count[i]) / count[i];
				norm[i] += 1.0;
			}
			// Normalise each value by the norm of its n-gram length.
			for (int i = 0; i < ngs.length; i++) {
				NGram ng = ngp.get(ng1[i]);
				if (ng != null && ng.getCount() > 0) {
					int ngl = ng.length();
					double trans = values[i][k] / norm[ngl];
					values[i][k] = (float) trans;
				}
			}
		}
		// Horizontal additive zero sum + nonlinear weighting
		for (int i = 0; i < ngs.length; i++) {
			double sum = 0.0;
			for (int k = 0; k < profileCount; k++) {
				sum += values[i][k];
			}
			double av = sum / profileCount;
			/*
			 * Assumed minimum amount of score for significance.
			 * XXX Heuristics for the following constant:
			 * Higher means faster and less noise
			 * Lower means better adaption to mixed language text
			 */
			double n = modeTrans(av, ng1[i].length()) / av / 100.0 * (-Math.log(av));
			for (int k = 0; k < profileCount; k++)
				values[i][k] = (float) ((values[i][k] - av) * n);
		}
		// Publish the values first: the volatile write of the trie makes them visible to readers.
		this.vals = values;
		this.myTrie = trie;
	}

	/**
	 * Applies the weighting selected by the mode to an average score.
	 * 
	 * @param x
	 *            The average score of an n-gram.
	 * @param l
	 *            The length of the n-gram.
	 * @return The transformed value; x itself for an unknown mode.
	 */
	private double modeTrans(double x, int l) {
		double f;
		switch (this.mode) {
			case 1:
			case 10:
				if (l == 1)
					return x;
				f = 1.0 / (l + 1);
				return Math.pow(x / f, f);
			case 8:
			case 9:
				f = 1.0 / (l + 1);
				return Math.pow(x, f) / Math.sqrt(f);
			case 7:
				f = 1.0 / (l + 1);
				return Math.pow(x, f) / f;
			case 6:
				f = 1.0 / l;
				return Math.pow(x, f) / Math.sqrt(f);
			case 5:
				f = 1.0 / l;
				return Math.pow(x, f) / f;
			case 3:
				f = 1.0 / l;
				return Math.pow(x, f);
			case 2:
				f = 1.0 / l;
				return Math.pow(x / f, f);
			case 4:
				f = 1.0 / l;
				return Math.pow(x * f, f);
			default:
				return x;
		}
	}

	/**
	 * @param i
	 *            The index of a profile.
	 * @return The name of the profile, or {@link #NOLANGNAME} when the index is out of range.
	 */
	public String getProfileName(int i) {
		if (i < 0 || i >= this.profiles.size())
			return NOLANGNAME;
		return this.profiles.get(i).getName();
	}

	/**
	 * @param ch
	 *            The character to test.
	 * @return true for control characters and blank, whitespace, digits and the characters ".!?:,;".
	 */
	static boolean isSeparator(char ch) {
		return (ch <= ' ' || Character.isWhitespace(ch) || Character.isDigit(ch) || ".!?:,;".indexOf(ch) >= 0);
	}

	/**
	 * Reverses a sequence char by char. StringBuilder.reverse() is deliberately not used: it keeps
	 * surrogate pairs in order, whereas the trie is walked one char at a time.
	 * 
	 * @param seq
	 *            The sequence to reverse.
	 * @return The chars of seq, from last to first.
	 */
	private static String reverse(CharSequence seq) {
		StringBuilder sb = new StringBuilder(seq.length());
		for (int i = seq.length() - 1; i >= 0; i--)
			sb.append(seq.charAt(i));
		return sb.toString();
	}

	/**
	 * Recursively builds a ternary search trie over array[start..end[, looking at the character at
	 * position pos. The array has to be sorted; strings ending exactly at pos + 1 store their array
	 * index in the id of the node.
	 * NOTE(review): this appears to rely on the array holding no duplicate and no empty string - confirm.
	 * 
	 * @param array
	 *            The sorted strings.
	 * @param pos
	 *            The position of the character used to split at this level.
	 * @param start
	 *            The first index of the range (inclusive).
	 * @param end
	 *            The last index of the range (exclusive).
	 * @return The root of the sub-trie, or null for an empty range.
	 */
	private static Trie createTrie(String[] array, int pos, int start, int end) {
		if (start >= end)
			return null;
		int mid = (start + end) / 2;
		Trie nt = new Trie();
		nt.split = array[mid].charAt(pos);
		// Widen [goLeft, goRight[ around mid to every string sharing the split character.
		int goRight = mid;
		while (goRight < end && charAt(array[goRight], pos) == nt.split)
			goRight++;
		int goLeft = mid;
		while (goLeft > start && charAt(array[goLeft - 1], pos) == nt.split)
			goLeft--;
		// Try to move "end" nodes directly into the id field!
		int goLeft2 = goLeft;
		if (array[goLeft].length() == pos + 1) {
			nt.id = goLeft;
			goLeft2++;
		}
		nt.center = createTrie(array, pos + 1, goLeft2, goRight);
		nt.left = createTrie(array, pos, start, goLeft);
		nt.right = createTrie(array, pos, goRight, end);
		return nt;
	}

	/**
	 * @param cs
	 *            The sequence to read.
	 * @param pos
	 *            The position to read.
	 * @return The character at pos, or {@link #END_CHAR} when pos is at or past the end of cs.
	 */
	public final static char charAt(CharSequence cs, int pos) {
		if (pos < cs.length())
			return cs.charAt(pos);
		return END_CHAR;
	}

	/** Node of the ternary search trie built by createTrie. */
	private final static class Trie {
		/** Character this node stands for. */
		char split;
		/** Siblings with a smaller split character. */
		Trie left;
		/** Siblings with a greater split character. */
		Trie right;
		/** Nodes for the next character position. */
		Trie center;
		/** Index of the string ending at this node, or -1 when none ends here. */
		int id = -1;

		public Trie() {
			super();
		}

		/**
		 * Searches, among this node and its left/right siblings, the node whose split character is c.
		 * 
		 * @param c
		 *            The character to look for.
		 * @return The matching node or null.
		 */
		public Trie subtree(char c) {
			Trie current = this;
			do {
				if (c == current.split)
					return current;
				else if (c > current.split)
					current = current.right;
				else
					current = current.left;
			} while (current != null);
			return null;
		}
	}

	/**
	 * Compares a profile to every loaded profile using the given metric.
	 * Note this class returns a complete match result, for the
	 * sake of thread safety!
	 * 
	 * @param metric
	 *            The metric used to compute the difference between two profiles.
	 * @param profile
	 *            The profile to compare.
	 * @return The result, sorted by increasing difference.
	 */
	public RankResult rank(NGramMetric metric, NGramProfile profile) {
		double[] scores = new double[this.profiles.size()];
		for (int i = 0; i < this.profiles.size(); i++)
			scores[i] = metric.diff(profile, (this.profiles.get(i)));
		return new SimpleRankResult(scores, false);
	}

	/** Result holding the scores and their profiles, sorted together at construction time. */
	private class SimpleRankResult implements RankResult {
		private double scores[];
		private NGramProfile[] profs;
		/** 1.0 minus the sum of every score. */
		private double remain;

		/**
		 * @param scorex
		 *            The score of each profile, in profile order. The array is copied.
		 * @param inverse
		 *            true to sort by decreasing score, false to sort by increasing score.
		 */
		public SimpleRankResult(double[] scorex, boolean inverse) {
			this.scores = new double[scorex.length];
			System.arraycopy(scorex, 0, this.scores, 0, scorex.length);
			this.profs = new NGramProfile[this.scores.length];
			this.remain = 1.0;
			// Insertion sort of the scores, moving the profiles along.
			for (int i = 0; i < this.scores.length; i++) {
				NGramProfile prof = NGramProfilesPatched.this.profiles.get(i);
				double m = this.scores[i];
				this.remain -= m;
				int j = i;
				while (--j >= 0 && (inverse ^ (m < this.scores[j]))) {
					this.scores[j + 1] = this.scores[j];
					this.profs[j + 1] = this.profs[j];
				}
				this.scores[j + 1] = m;
				this.profs[j + 1] = prof;
			}
		}

		public NGramProfilesPatched getProfiles() {
			return NGramProfilesPatched.this;
		}

		/**
		 * @param pos
		 *            The rank; negative values count from the end, getLength() selects the remainder.
		 * @return The score at that rank.
		 */
		public double getScore(final int pos) {
			if (pos == getLength())
				return this.remain;
			int p = pos;
			if (p < 0)
				p += getLength();
			return this.scores[p];
		}

		/**
		 * @param pos
		 *            The rank; negative values count from the end, getLength() selects the remainder.
		 * @return The name of the profile at that rank, {@link NGramProfilesPatched#NOLANGNAME} for the remainder.
		 */
		public String getName(int pos) {
			if (pos == getLength())
				return NOLANGNAME;
			int p = pos;
			if (p < 0)
				p += getLength();
			return this.profs[p].getName();
		}

		public int getLength() {
			return this.profs.length;
		}
	}

	/**
	 * @return The number of loaded profiles.
	 */
	public int getProfileCount() {
		return this.profiles.size();
	}

	/**
	 * @return The internal (mutable, not copied) set of every n-gram of every loaded profile.
	 */
	public Set<NGram> getAllNGrams() {
		// XXX make this read only or is this slowing down too much?
		return this.allNGrams;
	}

	/** Sorted scores of the profiles. */
	public interface RankResult {
		public NGramProfilesPatched getProfiles();

		public int getLength();

		public double getScore(int pos);

		public String getName(int pos);
	}

	/** Accumulates text and scores it against the profiles. */
	public interface Ranker {
		public RankResult getRankResult();

		public void reset();

		public void flush();

		public void account(CharSequence seq, int pos);

		public void account(CharSequence seq);

		public void account(Reader reader) throws IOException;
	}

	/**
	 * Loads every language model found in a folder. Note that this constructor leaves the mode to its
	 * default value (0), which {@link #modeTrans(double, int)} treats as the identity.
	 * 
	 * @param folder
	 *            The folder containing the language model files.
	 * @throws IOException
	 *             If the folder cannot be listed or if a model cannot be read.
	 */
	public NGramProfilesPatched(File folder) throws IOException {
		this.init(folder);
	}

	/**
	 * Loads every regular file of the folder whose name ends with the profile extension; the profile
	 * is named after the file name without that extension. Other entries are ignored.
	 * 
	 * @param folder
	 *            The folder containing the language model files.
	 * @throws IOException
	 *             If the folder is not a directory or cannot be listed, or if a model cannot be read.
	 */
	protected void init(File folder) throws IOException {
		this.profiles = new ArrayList<NGramProfile>();
		this.firstNGrams = 0;
		// listFiles() returns null (instead of throwing) when the path is not a listable directory.
		final File[] files = folder.listFiles();
		if (files == null) {
			throw new IOException("Unable to list language models in '" + folder + "': not a readable directory.");
		}
		final String suffix = "." + NGramProfile.NGRAM_PROFILE_EXTENSION;
		for (File file : files) {
			final String name = file.getName();
			if (!file.isFile() || !name.endsWith(suffix)) {
				continue;
			}
			// Only the trailing extension is removed from the name.
			NGramProfileImpl np = new NGramProfileImpl(name.substring(0, name.length() - suffix.length()));
			InputStream in = new FileInputStream(file);
			try {
				np.load(in);
			} finally {
				in.close();
			}
			addProfile(np);
		}
		this.myTrie = null;
	}
}
