/*
 * Copyright 2016 Global Crop Diversity Trust
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.genesys.taxonomy.checker;

import java.util.Arrays;

/**
 * Most-frequent-K-characters string hashing and similarity.
 * 
 * Based on pseudocode at https://en.wikipedia.org/wiki/Most_frequent_k_characters and
 * http://rosettacode.org/wiki/Most_frequent_k_chars_distance
 * 
 * A hash is a sequence of (character, frequency) pairs. It exists in two interchangeable forms: an <code>int[]</code>
 * where even indexes hold the character and odd indexes hold its frequency, and a string such as "r2e2" where each
 * character is followed by its decimal frequency.
 * 
 * Digits [0-9] in the input are ignored: the string form uses digits to encode frequencies, so a hashed digit could
 * not be told apart from a frequency when decoding.
 */
public class MostFrequentKChars {

	/**
	 * Get the hash for an input string with at most K most frequent characters.
	 * 
	 * <pre>
	 * 	String function MostFreqKHashing (String inputString, int K)
	 * 		def string outputString
	 * 		for each distinct character
	 * 		    count occurrence of each character
	 * 		for i := 0 to K
	 * 		    char c = next most freq ith character  (if two chars have same frequency then get the first occurrence in inputString)
	 * 		    int count = number of occurrence of the character
	 * 		    append to outputString, c and count
	 * 		end for
	 * 		return outputString
	 * </pre>
	 * 
	 * @param string the string to hash; digits are ignored
	 * @param k maximum number of characters to include in the hash
	 * @return the hash in string form, e.g. "r2e2" for ("research", 2)
	 */
	public static String getMostFrequentKHash(String string, int k) {
		return toHashString(calculateHash(string, k));
	}

	/**
	 * Generate the hash as int[]. The array holds (character, frequency) pairs: every even index is a character (cast to
	 * int) and the following odd index is its frequency.
	 * 
	 * The array always has length <code>2 * k</code>. When the input has fewer than k distinct non-digit characters the
	 * unused trailing slots remain 0.
	 *
	 * @param string input string; digits [0-9] are ignored
	 * @param k limit result to k most frequent characters
	 * @return the hash array of length 2 * k
	 */
	static int[] calculateHash(String string, int k) {
		char[] input = string.toCharArray();
		// occurrences[i] is the count of the character whose FIRST occurrence is at index i;
		// all other slots stay 0. This keeps ties ordered by first occurrence in the input.
		int[] occurrences = new int[input.length];
		int[] hash = new int[2 * k];

		// track maximum occurrence value
		int maxOcc = 0;

		// count occurrence of each character
		for (char c : input) {
			// digits are reserved for frequencies in the string form of the hash
			if (c >= '0' && c <= '9') {
				continue;
			}
			int charOccurrence = ++occurrences[string.indexOf(c)];
			if (charOccurrence > maxOcc) {
				maxOcc = charOccurrence;
			}
		}

		// Emit characters in descending frequency: each pass emits every character with the
		// current frequency and remembers the next lower frequency for the following pass.
		int limit = 0, pos = 0;
		while (maxOcc > 0 && limit < k) {
			int nextOcc = 0;
			for (int i = 0; i < occurrences.length; i++) {
				if (occurrences[i] == maxOcc) {
					hash[pos++] = input[i];
					hash[pos++] = occurrences[i];
					if (++limit >= k) {
						return hash;
					}
				} else if (occurrences[i] < maxOcc && occurrences[i] > nextOcc) {
					nextOcc = occurrences[i];
				}
			}
			maxOcc = nextOcc;
		}

		return hash;
	}

	/**
	 * Calculate the similarity of the two hashes given in string form.
	 *
	 * @param hash1 first hash, e.g. "r2e2"
	 * @param hash2 second hash, e.g. "e2s1"
	 * @return the most freq k similarity
	 * @throws NumberFormatException if either hash is malformed
	 */
	public static int getMostFreqKSimilarity(String hash1, String hash2) {
		return getMostFreqKSimilarity(decodeHash(hash1), decodeHash(hash2));
	}

	/**
	 * Calculate the similarity of the two hashes.
	 * 
	 * Note that this implementation adds the <em>smaller</em> of the two frequencies for every shared character:
	 * 
	 * <pre>
	 * 		int function MostFreqKSimilarity (int[] hash1, int[] hash2)
	 * 		    def int similarity
	 * 		    for each (c, freq1) pair in hash1
	 * 		        lookup c in hash2
	 * 		        if c is not found
	 * 		             continue
	 * 		        similarity += min(freq1, frequency of c in hash2)
	 * 		    return similarity
	 * </pre>
	 * 
	 * @param hash1 first hash as (character, frequency) pairs
	 * @param hash2 second hash as (character, frequency) pairs
	 * @return the most freq k similarity
	 */
	public static int getMostFreqKSimilarity(int[] hash1, int[] hash2) {
		int similarity = 0;

		for (int i = 0; i < hash1.length; i += 2) {
			char c = (char) hash1[i];
			int freq1 = hash1[i + 1];
			int freq2 = findFrequency(hash2, c);

			// -1 means the character is not present in hash2
			if (freq2 >= 0) {
				similarity += Math.min(freq1, freq2);
			}
		}

		return similarity;
	}

	/**
	 * Wrapper function.
	 * 
	 * <pre>
	 * 		int function MostFreqKSDF (string inputStr1, string inputStr2, int K, int maxDistance)
	 * 		    return maxDistance - MostFreqKSimilarity(MostFreqKHashing(inputStr1,K), MostFreqKHashing(inputStr2,K))
	 * </pre>
	 * 
	 * @param inputStr1 the input str1
	 * @param inputStr2 the input str2
	 * @param K number of most frequent characters to hash
	 * @param maxDistance the value from which the similarity is subtracted
	 * @return maxDistance minus the similarity of the two hashes
	 */
	public static int mostFreqKSDF(String inputStr1, String inputStr2, int K, int maxDistance) {
		return maxDistance - getMostFreqKSimilarity(calculateHash(inputStr1, K), calculateHash(inputStr2, K));
	}

	/**
	 * Normalized similarity of two strings: the similarity of their K-hashes divided by the larger of the two hashes'
	 * frequency sums.
	 * 
	 * When neither hash contains any character (both inputs are empty or contain only digits, or K is 0) the result is
	 * 0.0 rather than the NaN a 0/0 division would produce. This matches the result when only one input is empty.
	 *
	 * @param inputStr1 the input str1
	 * @param inputStr2 the input str2
	 * @param K number of most frequent characters to hash
	 * @return similarity of the two hashes divided by the larger frequency sum, or 0.0 when both hashes are empty
	 */
	public static double mostFreqKSDF(String inputStr1, String inputStr2, int K) {
		int[] hash1 = calculateHash(inputStr1, K);
		int[] hash2 = calculateHash(inputStr2, K);
		double largestSum = Math.max(getFrequencySum(hash1), getFrequencySum(hash2));
		if (largestSum <= 0) {
			// nothing was hashed on either side; avoid 0.0 / 0.0 == NaN
			return 0.0;
		}
		return getMostFreqKSimilarity(hash1, hash2) / largestSum;
	}

	/**
	 * Get the sum of frequencies of all chars represented in the hash.
	 *
	 * @param hash the hash as (character, frequency) pairs
	 * @return sum of character frequencies (the values at odd indexes)
	 */
	static double getFrequencySum(int[] hash) {
		double sum = 0;
		for (int i = 1; i < hash.length; i += 2) {
			sum += hash[i];
		}
		return sum;
	}

	/**
	 * Find frequency of char c in hash.
	 * 
	 * @param hash the hash as (character, frequency) pairs
	 * @param c character to find
	 * @return frequency, or -1 if char not found
	 */
	private static int findFrequency(int[] hash, char c) {
		for (int i = 0; i < hash.length; i += 2) {
			if (c == (char) hash[i]) {
				return hash[i + 1];
			}
		}
		return -1;
	}

	/**
	 * Convert the hash formatted string "a10b8c7" to int[].
	 *
	 * @param hash1 the hash in string form; every character must be followed by at least one digit
	 * @return the hash as (character, frequency) pairs
	 * @throws NumberFormatException if a character is not followed by a frequency, or the frequency does not fit an int
	 */
	static int[] decodeHash(String hash1) {
		int length = hash1.length();
		// each pair consumes at least two input characters, so length is a safe upper bound
		int[] h = new int[length];
		int pos = 0;
		for (int i = 0; i < length; i++) {
			char symbol = hash1.charAt(i);
			h[pos++] = symbol;

			// consume the run of digits following the character
			int endIndex = i + 1;
			char c;
			while (endIndex < length && ((c = hash1.charAt(endIndex)) >= '0' && c <= '9')) {
				endIndex++;
			}
			if (endIndex == i + 1) {
				throw new NumberFormatException("Missing frequency after '" + symbol + "' at index " + i + " in hash \"" + hash1 + "\"");
			}
			h[pos++] = Integer.parseInt(hash1.substring(i + 1, endIndex));
			// continue with the character right after the digits (loop increment adds 1)
			i = endIndex - 1;
		}
		return Arrays.copyOf(h, pos);
	}

	/**
	 * Encode a hash array to String. Encoding stops at the first pair whose character is 0, which is how unused
	 * trailing slots of a hash array appear.
	 *
	 * @param h1 hash array as generated
	 * @return String representation of the hash array (e.g. "i3b2")
	 */
	public static String toHashString(int[] h1) {
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < h1.length; i += 2) {
			char c = (char) h1[i];

			// a zero character marks the unused tail of the array
			if (c == 0) {
				break;
			}

			sb.append(c).append(h1[i + 1]);
		}
		return sb.toString();
	}
}
