/*
 * FastIC -- contains methods for computing IC and Chi statistics for a given text.  
 *   It is part of the hcrypto.analyzer package and uses other classes from that
 *   package.
 * The speed of the algorithm for computing IC depends on the fact that the text's
 *  frequency data are stored in a 2-dimensional array, column[][], which
 *  can be updated piecemeal, rather than recomputing the IC from scratch each
 *  time the text is rearranged.
 */
package hcrypto.analyzer;

/**
 * FastIC computes Index of Coincidence (IC) and chi statistics for a text.
 *
 * <p>Letter counts are kept for the text as a whole ({@code letter[]}) and, for
 * polyalphabetic analysis, per column ({@code column[][]}), so that the
 * statistics can be updated piecemeal via the add/subtract/swap methods instead
 * of being recomputed from scratch.
 *
 * <p>Every character handed to this class must be a lowercase letter
 * {@code 'a'..'z'}; any other character indexes outside the 26-entry count
 * arrays and raises {@link ArrayIndexOutOfBoundsException}.
 */
public class FastIC {

    /** Number of letters in the alphabet handled by this class ('a'..'z'). */
    private static final int ALPHABET_SIZE = 26;

    /** How many of the most frequent letters are remembered for each column. */
    private static final int TOP_COUNT = 5;

    /** Expected English letter frequencies, per thousand letters. */
    private static final int[] englishFrequency = {
        73, 9, 30, 44, 130,     // A..E   E=130 occurrences out of 1000, roughly 13%
        28, 16, 35, 74,         // F..I
        2, 3, 35, 25,           // J..M
        78, 74, 27, 3,          // N..Q
        77, 63, 93, 27,         // R..U
        13, 16, 5, 19, 1};      // V..Z

    /** Sum of the entries of {@link #englishFrequency}. */
    private static final int ENGLISH_TOTAL = 1000;

    /** Header row of letter indexes 0..25 used by the print methods. */
    private static final String INDEX_HEADER =
        "   \t0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25";

    /** Header row of letters A..Z (with trailing newline) used by the print methods. */
    private static final String LETTER_HEADER =
        "   \tA\tB\tC\tD\tE\tF\tG\tH\tI\tJ\tK\tL\tM\tN\tO\tP\tQ\tR\tS\tT\tU\tV\tW\tX\tY\tZ\n";

    private int N = 0;                              // Total number of letters counted
    private int[] letter = new int[ALPHABET_SIZE];  // Frequency of letters a..z
    private double ic = 0;                          // Most recently computed overall IC
    private double[] colIC;                         // IC of each column
    private int[] colMax;                           // Index of the most frequent letter per column
    private int[][] top5ColMax;                     // Indexes of the 5 most frequent letters per column
    private int[] bestShifts = null;                // Best shift per column
    private int[][] column = null;                  // Frequency of letters in each column
    double[] chiSqrs = new double[ALPHABET_SIZE];   // One chi value for each shift
    private int period = 0;

    /** Creates an empty FastIC with all letter counts at zero. */
    public FastIC() {
    }

    /**
     * This constructor initializes FastIC and computes the IC for its monoalphabetic text.
     * @param text -- a String giving the text to be analyzed.
     */
    public FastIC(String text) {
        ic = calcIC(text);
    }

    /**
     * This constructor initializes FastIC and computes the IC for its polyalphabetic text.
     * @param text -- a String giving the text to be analyzed.
     * @param period -- the number of columns that are periodically encrypted in the text.
     */
    public FastIC(String text, int period) {
        ic = calcICwColumns(text, period);
    }

    /**
     * Counts the letters of a text.
     * @param text -- lowercase text ('a'..'z' only)
     * @return a fresh 26-entry array of letter counts
     */
    private static int[] countLetters(String text) {
        int[] counts = new int[ALPHABET_SIZE];
        for (int k = 0; k < text.length(); k++) {
            ++counts[text.charAt(k) - 'a'];
        }
        return counts;
    }

    /**
     * Computes sum(f * (f - 1)) / (total * (total - 1)) for a count array.
     * The IC is undefined for fewer than two letters; 0.0 is returned in that
     * case rather than NaN.
     */
    private static double indexOfCoincidence(int[] counts, int total) {
        if (total < 2) {
            return 0.0;
        }
        double sum = 0;
        for (int count : counts) {
            sum += count * (count - 1.0);
        }
        return sum / (total * (total - 1.0));
    }

    /**
     * Computes sum(a[k] * b[(k + shift) % 26]) / (sum(a) * sum(b)).
     * All arithmetic is done in double so large counts cannot overflow an int.
     * Returns 0.0 (rather than NaN) when either distribution is empty.
     */
    private static double chiCorrelation(int[] a, int[] b, int shift) {
        double sumProd = 0;
        double sumA = 0;
        double sumB = 0;
        for (int k = 0; k < ALPHABET_SIZE; k++) {
            sumProd += (double) a[k] * b[(k + shift) % ALPHABET_SIZE];
            sumA += a[k];
            sumB += b[k];
        }
        double denominator = sumA * sumB;
        return denominator == 0 ? 0.0 : sumProd / denominator;
    }

    /**
     * Fills chiSqrs[] with the chi value of every shift of {@code shifted}
     * against {@code reference} and returns the first shift whose value is the
     * largest one above zero (0 when no value is positive).
     */
    private int fillChiSqrs(int[] reference, int[] shifted) {
        double maxChi = 0;
        int best = 0;
        for (int j = 0; j < ALPHABET_SIZE; j++) {
            chiSqrs[j] = chiCorrelation(reference, shifted, j);
            if (chiSqrs[j] > maxChi) {
                maxChi = chiSqrs[j];
                best = j;
            }
        }
        return best;
    }

    /**
     * Builds one FrequencyRecord per letter from a count array and sorts the
     * records by FrequencyRecord's natural ordering.
     */
    private static FrequencyRecord[] sortedRecords(int[] counts) {
        FrequencyRecord[] records = new FrequencyRecord[counts.length];
        for (int k = 0; k < records.length; k++) {
            records[k] = new FrequencyRecord((char) ('a' + k), counts[k]);
        }
        java.util.Arrays.sort(records);
        return records;
    }

    /**
     * Prepares the per-column arrays for a period of d columns: allocates them
     * when they are missing or sized for a different period, otherwise just
     * clears the column counts.
     */
    private void resetColumns(int d) {
        if (column == null || column.length != d) {
            bestShifts = new int[d];
            colIC = new double[d];
            colMax = new int[d];
            top5ColMax = new int[d][TOP_COUNT];
            column = new int[d][ALPHABET_SIZE];
        } else {
            for (int[] counts : column) {
                java.util.Arrays.fill(counts, 0);
            }
        }
        period = d;
    }

    /**
     * swapFreqs() swaps the frequency data for two letters in a given column.
     * @param col -- the column containing the letters
     * @param letter1, letter2: indexes (0..25) of two letters in the text
     */
    private void swapFreqs(int col, int letter1, int letter2) {
        int temp = column[col][letter1];
        column[col][letter1] = column[col][letter2];
        column[col][letter2] = temp;
    }

    /**
     * swapFreqsCol() swaps the frequency data for two letters in the letter
     *  array and in every column. The cached per-column results (column IC,
     *  column maximum, top 5) are not recomputed here.
     * @param letter1, letter2: indexes (0..25) of two letters in the text
     */
    public void swapFreqsCol(int letter1, int letter2) {
        int temp = letter[letter1];            // And in main array
        letter[letter1] = letter[letter2];
        letter[letter2] = temp;
        for (int k = 0; k < period; k++) {
            swapFreqs(k, letter1, letter2);
        }
    }

    /**
     * calcICwColumns() assumes the text is polyalphabetic with
     *  a period of d and calculates the IC for the text as a whole.
     *  As side effects it replaces all letter and column counts with those of
     *  the given text and recomputes, for every column, its IC, its most
     *  frequent letter and its 5 most frequent letters. The returned value is
     *  also stored as the current IC.
     * @param text -- the text being analyzed (lowercase 'a'..'z' only)
     * @param d -- the period of the text.
     * @return the IC of the whole text (0.0 if it has fewer than two letters)
     */
    public double calcICwColumns(String text, int d) {
        resetColumns(d);
        java.util.Arrays.fill(letter, 0);
        N = 0;                                     // Start from scratch on every call
        for (int k = 0; k < text.length(); k++) {  // Count letters in text
            int idx = text.charAt(k) - 'a';
            ++letter[idx];                         // Update overall distribution
            ++column[k % d][idx];                  // Update column distribution
            ++N;
        }
        for (int k = 0; k < column.length; k++) {
            int[] counts = column[k];
            int total = 0;
            int maxLoc = 0;
            for (int j = 0; j < counts.length; j++) {
                total += counts[j];
                if (counts[j] > counts[maxLoc]) {  // First letter with the highest count wins
                    maxLoc = j;
                }
            }
            colIC[k] = indexOfCoincidence(counts, total);
            colMax[k] = maxLoc;
            calcTop5Max(k, counts);   // Update count for 5 most frequent
        }
        ic = indexOfCoincidence(letter, N);
        return ic;
    }

    /**
     * getAvgFirstNIC() computes the average of the ICs
     *  for columns 0 through col (inclusive). An index past the last column
     *  is treated as the last column.
     * @param col -- an int giving Nth column
     * @return the average column IC, or 0.0 if there is no column to average
     */
    public double getAvgFirstNIC(int col) {
        int last = Math.min(col, colIC.length - 1);
        if (last < 0) {
            return 0.0;
        }
        double sum = 0;
        for (int k = 0; k <= last; k++) {
            sum += colIC[k];
        }
        return sum / (last + 1);
    }

    /**
     * calcTop5Max() calculates the 5 most frequent letters in a given column.
     *  It takes the last 5 entries of the sorted records, so it presumably
     *  relies on FrequencyRecord sorting in ascending order of frequency --
     *  confirm against FrequencyRecord.compareTo().
     * @param col -- the index of the column
     * @param counts -- frequency distribution for the given column
     */
    private void calcTop5Max(int col, int[] counts) {
        FrequencyRecord[] sorted = sortedRecords(counts);
        for (int k = 0; k < top5ColMax[col].length; k++) {
            top5ColMax[col][k] = sorted[sorted.length - 1 - k].ch - 'a';
        }
    }

    /** getSortedColumn() -- returns a sorted distribution of the frequencies
     *   in a given column.
     * @param col -- the index of the column in the column[][] array.
     * @return a new array of 26 records sorted by FrequencyRecord's natural ordering
     */
    public FrequencyRecord[] getSortedColumn(int col) {
        return sortedRecords(column[col]);
    }

    /**
     * getColumnTop5() returns an integer representation for the 5 most
     *   frequent letters in a column. The returned array is the internal one,
     *   not a copy.
     * @param col -- the index of the column
     */
    public int[] getColumnTop5(int col) {
        return top5ColMax[col];
    }

    /**
     * getTop5() returns the entire top5ColMax array, which gives
     *   the indexes (0..25) of the top 5 most frequent
     *   letters in each column. The returned array is the internal one,
     *   not a copy.
     */
    public int[][] getTop5() {
        return top5ColMax;
    }

    /**
     * getColumnIC() returns the IC for a given column.
     * @param col -- the index of the column
     */
    public double getColumnIC(int col) {
        return colIC[col];
    }

    /**
     * getColumnMax() returns the most frequent letter in a column.
     * @param col -- the column's index.
     * @return the index (0..25) of the letter
     */
    public int getColumnMax(int col) {
        return colMax[col];
    }

    /**
     * printColumnDistributions() prints the letter distributions for each column of the
     *  polyalphabetic text.
     */
    public void printColumnDistributions() {
        System.out.println(INDEX_HEADER);
        System.out.println(LETTER_HEADER);
        for (int k = 0; k < period; k++) {
            StringBuilder row = new StringBuilder("k=" + k + ":\t");
            for (int j = 0; j < ALPHABET_SIZE; j++) {
                row.append(column[k][j]).append('\t');
            }
            System.out.println(row);
        }
    }

    /**
     * printShiftedColumnDistributions() prints the letter distributions for each column of the
     *  polyalphabetic text assuming the parameter gives the columns keyword.
     *  Each column produces two rows: the shifted counts and the letters they belong to.
     * @param keyword -- a String giving the shift for each column, where shift
     *  is applied as  (ch + shift) % 26 for each character in the column.
     *  It must hold at least one lowercase letter per column.
     */
    public void printShiftedColumnDistributions(String keyword) {
        System.out.println(INDEX_HEADER + "\n");
        System.out.println(LETTER_HEADER);
        for (int k = 0; k < period; k++) {
            int shift = keyword.charAt(k) - 'a';
            String label = "k=" + k + ":\t";
            StringBuilder countRow = new StringBuilder(label);
            StringBuilder letterRow = new StringBuilder(label);
            for (int j = 0; j < ALPHABET_SIZE; j++) {
                int shifted = (j + shift) % ALPHABET_SIZE;
                countRow.append(column[k][shifted]).append('\t');
                letterRow.append((char) ('a' + shifted)).append('\t');
            }
            System.out.println(countRow);
            System.out.println(letterRow);
        }
    }

    /**
     * getChiColPerEnglish() computes the Chi square value for a given column
     *  as compared to English language frequencies.
     * @param col -- the column's index.
     * @return the chi value, or 0.0 if the column is empty
     */
    public double getChiColPerEnglish(int col) {
        return chiCorrelation(englishFrequency, column[col], 0);
    }

    /**
     * getBestChiTestPerEnglish() -- computes as a side effect the shift that
     *  yields the best Chi square value compared to English frequencies for
     *  a given column.  The best shift is stored in the bestShifts[] array and
     *  the chi value of every shift is stored in the chiSqrs[] array.
     * @param col -- the column's index.
     * @return -- returns the best Chi Square value
     */
    public double getBestChiTestPerEnglish(int col) {
        int best = fillChiSqrs(englishFrequency, column[col]);
        bestShifts[col] = best;
        return Math.max(chiSqrs[best], 0.0);
    }

    /**
     * getBestShiftForColumn() returns the best shift as predicted by the Chi square
     *  test for a given column. The value is only meaningful after
     *  getBestChiTestPerEnglish() has been called for that column.
     * @param col -- the column's index
     */
    public int getBestShiftForColumn(int col) {
        return bestShifts[col];
    }

    /**
     * Calculates the chi statistic on the distribution of letters in
     *  the text given by its parameter, compared against the expected
     *  distribution of English letters.
     *  See Bauer, p 291ff.
     * @param text -- the text being analyzed (lowercase 'a'..'z' only).
     * @return the chi value, or 0.0 for an empty text
     */
    public double chi(String text) {
        return chiCorrelation(englishFrequency, countLetters(text), 0);
    }

    /**
     * Calculates the sum of the square of differences between the
     *  frequency distribution of text compared to English.
     *  An empty text is treated as having a relative frequency of 0 for
     *  every letter.
     * @param text -- the text being analyzed (lowercase 'a'..'z' only)
     */
    public double sqrdiff(String text) {
        int[] counts = countLetters(text);
        int length = text.length();
        double total = 0;
        for (int k = 0; k < ALPHABET_SIZE; k++) {
            double expected = (double) englishFrequency[k] / ENGLISH_TOTAL;
            double observed = (length == 0) ? 0.0 : (double) counts[k] / length;
            double diff = expected - observed;
            total += diff * diff;
        }
        return total;
    }

    /**
     * getBestShiftByChiTest() returns the shift value that produces the best
     *  chi square statistic when comparing two columns of text.  So it determines
     *  the best relative shift between two columns, based on Chi square.
     * The chi square values for each shift are stored in chiSqrs[] array.
     * @param col1, col2 -- the indexes of the columns.
     */
    public int getBestShiftByChiTest(int col1, int col2) {
        return fillChiSqrs(column[col1], column[col2]);
    }

    /**
     * getBestChiSqrValue() returns the chi value stored for a given shift by
     *  the most recent getBestChiTestPerEnglish() or getBestShiftByChiTest() call.
     * @param n -- the shift (0..25)
     */
    public double getBestChiSqrValue(int n) {
        return chiSqrs[n];
    }

    /**
     * calcIC() calculates the Index of Coincidence for a given text.
     *  The letter counts and the letter total are replaced by those of the
     *  given text, and the result is also stored as the current IC.
     * @param text -- the given text (lowercase 'a'..'z' only).
     * @return the IC (0.0 if the text has fewer than two letters)
     */
    public double calcIC(String text) {
        int[] counts = countLetters(text);
        System.arraycopy(counts, 0, letter, 0, ALPHABET_SIZE);
        N = text.length();   // Start from scratch on every call
        ic = indexOfCoincidence(letter, N);
        return ic;
    }

    /**
     * quickCalcIC() recalculates the IC based on the pre-computed
     *  frequencies stored in the letter[] distribution and stores the result.
     */
    public double quickCalcIC() {
        ic = indexOfCoincidence(letter, N);
        return ic;
    }

    /**
     *  getIC() returns the pre-computed IC.
     */
    public double getIC() {
        return ic;
    }

    /**
     * add() adds the letter frequencies for its string to the letter[] array
     *  and refreshes the IC.
     * @param s -- a String of text.
     */
    public void add(String s) {
        for (int k = 0; k < s.length(); k++) {
            ++letter[s.charAt(k) - 'a'];
        }
        N += s.length();
        ic = quickCalcIC();
    }

    /**
     * subtract() subtracts the letter frequencies for its string from the letter[] array
     *  and refreshes the IC.
     * @param s -- a String of text.
     */
    public void subtract(String s) {
        for (int k = 0; k < s.length(); k++) {
            --letter[s.charAt(k) - 'a'];
        }
        N -= s.length();
        ic = quickCalcIC();
    }

    /**
     * addColumn() adds the letters frequencies for its string to the letter[] array
     *  and to the frequency distribution for a given column, then refreshes the
     *  overall IC. The cached per-column results are not recomputed.
     * @param s -- a String of text.
     * @param col -- the index of the column.
     */
    public void addColumn(String s, int col) {
        for (int k = 0; k < s.length(); k++) {
            int idx = s.charAt(k) - 'a';
            ++letter[idx];
            ++column[col][idx];
        }
        N += s.length();
        ic = quickCalcIC();
    }

    /**
     * subtractColumn() subtracts the letters frequencies for its string from the letter[] array
     *  and from the frequency distribution for a given column, then refreshes the
     *  overall IC. The cached per-column results are not recomputed.
     * @param s -- a String of text.
     * @param col -- the index of the column.
     */
    public void subtractColumn(String s, int col) {
        for (int k = 0; k < s.length(); k++) {
            int idx = s.charAt(k) - 'a';
            --letter[idx];
            --column[col][idx];
        }
        N -= s.length();
        ic = quickCalcIC();
    }

    /**
     * add() adds a letter count for a given character to the letter[] distribution.
     *  The stored IC is not refreshed; call quickCalcIC() when it is needed.
     * @param ch -- a letter
     * @param val -- its frequency.
     */
    public void add(char ch, int val) {
        letter[ch - 'a'] += val;
        N += val;   // The total grows by the number of occurrences added
    }

    /**
     * subtract() subtracts a letter count for a given character from the letter[] distribution.
     *  The stored IC is not refreshed; call quickCalcIC() when it is needed.
     * @param ch -- a letter
     * @param val -- its frequency.
     */
    public void subtract(char ch, int val) {
        letter[ch - 'a'] -= val;
        N -= val;   // The total shrinks by the number of occurrences removed
    }

}
