/*
 * File: TextStatistics.java
 * @author R. Morelli <ralph.morelli@trincoll.edu>
 * 
 * Description: This class defines a SINGLETON object that is designed to
 *  compute and store various useful statistics for a given text. The
 *  text is represented as a String.
 */
 
package hcrypto.analyzer;

import hcrypto.cipher.*;
import java.util.*;

public class TextStatistics implements Analyzer {

    // Reference kappa values.
    // NOTE(review): presumably the baselines a text's coincidence index is
    // compared against -- confirm against the callers.
    public static final double KAPPA_R = 0.0385;  // Kappa value for a random distribution of letters (approximately 1/26)
    public static final double KAPPA_P = 0.0667;  // Kappa value for a normal English distribution
    
    /**
     * Relative frequencies of characters in English text, indexed by character
     * code 0-127 (128 entries).  The slots for 'A'-'Z' (65-90) and 'a'-'z'
     * (97-122) hold the same 26 letter frequencies; every other slot holds
     * the same small value 0.0005.
     */
    public static final double englishFrequency[] = { 
	0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                // 0-9
        0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                // 10-19
        0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                // 20-29
        0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                // 30-39
        0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                // 40-49
        0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                // 50-59
        0.0005,0.0005,0.0005,0.0005,0.0005,                          // 60-64
	0.073,                              // A = 65, i.e., 73 occurrences out of 1000, roughly 7.3%
	0.009, 0.030, 0.044, 0.130,         // B-E; E is 13%
	0.028, 0.016, 0.035, 0.074,         // F-I
	0.002, 0.003, 0.035, 0.025,         // J-M
	0.078, 0.074, 0.027, 0.003,         // N-Q
	0.077, 0.063, 0.093, 0.027,         // R-U
	0.013, 0.016, 0.005, 0.019, 0.001,  // V-Z; Z = 90
        0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,                        // 91-96
	0.073,                              // a = 97, i.e., 73 occurrences out of 1000, roughly 7.3%
	0.009, 0.030, 0.044, 0.130,         // b-e; e is 13%
	0.028, 0.016, 0.035, 0.074,         // f-i
	0.002, 0.003, 0.035, 0.025,         // j-m
	0.078, 0.074, 0.027, 0.003,         // n-q
	0.077, 0.063, 0.093, 0.027,         // r-u
	0.013, 0.016, 0.005, 0.019, 0.001,  // v-z; z = 122
        0.0005,0.0005,0.0005,0.0005,0.0005 };                        // 123-127

/* 
	This 27 x 27 (space is character 27) table is taken from 
	Soukoreff, R. W. & MacKenzie, I. S. (1995). Theoretical upper and lower 
	bounds on typing speed using a stylus and soft keyboard. 
	Behaviour & Information Technology, 14, 370-379. 
	*/

    // NOTE(review): presumably the total number of characters in the corpus
    // behind digram_data -- TODO confirm against the cited paper.
    public static final int digram_chars = 107199;
    // 27 x 27 table of integer counts: one row per letter a-z plus a final row
    // for the space character (labelled _ below).
    // NOTE(review): presumably digram_data[i][j] is the number of times
    // character i is immediately followed by character j, with columns in the
    // same a-z, space order as the rows -- TODO confirm against the cited paper.
    public static final int digram_data[][] = {
	/*a*/	{2,144,308,382,1,67,138,9,322,7,146,664,177,1576,1,100,0,802,683,785,87,233,57,14,319,12,50},
	/*b*/	{136,14,0,0,415,0,0,0,78,18,0,98,1,0,240,0,0,88,15,7,256,1,1,0,13,0,36},
	/*c*/	{368,0,13,0,285,0,0,412,67,0,178,108,0,1,298,0,1,71,7,154,34,0,0,0,9,0,47},
	/*d*/	{106,1,0,37,375,3,19,0,148,1,0,22,1,2,137,0,0,83,95,3,52,5,2,0,51,0,2627},
	/*e*/	{670,8,181,767,470,103,46,15,127,1,35,332,187,799,44,90,9,1314,630,316,8,172,106,87,189,2,4904},
	/*f*/	{145,0,0,0,154,86,0,0,205,0,0,69,3,0,429,0,0,188,4,102,62,0,0,0,4,0,110},
	/*g*/	{94,1,0,0,289,0,19,288,96,0,0,55,1,31,135,0,0,98,42,6,57,0,1,0,2,0,686},
	/*h*/	{1164,0,0,0,3155,0,0,1,824,0,0,5,1,0,487,2,0,91,8,165,75,0,8,0,32,0,715},
	/*i*/	{23,7,304,260,189,56,233,0,1,0,86,324,255,1110,88,42,2,272,484,558,5,165,0,15,0,18,4},
	/*j*/	{2,0,0,0,31,0,0,0,9,0,0,0,0,0,41,0,0,0,0,0,56,0,0,0,0,0,0},
	/*k*/	{2,0,0,0,337,0,0,0,127,0,0,10,1,82,3,1,0,0,50,0,3,0,0,0,8,0,309},
	/*l*/	{332,4,6,289,591,59,7,0,390,0,38,546,30,1,344,34,0,11,121,74,81,17,19,0,276,0,630},
	/*m*/	{394,50,0,0,530,6,0,0,165,0,0,4,28,4,289,77,0,0,53,2,85,0,0,0,19,0,454},
	/*n*/	{100,2,98,1213,512,5,771,5,135,8,63,80,0,54,349,0,3,2,148,378,49,3,2,2,115,0,1152},
	/*o*/	{65,67,61,119,34,80,9,1,88,3,123,218,417,598,336,138,0,812,195,415,1115,136,398,2,47,5,294},
	/*p*/	{142,0,1,0,280,1,0,24,97,0,0,169,0,0,149,64,0,110,48,40,68,0,3,0,14,0,127},
	/*q*/	{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,66,0,0,0,0,0,0},
	/*r*/	{289,10,22,133,1139,13,59,21,309,0,53,71,65,106,504,9,0,69,318,190,89,22,5,0,145,0,1483},
	/*s*/	{196,9,47,0,626,0,1,328,214,0,57,48,31,16,213,107,8,0,168,754,175,0,32,0,34,0,2228},
	/*t*/	{259,2,31,1,583,1,2,3774,252,0,0,75,1,2,331,0,0,187,209,154,132,0,84,0,121,1,2343},
	/*u*/	{45,53,114,48,71,10,148,0,65,0,0,247,87,278,3,49,1,402,299,492,0,0,0,1,7,3,255},
	/*v*/	{27,0,0,0,683,0,0,0,109,0,0,0,0,0,33,0,0,0,0,0,1,0,0,0,11,0,0},
	/*w*/	{595,3,0,6,285,0,0,472,374,0,1,12,0,103,264,0,0,35,21,4,2,0,0,0,0,0,326},
	/*x*/	{17,0,9,0,9,0,0,0,10,0,0,0,0,0,1,22,0,0,0,23,8,0,0,0,0,0,21},
	/*y*/	{11,10,0,0,152,0,1,1,32,0,0,7,1,0,339,16,0,0,81,2,1,0,2,0,0,0,1171},
	/*z*/	{3,0,0,0,26,0,0,0,2,0,0,4,0,0,2,0,0,0,3,0,0,0,0,0,3,9,2},
	/*_*/	{1882,1033,864,515,423,1059,453,1388,237,93,152,717,876,478,721,588,42,494,1596,3912,134,116,1787,0,436,2,0},
    };

    // The text being analyzed; assigned in setup().
    private String text;
    // Frequency table built from the text in setup().
    private FrequencyTable freqTable;
    
    // The three statistics below are filled in by calcStatistics().
    public int charCount = 0;       
    public int alphabeticsCount = 0;       
    public double coincidenceIndex = 0;
    // Not referenced anywhere in the visible part of this class.
    private int[] histogram;

    /**
     * Creates an instance with no text.  Neither the text nor the frequency
     * table is initialized here; both are assigned in setup(String), so the
     * accessors that read the frequency table need setup(String) to have
     * been called first.
     */
    public TextStatistics() {
    }

    /**
     * Creates an instance and immediately analyzes the given text as-is,
     * i.e. without stripping any characters from it.
     *
     * @param text the text to analyze
     */
    public TextStatistics(String text) {
        this(text, false);
    }
    

    public TextStatistics(String text, boolean stripString) {
        if (stripString)
            setup(stripString(text));
        else
            setup(text);
    }
    
    /** Static Utility Methods Used by Analyzers */

    /**
     * Looks up the relative English frequency of a character in
     * {@link #englishFrequency}.
     *
     * The character is lower-cased first.  The table only covers character
     * codes 0-127; a character that falls outside it (for example an
     * accented letter) used to raise ArrayIndexOutOfBoundsException and now
     * yields the same small value the table assigns to non-letter characters.
     *
     * @param ch the character to look up
     * @return the tabulated frequency, or 0.0005 for characters outside the table
     */
    public static double getEnglishFrequency( char ch ) {
        // Same value the table stores in every non-letter slot.
        final double outsideTableFrequency = 0.0005;

        ch = Character.toLowerCase(ch);
        if (ch >= englishFrequency.length) {
            return outsideTableFrequency;
        }
        return englishFrequency[ch];
    }

    /**
     * Computes the chi-square statistic
     * sum over k of (expected[k] - observed[k])^2 / expected[k].
     *
     * Each term is evaluated in double arithmetic.  The previous version
     * divided two ints, so every term was truncated toward zero before being
     * added to the sum, and the squared difference could overflow.
     *
     * @param expected the expected counts; its length determines how many
     *                 cells are compared
     * @param observed the observed counts; must be at least as long as
     *                 {@code expected}
     * @return the chi-square statistic (0 for empty input)
     * @throws ArithmeticException if an expected count is zero, as before
     */
    public static double chiSquare(int expected[], int observed[]) {
        double sum = 0;
        for (int k = 0; k < expected.length; k++) {
            if (expected[k] == 0) {
                // Keep the exception type the integer division used to raise.
                throw new ArithmeticException("expected[" + k + "] is zero");
            }
            double diff = (double) expected[k] - observed[k];
            sum += diff * diff / expected[k];
        }
        return sum;
    }

    /**
     * Finds the position of the smallest element of an array.  When the
     * minimum occurs more than once, the first occurrence wins.
     *
     * @param a the values to scan; must contain at least one element
     * @return the index of the smallest value
     */
    public static int getIndexOfMinimum(double a[]) {
        double smallest = a[0];
        int best = 0;
        int i = 1;
        while (i < a.length) {
            final double candidate = a[i];
            if (candidate < smallest) {
                best = i;
                smallest = candidate;
            }
            i++;
        }
        return best;
    }


    /**
     * Returns a copy of the string that keeps only the characters the given
     * alphabet reports as members, in their original order.
     *
     * @param s the string to filter
     * @param a the alphabet whose isInAlphabet test decides what is kept
     * @return the filtered string
     */
    public static String removeNonAlphabetics(String s, Alphabet a) {
        final StringBuffer kept = new StringBuffer();
        final char[] chars = s.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            if (!a.isInAlphabet(chars[i])) {
                continue;
            }
            kept.append(chars[i]);
        }
        return kept.toString();
    }

    /**
     * Removes repeated words from a whitespace-separated string, keeping the
     * first occurrence of each word in its original position.
     *
     * Words are compared as whole tokens.  The previous version searched the
     * output built so far for the word as a substring, so a word contained
     * in an earlier one (e.g. "the" after "other") was wrongly discarded.
     *
     * @param s the text to process; split on whitespace by StringTokenizer
     * @return the distinct words in first-seen order, each followed by a
     *         single space (so the result ends with a space unless empty)
     */
    public static String removeDuplicates (String s) {
        Set<String> seen = new HashSet<String>();
        StringBuilder sb = new StringBuilder();
        StringTokenizer st = new StringTokenizer(s);
        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            // add() returns false when the word has already been recorded.
            if (seen.add(word)) {
                sb.append(word);
                sb.append(" ");
            }
        }
        return sb.toString();
    }



    /** Part of Analyzer interface */

    /**
     * Stores the text, builds a FrequencyTable for it (constructed with
     * AlphabetFactory.ALPH_az) and then computes the statistics.
     *
     * @param text the text to analyze
     */
    public void setup(String text) {
        this.text = text;
        this.freqTable = new FrequencyTable(text, AlphabetFactory.ALPH_az);
        this.calcStatistics();
    }
    /** Does nothing: all of the work is performed by setup(String). */
    public void run() { /* Everything is done in setup */ }
    
    /**
     * Returns the statistics report, which is the same text that
     * toString() produces.
     *
     * @return the report text
     */
    public String getReport() {
        final String report = toString();
        return report;
    }
    
    /**
     * Writes the text produced by toString() to standard output, without
     * adding a line terminator of its own.
     */
    public void print(){
        final String report = toString();
        System.out.print(report);
    }
    
    /**
     * Formats the statistics as a short multi-line report: a heading, the
     * character count, the alphabetic-character count and the coincidence
     * index (at most three fraction digits, default number format).
     *
     * @return the report text, one statistic per line
     */
    public String toString() {
        final java.text.NumberFormat formatter = java.text.NumberFormat.getNumberInstance();
        formatter.setMaximumFractionDigits(3);

        final StringBuffer out = new StringBuffer("Statistics \n");
        out.append("Number of characters \t").append(charCount).append("\n");
        out.append("Alphabetic characters \t").append(alphabeticsCount).append("\n");
        out.append("Coincidence Index \t").append(formatter.format(coincidenceIndex)).append("\n");
        return out.toString();
    }

    /**
     * Refreshes the public statistics fields: the two counts are read from
     * the frequency table, and the coincidence index comes from a fresh
     * IndexOfCoincidence that is set up with the current text.
     */
    public void calcStatistics() {
        this.charCount = this.freqTable.getCharCount();
        this.alphabeticsCount = this.freqTable.getAlphabeticsCount();

        final IndexOfCoincidence analyzer = new IndexOfCoincidence();
        analyzer.setup(this.text);
        this.coincidenceIndex = analyzer.getIOC();
    }
    
    /**
     * Returns the coincidence index stored by the last call to
     * calcStatistics() (0 if none has been computed yet).
     *
     * @return the stored coincidence index
     */
    public double getCoincidenceIndex() {
        return this.coincidenceIndex;
    }

    /**
     * Returns the frequency table built by setup(String); this is the
     * internal instance, not a copy.
     *
     * @return the frequency table, or null if setup has not run
     */
    public FrequencyTable getFrequencyTable() {
        return this.freqTable;
    }

    /**
     * Returns a copy of the string containing only the characters for which
     * Character.isLetter is true, in their original order.
     */
    private String stripString (String s) {
        final char[] chars = s.toCharArray();
        final StringBuffer letters = new StringBuffer();
        for (int i = 0; i < chars.length; i++) {
            final char c = chars[i];
            if (Character.isLetter(c)) {
                letters.append(c);
            }
        }
        return letters.toString();
    }
     
    // More methods that return useful statistics

    /**
     * Returns the character count reported by the frequency table.
     *
     * @return the frequency table's character count
     */
    public int getCharCount() {
        final int total = freqTable.getCharCount();
        return total;
    }

    /**
     * Returns the count the frequency table holds for the given character.
     *
     * @param inChar the character to look up
     * @return the frequency table's count for that character
     */
    public int getFrequency(char inChar) {
        final int count = freqTable.getCount(inChar);
        return count;
    }

    /**
     * Returns the kth highest frequency character, as reported by the
     * frequency table.
     *
     * @param k the rank to look up (whether ranks start at 0 or 1 is
     *          decided by FrequencyTable -- TODO confirm)
     * @return the frequency table's answer for rank k
     */
    public String getKHighestFreqChar (int k) {
        final String ranked = freqTable.getKHighestFreqChar(k);
        return ranked;
    }

    /**
     * Returns the frequency table's own report (a 2-D table of frequencies).
     *
     * @return the text produced by the frequency table's getReport()
     */
    public String getFrequencyReport() {
        final String report = freqTable.getReport();
        return report;
    }






}
