package hcrypto.analyzer;

 /*
 * File: NgramArray.java
 * @author R.Walde <rwalde@nyc.rr.com>
 *
 *  Description: This class stores the frequencies of N successive
 *  characters appearing in a large text to give typical frequencies
 *  in English (or other language).  It is intended to be used with
 *  N = 2, 3, or 4  (bigrams, trigrams, or  tetragrams).
 *  These frequencies can be used to cryptanalyze simple substitution
 *  ciphers and cryptograms in particular.  Using the sum of the inverses
 *  of the frequencies as an evaluation of how accurately a substitution
 *  matches the usual N-gram frequencies in a language is an idea
 *  described by Alex Griffing the developer of the "Automatic Cryptogram
 *  Solver".
 *
 * <P>Copyright: This program is in the public domain. You can modify it as you
 *  see fit as long as you properly acknowledge its original author.
 *  It would also be nice if you forwarded your changes to
 *  <A HREF= "mailto:ralph.morelli@trincoll.edu">ralph.morelli@trincoll.edu</A> so
 *  they can possibly be added to the "official" version.
 *  MODIFIED by REW 5/2/03 to define freqDist(), sqrDist(), and setStepSize().
 */

import hcrypto.cipher.*;
import java.io.*;
import java.text.NumberFormat;

/**
 * Table of N-gram occurrence counts (N = 2, 3 or 4) gathered from a reference
 * text, plus several scoring functions that compare a candidate text against
 * that table.
 *
 * <p>Each N-gram is mapped to an array index by packing the integer code of
 * each of its characters (as returned by {@code Alphabet.charToInt}) into
 * 5 bits, most recent character in the low bits.  This is why the alphabet
 * may hold at most 32 symbols.  {@code freq[index]} holds the raw number of
 * times that N-gram was seen by {@link #setup(String)}; reciprocals and
 * relative frequencies are derived on the fly by the scoring methods.
 *
 * <p>History: a one-off {@code writeBigramFreqToFile()} helper (since
 * removed) was used to dump the 27x27 table of
 * {@code freq[32*j+k]/(textSize-1)} that, slightly edited, became
 * bigramFreq.txt.
 */
public class NgramArray implements Analyzer{

    /** Number of bits each character code occupies inside an N-gram index. */
    private static final int BITS_PER_CHAR = 5;

    /** Largest alphabet size that fits in BITS_PER_CHAR bits (32). */
    private static final int MAX_ALPH_SIZE = 1 << BITS_PER_CHAR;

    /** Alphabet size required by the two *DiffBigramDist() methods. */
    private static final int BIGRAM_ALPH_SIZE = 27;

    /** How many of the most frequent N-grams are remembered (added 6/9/2003). */
    private static final int NUMMOSTFREQ = 100;

    private Alphabet alphabet; // Defines which chars occur in N-grams.
    private int alphSize;      // The size of the alphabet.
    private int arrSize;       // The size of the array freq[].
    private int mask;          // Keeps only the low NN*5 bits of an index.
    private int NN;            // The N of the N-gram = sequence of N chars.
    private int textSize;      // Number of alphabet chars in the text file.
    private float[] freq;      // Raw occurrence count of each N-gram.
    private int stepSize;      // Score only every stepSize-th N-gram.

    // Most frequent N-grams, ordered from most to least frequent.
    private float[] mostFreqValue = new float[NUMMOSTFREQ];
    private int[] mostFreqIndex = new int[NUMMOSTFREQ];
    private String[] mostFreqWord = new String[NUMMOSTFREQ];

    /**
     * Builds the N-gram table from a reference text.
     *
     * @param N        size of the N-gram; must be 2, 3 or 4
     * @param fileName file containing a large text typical of the language
     *                 of the cryptotext
     * @param alph     alphabet describing which characters take part in
     *                 N-grams; must be non-null and have at most 32 symbols
     * @throws Exception if the alphabet is null or too large, or N is out
     *                   of range.  Problems reading the file are reported
     *                   on stdout by {@link #setup(String)}, not thrown.
     */
    public NgramArray(int N, String fileName, Alphabet alph) throws Exception{
        if (alph == null) {
            throw new Exception("In NgramArray: Null alphabet");
        }
        NN = N;
        alphabet = alph;
        stepSize = 1;
        alphSize = alphabet.getSize();

        // Same precedence as before: alphabet size is reported first.
        if (alphSize > MAX_ALPH_SIZE) {
            throw new Exception("In NgramArray: Invalid Alphabet size = " + alphSize);
        }
        if ((NN < 2) || (NN > 4)) {
            throw new Exception("In NgramArray: Invalid N-gram size = " + NN);
        }

        mask = (1 << (NN * BITS_PER_CHAR)) - 1;
        // The leading character only needs alphSize slots, the others 32 each.
        arrSize = alphSize * (1 << ((NN - 1) * BITS_PER_CHAR));
        freq = new float[arrSize];
        setup(fileName);
    }  //NgramArray(N, filename,alphabet)

    /**
     * From the Analyzer interface.  Reads the given file, lower-cases it and
     * counts every N-gram of alphabet characters into freq[].  Characters
     * that are not in the alphabet (including line breaks) are skipped, so
     * N-grams may span them.
     *
     * <p>Counts are added to whatever freq[] already holds, while textSize
     * is overwritten with the number of alphabet characters in this file.
     * Any exception is reported on stdout and otherwise swallowed, as the
     * method signature cannot propagate checked exceptions.
     *
     * @param fileName path of the reference text
     */
    public void setup(String fileName){
        BufferedReader inStream = null;
        try {
            System.out.println("Filename= " + fileName);
            inStream = new BufferedReader(new FileReader(fileName));

            int index = 0; // rolling index of the last NN characters
            int chNum = 0; // number of alphabet characters seen so far
            String line;
            while ((line = inStream.readLine()) != null) {
                line = line.toLowerCase();
                // Measured after lower-casing, which can change the length.
                int len = line.length();
                for (int pos = 0; pos < len; pos++) {
                    char ch = line.charAt(pos);
                    if (alphabet.isInAlphabet(ch)) {
                        chNum++;
                        index = nextIndex(index, alphabet.charToInt(ch));
                        // The first NN-1 characters do not complete an N-gram.
                        if (chNum >= NN) {
                            freq[index] += 1.0f;
                        }
                    }
                }
            }
            textSize = chNum;

            setupMostFreqWords(); // Added 6/9/2003
        } catch (Exception exc) {
            System.out.println("In NgramArray  setup() - " +exc.toString());
            exc.printStackTrace();
        } finally {
            // Release the file handle on both the success and failure paths.
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException ignored) {
                    // Reading is already finished; a failed close loses nothing.
                }
            }
        }
    } // setup()

    /**
     * Returns s with every space character removed.  Currently unused; it
     * was written for a Playfair experiment that counts digraphs only.
     */
    private String removeWhiteSpace(String s) {
        StringBuffer sb = new StringBuffer();
        for (int k = 0; k < s.length(); k++) {
            char ch = s.charAt(k);
            if (ch != ' ') {
                sb.append(ch);
            }
        }
        return sb.toString();
    }

    /**
     * Shifts one more character code into a rolling N-gram index and drops
     * the bits of the character that fell out of the N-wide window.
     */
    private int nextIndex(int index, int code) {
        return ((index << BITS_PER_CHAR) | code) & mask;
    }

    /**
     * Tells whether the N-gram ending at the count-th alphabet character
     * (1-based) is scored: it must be complete and fall on the step grid.
     */
    private boolean isScored(int count) {
        return (count >= NN) && (count % stepSize == 0);
    }

    /** Reciprocal score 1/(1 + count) of the N-gram stored at index. */
    private float recipScore(int index) {
        return (float)(1.0/(1.0 + freq[index]));
    }

    /**
     * textSize squared as a float.  The product is formed in double because
     * textSize*textSize overflows int once textSize exceeds 46340.
     */
    private float squaredTextSize() {
        return (float)((double) textSize * (double) textSize);
    }

    /**
     * From the Analyzer interface.
     *
     * @return the same text as {@link #toString()}
     */
    public String getReport() {
        return toString();
    }

    /**
     * From the Analyzer interface.
     * Everything is done in setup.
     */
    public void run() {
    }

    /**
     * Summarises the table: alphabet size, N, array size and text size.
     * For tetragrams the count stored for "then" is appended as a sample.
     * Note that the report text still says "reciprocals" for historical
     * reasons although freq[] holds raw counts.
     */
    public String toString(){
        StringBuffer sb = new StringBuffer();
        sb.append("\n");
        sb.append("Alphabet size = " + alphSize+"\n");
        sb.append("This array stores reciprocals of frequencies of "+NN+"-grams\n");
        sb.append("Array size = "+ arrSize+"\n");
        sb.append("Number of alphabet chars in the text file= "+textSize+"\n");
        try {
            if (NN == 4) {
                String probe = "then";
                int index = 0;
                for (int i = 0; i < probe.length(); i++) {
                    index = nextIndex(index, alphabet.charToInt(probe.charAt(i)));
                }
                sb.append("freq of /then/ = " + freq[index] + "\n");
            }
        } catch (Exception exc) {
            System.out.println("in ngramArray toString() - " + exc.toString());
        }
        return sb.toString();
    }//toString()

    /** Prints {@link #toString()} to stdout. */
    public void print(){
        System.out.println(toString());
    } //print()

    /** @return the size of the alphabet */
    public int getAlphSize() {
        return alphSize;
    } //getAlphSize()

    /** @return the alphabet passed to the constructor */
    public Alphabet getAlphabet() {
        return alphabet;
    } //getAlphabet()

    /** @return N, the length of the N-grams */
    public int getNN() {
        return NN;
    } //getNN()

    /** @return the length of the count array */
    public int getArrSize() {
        return arrSize;
    } //getArrSize()

    /**
     * Sets the step used by the distance functions; only N-grams ending at
     * an alphabet-character position that is a multiple of the step are
     * scored.  Non-positive values are ignored.
     */
    public void setStepSize(int newStep) {
        if (newStep > 0) {
            stepSize = newStep;
        }
    } //setStepSize()

    /** @return the current step size (1 unless changed) */
    public int getStepSize() {
        return stepSize;
    } //getStepSize()

    /**
     * Returns the k-th most frequent N-gram (0 = most frequent).
     *
     * @param k rank, 0 .. 99; other values raise
     *          ArrayIndexOutOfBoundsException
     */
    public String getMostFreqWord(int k) {  //Added 6/9/2003
        return mostFreqWord[k];
    } //getMostFreqWord()

    /**
     * Fills mostFreqValue/mostFreqIndex/mostFreqWord with the NUMMOSTFREQ
     * largest entries of freq[], in descending order, using an insertion
     * into a fixed-size sorted list.  Slots that are never filled keep
     * count 0 and index 0.  Exceptions are printed and swallowed.
     */
    private void setupMostFreqWords(){
        try {
            for (int j = 0; j < NUMMOSTFREQ; j++) {
                mostFreqValue[j] = 0.0f;
                mostFreqIndex[j] = 0;
            }

            final int last = NUMMOSTFREQ - 1;
            for (int k = 0; k < arrSize; k++) {
                float count = freq[k];
                if (count > mostFreqValue[last]) {
                    // Slide smaller entries down one slot, dropping the last.
                    int j = last;
                    while ((j > 0) && (count > mostFreqValue[j - 1])) {
                        mostFreqValue[j] = mostFreqValue[j - 1];
                        mostFreqIndex[j] = mostFreqIndex[j - 1];
                        j--;
                    }
                    mostFreqValue[j] = count;
                    mostFreqIndex[j] = k;
                }
            }

            for (int j = 0; j < NUMMOSTFREQ; j++) {
                // Unpack the index 5 bits at a time, last character first.
                int m = mostFreqIndex[j];
                String word = "";
                for (int k = 0; k < NN; k++) {
                    word = alphabet.intToChar(m % MAX_ALPH_SIZE) + word;
                    m = m / MAX_ALPH_SIZE;
                }
                mostFreqWord[j] = word;
            }
        } catch (Exception exc) {
            System.out.println(exc.toString());
        }
    } //setupMostFreqWords()

    /**
     * Writes the most frequent N-grams to a file, each preceded by '_',
     * five per line with a closing '_' at the end of each line.
     * I/O problems are printed to stdout, not thrown.
     *
     * @param fileName path of the file to create or overwrite
     */
    public void writeFreqWordsToFile(String fileName){
        FileWriter outStream = null;
        try {
            outStream = new FileWriter(fileName);
            for (int k = 0; k < NUMMOSTFREQ; k++) {
                outStream.write("_"+getMostFreqWord(k));
                if ((k+1)%5 == 0) {
                    outStream.write("_\n");
                }
            }
            // Closed here so that a failing flush is reported below.
            outStream.close();
        } catch (Exception exc) {
            System.out.println(exc.toString());
        } finally {
            // Covers the failure path; closing an already closed writer is a no-op.
            if (outStream != null) {
                try {
                    outStream.close();
                } catch (IOException ignored) {
                    // Already reported above if the first close failed.
                }
            }
        }
    }//writeFreqWordsToFile()

    /**
     * Counts the bigrams of ctext into counts[first][second], honouring
     * stepSize, and returns the number of alphabet characters in ctext.
     */
    private int countBigrams(String ctext, float[][] counts) throws Exception {
        int prev = 0;
        int cur = 0;
        int charCount = 0;
        for (int k = 0; k < ctext.length(); k++) {
            char ch = ctext.charAt(k);
            if (alphabet.isInAlphabet(ch)) {
                charCount++;
                prev = cur;
                cur = alphabet.charToInt(ch);
                if (isScored(charCount)) {
                    counts[prev][cur] += 1.0f;
                }
            }
        }
        return charCount;
    }

    /**
     * Bigram count divided by (chars - 1), the number of bigrams in a text
     * of that many characters.  Texts too short to contain a bigram have
     * relative frequency 0 instead of the NaN that 0/0 would produce.
     */
    private static float relativeFreq(float count, int chars) {
        return (chars > 1) ? count / (chars - 1) : 0.0f;
    }

    /**
     * Computes a distance between this NgramArray and the distribution of
     * bigrams in the string ctext: the sum, over all 27x27 bigrams, of the
     * absolute difference between the relative frequency in the reference
     * text and the relative frequency in ctext.
     *
     * @param ctext text to compare
     * @return the summed absolute differences
     * @throws Exception if this is not a bigram table over a 27-symbol
     *                   alphabet
     */
    public float absDiffBigramDist(String ctext)throws Exception{
        if ((NN != 2) || (alphSize != BIGRAM_ALPH_SIZE)) {
            throw new Exception("Invalid Alphabet (not 27 letters or NN != 2 in absDiffBigramDist()");
        }

        float[][] bigramFreq = new float[BIGRAM_ALPH_SIZE][BIGRAM_ALPH_SIZE];
        int charCount = countBigrams(ctext, bigramFreq);

        float total = 0.0f;
        for (int i = 0; i < BIGRAM_ALPH_SIZE; i++) {
            for (int j = 0; j < BIGRAM_ALPH_SIZE; j++) {
                float expected = relativeFreq(freq[MAX_ALPH_SIZE * i + j], textSize);
                float observed = relativeFreq(bigramFreq[i][j], charCount);
                total += Math.abs(expected - observed);
            }
        }
        return total;
    } //absDiffBigramDist(ctext)

    /**
     * Same comparison as {@link #absDiffBigramDist(String)} but sums the
     * squares of the differences instead of their absolute values.
     *
     * @param ctext text to compare
     * @return the summed squared differences
     * @throws Exception if this is not a bigram table over a 27-symbol
     *                   alphabet
     */
    public float sqrDiffBigramDist(String ctext)throws Exception{
        if ((NN != 2) || (alphSize != BIGRAM_ALPH_SIZE)) {
            throw new Exception("Invalid Alphabet (not 27 letters or NN != 2 in sqrDiffBigramDist()");
        }

        float[][] bigramFreq = new float[BIGRAM_ALPH_SIZE][BIGRAM_ALPH_SIZE];
        int charCount = countBigrams(ctext, bigramFreq);

        float total = 0.0f;
        for (int i = 0; i < BIGRAM_ALPH_SIZE; i++) {
            for (int j = 0; j < BIGRAM_ALPH_SIZE; j++) {
                float diff = relativeFreq(freq[MAX_ALPH_SIZE * i + j], textSize)
                           - relativeFreq(bigramFreq[i][j], charCount);
                total += diff * diff;
            }
        }
        return total;
    } //sqrDiffBigramDist(ctext)

    /**
     * Scores ctext by summing 1/(1 + count) over its scored N-grams, where
     * count is how often the N-gram occurred in the reference text.  Rare
     * N-grams therefore contribute most; lower totals mean a better match.
     *
     * @param ctext text to score; characters outside the alphabet are skipped
     * @return the summed reciprocal scores
     */
    public float recipDist(String ctext)throws Exception{
        float total = 0.0f;
        int index = 0;
        int charCount = 0;
        for (int k = 0; k < ctext.length(); k++) {
            char ch = ctext.charAt(k);
            if (alphabet.isInAlphabet(ch)) {
                charCount++;
                index = nextIndex(index, alphabet.charToInt(ch));
                if (isScored(charCount)) {
                    total += recipScore(index);
                }
            }
        }
        return total;
    } //recipDist(ctext)

    /**
     * Same as {@link #recipDist(String)} but also reports where the extreme
     * scores occurred.
     *
     * @param ctext text to score
     * @param indx  output array of length at least 2: indx[0] receives the
     *              position in ctext of the last character of the
     *              highest-scoring (rarest) N-gram, indx[1] that of the
     *              lowest-scoring one.  Both are 0 if nothing was scored.
     * @return the summed reciprocal scores
     */
    public float recipDist(String ctext, int indx[])throws Exception{
        float min = (float)1000000.0;
        float max = 0.0f;
        int minLoc = 0;
        int maxLoc = 0;
        float total = 0.0f;
        int index = 0;
        int charCount = 0;

        for (int k = 0; k < ctext.length(); k++) {
            char ch = ctext.charAt(k);
            if (alphabet.isInAlphabet(ch)) {
                charCount++;
                index = nextIndex(index, alphabet.charToInt(ch));
                if (isScored(charCount)) {
                    float score = recipScore(index);
                    total += score;
                    // Checked independently: a score that sets a new maximum
                    // (e.g. the first one) may also be the minimum so far.
                    if (score > max) {
                        max = score;
                        maxLoc = k;
                    }
                    if (score < min) {
                        min = score;
                        minLoc = k;
                    }
                }
            }
        }
        indx[0] = maxLoc;
        indx[1] = minLoc;
        return total;
    } //recipDist(ctext, indx)

    /**
     * Same as {@link #recipDist(String)} but also records each individual
     * score.
     *
     * @param ctext  text to score
     * @param scores output array, at least as long as ctext; scores[k] is
     *               set to the score of the N-gram ending at position k.
     *               Entries for positions that are not scored are left
     *               untouched.
     * @return the summed reciprocal scores
     */
    public float recipDist(String ctext, double scores[])throws Exception{
        float total = 0.0f;
        int index = 0;
        int charCount = 0;

        for (int k = 0; k < ctext.length(); k++) {
            char ch = ctext.charAt(k);
            if (alphabet.isInAlphabet(ch)) {
                charCount++;
                index = nextIndex(index, alphabet.charToInt(ch));
                if (isScored(charCount)) {
                    float score = recipScore(index);
                    total += score;
                    scores[k] = score;
                }
            }
        }
        return total;
    } //recipDist(ctext, scores)

    /**
     * Reciprocal distance for a text already converted to character codes.
     * This avoids the per-character alphabet lookups and is intended for
     * the inner loop of an N-gram hill climber, so it produces no output.
     *
     * @param cnums character codes of the text; presumably each must be a
     *              valid alphabet code (0 .. alphSize-1) - not checked here
     * @return the summed reciprocal scores, or 0 if cnums is shorter than N
     */
    public float recipDist(int[] cnums)throws Exception{
        float total = 0.0f;
        int len = cnums.length;
        if (len < NN) {
            return total; // Should not happen
        }
        int index = 0;
        for (int k = 0; k < len; k++) {
            index = nextIndex(index, cnums[k]);
            if (isScored(k + 1)) {
                total += recipScore(index);
            }
        }
        return total;
    } //recipDist(cnums)

    /**
     * Scores ctext by summing the reference counts of its scored N-grams
     * and dividing by textSize.  Higher totals mean a better match.
     *
     * @param ctext text to score; characters outside the alphabet are skipped
     * @return the summed counts divided by textSize
     */
    public float freqDist(String ctext)throws Exception{
        float total = 0.0f;
        int index = 0;
        int charCount = 0;
        for (int k = 0; k < ctext.length(); k++) {
            char ch = ctext.charAt(k);
            if (alphabet.isInAlphabet(ch)) {
                charCount++;
                index = nextIndex(index, alphabet.charToInt(ch));
                if (isScored(charCount)) {
                    total += freq[index];
                }
            }
        }
        return total/textSize;
    } //freqDist(ctext)

    /**
     * {@link #freqDist(String)} for a text already converted to character
     * codes; more efficient inside an N-gram hill climber.
     *
     * @param cnums character codes of the text
     * @return the summed counts divided by textSize, or 0 if cnums is
     *         shorter than N
     */
    public float freqDist(int[] cnums)throws Exception{
        float total = 0.0f;
        int len = cnums.length;
        if (len < NN) {
            return total; // Should not happen
        }
        int index = 0;
        for (int k = 0; k < len; k++) {
            index = nextIndex(index, cnums[k]);
            if (isScored(k + 1)) {
                total += freq[index];
            }
        }
        return total/textSize;
    } //freqDist(cnums)

    /**
     * Scores ctext by summing the squared reference counts of its scored
     * N-grams and dividing by textSize squared.
     *
     * @param ctext text to score; characters outside the alphabet are skipped
     * @return the summed squared counts divided by textSize squared
     */
    public float sqrDist(String ctext)throws Exception{
        float total = 0.0f;
        int index = 0;
        int charCount = 0;
        for (int k = 0; k < ctext.length(); k++) {
            char ch = ctext.charAt(k);
            if (alphabet.isInAlphabet(ch)) {
                charCount++;
                index = nextIndex(index, alphabet.charToInt(ch));
                if (isScored(charCount)) {
                    total += freq[index] * freq[index];
                }
            }
        }
        return total/squaredTextSize();
    } //sqrDist(ctext)

    /**
     * {@link #sqrDist(String)} for a text already converted to character
     * codes; more efficient inside an N-gram hill climber.
     *
     * @param cnums character codes of the text
     * @return the summed squared counts divided by textSize squared, or 0
     *         if cnums is shorter than N
     */
    public float sqrDist(int[] cnums)throws Exception{
        float total = 0.0f;
        int len = cnums.length;
        if (len < NN) {
            return total; // Should not happen
        }
        int index = 0;
        for (int k = 0; k < len; k++) {
            index = nextIndex(index, cnums[k]);
            if (isScored(k + 1)) {
                total += freq[index]*freq[index];
            }
        }
        return total/squaredTextSize();
    } //sqrDist(cnums)

    /**
     * Small demonstration: builds a bigram table from book.txt over the
     * alphabet a-z plus space and prints the report and the five distance
     * measures for a sample sentence.
     */
    public static void main(String[] args){
        try {
            NumberFormat nf = NumberFormat.getInstance();
            nf.setMaximumFractionDigits(6);
            nf.setMinimumFractionDigits(6);
            StringBuffer sb = new StringBuffer();

            char[] arr = {'a','z',' ',' '};
            Alphabet alph = new Alphabet(arr);
            NgramArray nga = new NgramArray(2,"book.txt",alph);
            System.out.println(nga.getReport());

            String test = "now is the time for all good men to come to the aid of their country";
            System.out.println("test = " + test);
            sb.append("recipDist(test) = " + nf.format(nga.recipDist(test)) + "\n");
            sb.append("freqDist(test) = " + nf.format(nga.freqDist(test)) + "\n");
            sb.append("sqrDist(test) = " + nf.format(nga.sqrDist(test)) + "\n");
            sb.append("absDiffBigramDist(test) = " + nf.format(nga.absDiffBigramDist(test)) + "\n");
            sb.append("sqrDiffBigramDist(test) = " + nf.format(nga.sqrDiffBigramDist(test)) + "\n");
            System.out.println(sb.toString());
        } catch (Exception exc) {
            System.out.println(exc.toString());
        }
    }//main()

} // NgramArray class
