/*
 * File: WordBasedAnalyzer.java
 * @author R. Morelli <ralph.morelli@trincoll.edu>
 * 
 * Description: This class analyzes traditional cryptograms of
 *  the sort you find in the newspaper. It assumes the text
 *  is a word-delimited cryptogram created using a simple substitution 
 *  cipher. 
 * 
 * Credits: The algorithm here is based on the algorithm reported by
 *  George W. Hart in To decode short cryptograms, CACM 37(9), Sept. 1994, 
 *  102-108.
 *
 * At present this analyzer only works for the alphabet 'a' to 'z'.
 *
 *  To compile and run from the TestAnalyzer application:
 *
 *  cd ~crypto/hcryptoj/1.4/applications/testanalyzer
 *  javac -classpath ../../classes -d ../../classes ../../source/hcrypto/analyzer/NgramAnalyzer.java
 *  java -classpath ../../classes:. TestAnalyzer analyzers.NgramAnalyzer ga_paramfiles ngram.cgrams.txt
 *
 */

package analyzers;
import hcrypto.analyzer.*;
import hcrypto.cipher.*;
import hcrypto.engines.*;
import hcrypto.provider.*;
import java.util.*;
import java.text.*;

public class WordBasedAnalyzer extends CryptoAnalyzer implements Analyzer {
    private static final int MAX_KEY = 26;
    private static final int MAX_DEPTH = 36;
    private static final int MAX_LOOPS = 50000;
    //    private int dict_type = PatternDictionary.KUCERA_3500;
    private int dict_type = PatternDictionary.KUCERA_340;

    private String bestDecrypt;        // Best result
    private StringBuffer resultSB;
    private Cipher cipher;
    private SubstitutionKey sKey;
    private PatternDictionary dict;
    
    private DateFormat dateformat = DateFormat.getTimeInstance(DateFormat.LONG);

    private StringBuffer encipherKey;

    private int nLoops = 0;
    private double bestScore = 0;
    private int nTokens = 0;
    private Token tokens[];
    private long time0 = System.currentTimeMillis();
    private long keyCount= 0;
    protected GaParameters params;
    
    /**
     * WordBasedAnalyzer() -- this constructor is given an object containing parameter settings
     * @param params -- an object containing param1=val1 param2=val2 ...
     */
    public WordBasedAnalyzer(GaParameters params) {
	this.params = params;
	dict_type = params.eval_dict;
    }

    public WordBasedAnalyzer() { } // Required for TestAnalyzer

    /**
     * This method initializes the Analyzer. The text is broken into tokens
     * that are stored in an array.  Each token is associated with a pattern
     * -- 'that' has the pattern '1231'. The tokens are sorted with those
     * having the smallest (non empty) pattern set given preference.
     * @param text a String pointing to the text being analyzed
     */
    public void setup(String text) { 
	super.setup(text);       
	num.setMaximumFractionDigits(2);
	initGa();
    }

    /**
     *  This method is called by setup().
     */ 
    public void initGa() {
        dict = new PatternDictionary(dict_type);
        resultSB = new StringBuffer();

        Provider.addProvider(new DefaultProvider("Default"));     // Setup the cipher engine and key
        cipher = Cipher.getInstance("Substitution");
        sKey = (SubstitutionKey) HistoricalKey.getInstance("Substitution", cipher.getProvider());

        encipherKey = new StringBuffer();   // Initialize the key data
        for (int k = 0; k < MAX_KEY; k++)
	    encipherKey.append("*");

        tokens = initWordArray(text);     // Break the cryptogram into tokens based on their
        java.util.Arrays.sort(tokens);    //  patterns, giving prefernce to less frequent patterns.
        nTokens = tokens.length;

	nLoops = 0;
	bestScore = 0;

        // The next lines of code can be commented out after development.
	//        for (int k = 0; k < tokens.length; k++)
	//            System.out.println (tokens[k].getToken() + " " + tokens[k].getPattern() + "(" + tokens[k].getScore() + ") " );
	//	resultSB.append("UNICITY (depth limit) = " + MAX_UNICITY + "\n");
	//	display.setText("Simple Substitution Analyzer: Begin Analysis\n\n");
    }

    /**
     * This method converts a cryptogram (a string of tokens) into an array of Tokens. 
     * A Token is an object that stores the word, its pattern, its score, and its
     *  pattern set, a set of known words with the same pattern.
     * It uses a StringTokenizer that filters punctuation from the text.
     * @param text a String representing the text.
     */
    private Token[] initWordArray(String text) {
        StringTokenizer st = new StringTokenizer(TextUtilities.removeDuplicates(text.toLowerCase()),
                                    " \r\f\t\n;:\".,!@#$%'^&*(_-+=\\`~{{}}?|");
        Token tokens[] = new Token[st.countTokens()];
        int k = 0;
//        System.out.println("THE CLEAN MESSAGE: " + st.countTokens() + " tokens");
        while (st.hasMoreTokens()) {
	    String word = st.nextToken();
	    tokens[k] = new Token(word, dict);       // Store the word in a Token object
//		System.out.print(tokens[k].getToken() + " ");
	    ++k;
	}
//	System.out.println();
        return tokens;
    }


    /**
     * This method is part of the Analyzer interface. It runs the analysis.
     */
    public void run() {
	resultSB.append(dateformat.format(new Date()) + "\n");
	stopThread = false;           // stopThread declared in superclass
        doAnalysis();
        System.out.println("Finished: Iterations = " + nLoops + " Best score is " + num.format(bestScore) 
			   + " KeyCount = " + keyCount);
	resultSB.append("Time Used = " + (System.currentTimeMillis() - time0) + "ms");
	resultSB.append("DECRYPTED MESSAGE :\n" + bestDecrypt + "\n");
	resultSB.append("SOLUTION          : " + solution + "\n");
	resultSB.append("PERCENT WORDS: " + num.format(TextUtilities.percentWords(solution,bestDecrypt)) + "\n");
	resultSB.append("WRONG CHARS: " + TextUtilities.countInCorrectChars(solution,bestDecrypt) + "\n");

	if (display != null) {
	    display.append("\nFinished");
	}
    }
    
    /**
     * This method is part of the Analyzer interface. It returns the report
     * generated by the analysis.
     */
    public String getReport() {
        return toString();
    }

    /**
     * This method returns the report generated by the analysis.
     */
    public String toString() {
        return resultSB.toString();    
    }

    /**
     *  This method performs an analysis of the text. It assumes the text
     *  is a word-delimited cryptogram created using a simple substitution cipher.
     *  The anlysis is performed by a recursive backtracking search of the key space.
     */
    public void doAnalysis() { 
        resultSB.append("\n");
	solve(tokens,encipherKey);
//	System.out.print("Time Used = " + (System.currentTimeMillis() - time0) + "ms");
//	System.out.print(" KeyCount = " + keyCount + "\n");
    }

    /** 
     * This nonrecursive version of solve() just invokes the recursive version.
     */
    private void solve (Token[] tokens, StringBuffer encipherKey) {
	//	for (int k = 0; k < tokens.length; k++)
	//	    System.out.print(tokens[k].getToken() + " ");
	//	System.out.println();
        solve(tokens, 0, encipherKey, 0.0);
    }

    /**
     * This method removes the asterisks from the encipherKey
     *  turning it into a valid keyword for HistoricalKey.
     *  It takes a encipherKey of the form "abc*e***..." and
     *  replaces the asterisks with unused letters giving a
     *  valid key -- e.g., "abcdefgh...".
     # @param encipherKey a String containing asterisks
     */
    private String makeKeyword(String encipherKey) {
//	System.out.println("makekeyword " + encipherKey);
        String alphabet = new String("abcdefghijklmnopqrstuvwxyz");
        StringBuffer keyword = new StringBuffer(encipherKey);
        int alph = 0;                           // Points to next unused alphabet char
        int indx = -1;
        char alphCh = 'a';
        for (int k = 0; k < keyword.length(); k++) {
            char ch = keyword.charAt(k);
            if (ch == '*') {
                do {
		    alphCh = alphabet.charAt(alph);
		    ++alph;
		    indx = keyword.toString().indexOf(alphCh);  // Find the next unused character
		} while (indx != -1);
		keyword.setCharAt(k, alphCh);
	    }
	}
	return keyword.toString();
    }

    /**
     * Uses a cipher engine to decrypt the message using encipherKey as the key.
     * This method returns true if ALL the tokens in the message are found in
     * the dictionary.
     * @param encipherKey a String giving the encryption key
     * @param score the key's score relative to other keys
     */
    private void testMessage(String encipherKey, double score, int depth) {
        String decrypt = null;
	//        System.out.println("encipherKey = " + encipherKey);
	String keyword = makeKeyword(encipherKey);
        try {
            sKey.init(keyword + "/az");
            cipher.init(sKey);
            decrypt = cipher.decrypt(text);
            if (score > bestScore) {
                bestScore = score;
		bestDecrypt = decrypt;
		if (params.verbose) {
		    System.out.println("Here's a decryption based on KEYWORD= " + keyword);
		    System.out.print("Depth = " + depth + " Time Used = " + (System.currentTimeMillis() - time0) + "ms");
		    System.out.println(" KeyCount " + keyCount + " score= " + score);
		    System.out.println(decrypt + "\n");
		    if (display != null)
			display.setText(decrypt);
		    //		    resultSB.append("Here's a decryption based on KEYWORD= " + keyword + "\n");
		    //		    resultSB.append("Time Used = " + (System.currentTimeMillis() - time0) + "ms");
		    //		    resultSB.append(" KeyCount " + keyCount + "\n");
		    //		    resultSB.append("words = " + nWords + " tokens= " + nTokens + " score= " + totalScore + "\n");
		    //		    resultSB.append(decrypt + "\n\n");
		}
		nLoops = 0;
	    }
	    else 
		++nLoops;
        } catch (Exception e) {
	    e.printStackTrace();
        }
    }

    /**
     * This method performs a recursive backtracking search of the keyspace.
     * The initial value of the key is "*****...". A dictionary is used to
     * match tokens from the cryptogram. As matches are found, the key is
     * filled in. Backtracking occurs whenever a token cannot be matched
     * against any of the dictionary words, with the same pattern, given 
     * the current encipherKey.

     * Search strategy: the tokens in the cryptogram are sorted to prefer
     *  those with the smallest (non empty) pattern sets. The search proceeds
     *  left to right along the cryptogram. Substitutions are made using the
     *  pattern words with the highest frequencies. When the end of the cryptogram
     *  is reached, the message is tested and printed out if its score is better 
     *  than the previous best score. If a token's pattern set is empty or 
     *  if none of the words in the pattern set match (due to the preceeding substitutions) 
     *  the token is skipped. 

     * @param tokens is an array of cryptogram tokens, sorted to favor those
     *   with the fewest number of matching pattern words
     * @param depth is the number of token's processed
     * @param score is the number of token=word matches
     * @param encipherKey is StringBuffer that is recursively filled in.
     */
    private void solve(Token [] tokens, int depth, StringBuffer encipherKey, double score) {
	//        if (depth >= Math.min(tokens.length, MAX_DEPTH) ) {
        if (depth >= Math.min(tokens.length, MAX_DEPTH) || threadIsStopped()) {
            testMessage(encipherKey.toString(), score, depth);  
	} else if (nLoops <= MAX_LOOPS) {
	    String token = tokens[depth].getToken();           // Get the current token
	    String pattern = tokens[depth].getPattern();       // Get its pattern -- eg 1231 = that
	    String words[] = tokens[depth].getPatternSet();    // And get its matching words.
	    String word = null;

	    for (int k = 0; words != null && k < words.length; k++) {
		//		System.out.print(depth + "\tToken= " + token + "(" + pattern + ") and key " + encipherKey.toString());
		word = words[k];
		StringBuffer newencipherKey = new StringBuffer(encipherKey.toString());
		++keyCount;
		if (isMatch(token, word, newencipherKey)) {
		    //		    System.out.println("\t" + depth + " " + token + " MATCHES " + word + " " + newencipherKey.toString());
		    solve(tokens, depth + 1, newencipherKey, score + 1);
		} 
		//		else
		    //		    System.out.println("\t" + depth + " " + token + " DOES NOT MATCH " + word);
	    }
	    solve(tokens, depth + 1, encipherKey, score);  // Skip over the word
	}
    }

    /**
     * This method determines whether an encrypted word matches token, where token is
     *  is possibly a matching pattern word taken from the dictionary.
     * @param word the encrypted word
     * @param token a candidate plaintext  word from word's pattern set
     * @param newkey the revised key that is built as a side effect
     */
    private boolean isMatch(String token, String word, StringBuffer newkey){
//	System.out.print("Try matching " + token + " and " + word);
        for (int k= 0; k < token.length(); k++) {
            char t1 = token.charAt(k);
            char w1 = word.charAt(k);
	    char ew1 = newkey.charAt(w1 - 'a');  // Encrypt of w1
            if (ew1 != '*' && ew1 != t1)         
		return false;
	    int indx = newkey.toString().indexOf(t1);
	    char dt1 = '*';
	    if (indx != -1)
		dt1 = (char)('a' + indx);        // Decrypt of t1
//	    System.out.println("t1= " + t1 + " w1= " + w1 + " ew1= " + ew1 + " dt1= " + dt1);
	    if (dt1 != '*' && dt1 != w1)
		return false;
	    newkey.setCharAt(w1 - 'a', t1);
	}
//	System.out.println(" Matched!");
        return true;
    }


}
