/*
 * File: PatternDictionary.java
 * @author R. Morelli
 * Date: 6/4/2002
 *
 * This file implements a searchable dictionary of words and their
 *  relative frequencies. It is designed to be used
 *  by cryptanalysis objects. It stores words in terms of their
 *  patterns. For example, "there" and "these" both have the pattern
 *  12343. So the key for these words would be "12343" and both words
 *  would be stored in the Hashtable associated with that key.
 *
 *  File format: PatternDictionary expects its source file to contain
 *   words and frequencies, one set per line.  A good example
 *   is the file kucera340.txt, which is a file of the 340 most
 *   frequent words taken from the Kucera-Francis word list, which is 
 *   available from the MRC Psycholinguistic database:
 *       http://www.psy.uwa.edu.au/mrcdatabase/uwa_mrc.htm
 *   The program assumes that the first line of the file gives the total 
 *   number of words in the corpus that was used to compile the relative
 *   frequencies. The relative frequencies are integer values. For example,
 *   the Kucera-Francis word list is based on a corpus with 1,000,000 words.
 *   The format is:
                    TOTAL_WORDS              1000000
		    THE                      69971 
		    OF                       36411 
		    ...                      ...
 *
 *  To Test: 
 *    java -classpath classes hcrypto.analyzer.PatternDictionary kucera340.txt
 *    java -classpath classes hcrypto.analyzer.PatternDictionary sourcefile
 */

package hcrypto.analyzer;

import java.util.*;
import java.io.*;

/**
 * A dictionary of words and relative frequencies that is indexed by letter
 * pattern (see {@link #makePattern(String)}).
 *
 * <p>Each pattern key in <code>dict</code> maps to a Hashtable whose keys are
 * the words sharing that pattern and whose values are the corresponding
 * PatternWord objects.
 *
 * <p>NOTE(review): <code>dict</code>, <code>LOAD_FACTOR</code>,
 * <code>KUCERA_3500</code> and <code>size()</code> are not declared in this
 * class; they are presumably inherited from Dictionary.
 *
 * <p>NOTE(review): words are stored under the key returned by
 * PatternWord.getWord() and looked up with the caller's string unchanged, so
 * lookups are only case-insensitive if getWord() normalizes case and callers
 * pass the same case -- confirm against PatternWord.
 */
public class PatternDictionary extends Dictionary {
    /** Symbols handed out, in order, to each new distinct letter of a word. */
    private static final String PATTERN_LETTERS = "123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /** Number of entries read from the word source (one per inserted entry). */
    private int nWords = 0;

    /**
     * Creates a dictionary from the internal word list selected by
     * <code>KUCERA_3500</code>.
     */
    public PatternDictionary() {
        this(KUCERA_3500);
    }

    /**
     * Creates a dictionary from one of the internal word lists and prints the
     * name of the list that was loaded.
     *
     * @param i selects the list: 1 = Kucera340, 2 = Kucera100, 3 = Kucera50,
     *          4 = Kucera3500, 5 = Kucera10; any other value falls back to
     *          Kucera340
     */
    public PatternDictionary(int i) {
        String[][] wordlist;
        String listName;
        switch (i) {
        case 2:
            wordlist = Kucera100.wordlist;
            listName = "Kucera100";
            break;
        case 3:
            wordlist = Kucera50.wordlist;
            listName = "Kucera50";
            break;
        case 4:
            wordlist = Kucera3500.wordlist;
            listName = "Kucera3500";
            break;
        case 5:
            wordlist = Kucera10.wordlist;
            listName = "Kucera10";
            break;
        case 1:
        default:
            wordlist = Kucera340.wordlist;
            listName = "Kucera340";
            break;
        }
        init(wordlist);
        System.out.println(listName);
    }

    /**
     * Creates a dictionary from the named file. The file is expected to list
     * one word and its integer count per line, with the first line holding a
     * label followed by the total number of words in the corpus.
     *
     * @param filename a String giving the name of the dictionary file
     */
    public PatternDictionary(String filename) {
        init(filename);
    }

    /**
     * Builds the dictionary from an internal array. Row 0 holds the corpus
     * total in column 1; every later row holds a word in column 0 and its
     * integer count in column 1.
     */
    private void init(String wordlist[][]) {
        double totalWords = Integer.parseInt(wordlist[0][1]);
        for (int k = 1; k < wordlist.length; k++) {
            double freq = Integer.parseInt(wordlist[k][1]) / totalWords;
            insert(new PatternWord(wordlist[k][0], freq));
            ++nWords;
        }
    }

    /**
     * Builds the dictionary from an external file.
     *
     * <p>I/O failures are reported with a stack trace and leave the dictionary
     * holding whatever was read so far. A non-numeric count still raises
     * NumberFormatException. Blank lines are skipped, a word without a count
     * gets frequency 0, and if the header line carries no positive total all
     * frequencies are recorded as 0 (dividing by zero would otherwise yield
     * infinite frequencies).
     */
    private void init(String filename) {
        System.out.print("PatternDictionary: Reading words from " + filename);
        BufferedReader inStream = null;
        try {
            inStream = new BufferedReader(new FileReader(filename));

            // Header line: a label (ignored) followed by the corpus total.
            double totalWords = 0;
            String line = inStream.readLine();
            if (line != null) {                       // null means the file is empty
                StringTokenizer header = new StringTokenizer(line);
                if (header.hasMoreTokens()) {
                    header.nextToken();               // skip the label
                    if (header.hasMoreTokens()) {
                        totalWords = Integer.parseInt(header.nextToken());
                    }
                }
            }
            if (totalWords <= 0) {
                System.err.println("PatternDictionary: no positive word total on the first line of "
                                   + filename + "; frequencies will be 0");
            }

            while ((line = inStream.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line);
                if (!st.hasMoreTokens()) {
                    continue;                         // blank line, e.g. at end of file
                }
                String word = st.nextToken();
                double freq = 0;                      // reset so a missing count never reuses the previous one
                if (st.hasMoreTokens() && totalWords > 0) {
                    freq = Integer.parseInt(st.nextToken()) / totalWords;
                }
                insert(new PatternWord(word, freq));
                ++nWords;
            }
        } catch (IOException e) {                     // includes FileNotFoundException
            e.printStackTrace();
        } finally {
            // Close on every path, including when parsing throws.
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println(" size = " + dict.size());
    }

    /**
     * Returns the number of entries inserted while loading the dictionary.
     * An entry that repeats an earlier word is counted again even though it
     * replaces the earlier entry in the table.
     */
    public int nWords() {
        return nWords;
    }

    /**
     * Stores the word in the table for its pattern, creating that table the
     * first time the pattern is seen.
     */
    private void insert(PatternWord word) {
        String key = word.getPattern();
        Hashtable h = (Hashtable) dict.get(key);
        if (h == null) {
            h = new Hashtable(500, LOAD_FACTOR);
            dict.put(key, h);
        }
        h.put(word.getWord(), word);
    }

    /**
     * Returns the table of words stored under the pattern of the given word,
     * or null if no word with that pattern is stored.
     */
    private Hashtable wordTableFor(String word) {
        return (Hashtable) dict.get(makePattern(word));
    }

    /**
     * Returns the relative frequency stored for the word, or 0 if the word is
     * not in the dictionary.
     *
     * @param word the word to look up, matched exactly as given
     */
    public double getFrequency(String word) {
        Hashtable wordtable = wordTableFor(word);
        if (wordtable == null) {
            return 0;
        }
        PatternWord pw = (PatternWord) wordtable.get(word);
        if (pw == null) {
            return 0;
        }
        return pw.getFrequency();
    }

    /**
     * Returns the letter pattern of the string. Each distinct letter is
     * replaced by a symbol in order of first appearance: the first nine
     * distinct letters become the digits 1-9 and later ones the uppercase
     * letters A-Z. For example, "there" has the pattern "12343" and
     * "appendectomy" has the pattern "12234536789A". Letter case is ignored.
     * Input longer than the 35 available symbols is truncated to 35
     * characters.
     *
     * @param s the word whose pattern is wanted
     * @return the pattern string
     */
    public static String makePattern(String s) {
        // Fixed locale so the result does not depend on the platform default.
        String word = s.toLowerCase(Locale.ENGLISH);
        int length = Math.min(word.length(), PATTERN_LETTERS.length());
        StringBuffer sb = new StringBuffer(length);
        int next = 0;                                 // index of the next unused symbol
        for (int k = 0; k < length; k++) {
            // indexOf finds the first occurrence, so first < k means "seen before".
            int first = word.indexOf(word.charAt(k));
            if (first < k) {
                sb.append(sb.charAt(first));
            } else {
                sb.append(PATTERN_LETTERS.charAt(next));
                ++next;
            }
        }
        return sb.toString();
    }

    /**
     * Same as {@link #getFrequency(String)}.
     */
    public double getFreq(String word) {
        return getFrequency(word);
    }

    /**
     * Returns true if the word, exactly as given, is stored in the dictionary.
     */
    public boolean contains(String word) {
        Hashtable wordtable = wordTableFor(word);
        return wordtable != null && wordtable.containsKey(word);
    }

    /**
     * Returns true if at least one word is stored under the given pattern.
     */
    public boolean containsPattern(String pattern) {
        return dict.containsKey(pattern);
    }

    /**
     * Returns the number of words stored under the given pattern, or 0 if the
     * pattern is unknown.
     */
    public int countWordsForPattern(String pattern) {
        Hashtable h = (Hashtable) dict.get(pattern);
        if (h == null) {
            return 0;
        }
        return h.size();
    }

    /**
     * Returns the stored words that share the given word's pattern.
     *
     * @param word any word; only its pattern is used
     * @return the matching words, or null if no word with that pattern is
     *         stored
     */
    public String[] getPatternWordArray(String word) {
        Hashtable table = wordTableFor(word);
        if (table == null) {
            return null;
        }
        String[] words = new String[table.size()];
        int k = 0;
        for (Enumeration e = table.keys(); e.hasMoreElements();) {
            words[k++] = (String) e.nextElement();
        }
        return words;
    }

    /**
     * Returns the stored words that share the given word's pattern as one
     * string, each word followed by a single space, or the empty string if no
     * word with that pattern is stored.
     */
    public String getWordList(String word) {
        Hashtable ht = wordTableFor(word);
        if (ht == null) {
            return "";
        }
        StringBuffer sb = new StringBuffer();
        for (Enumeration words = ht.keys(); words.hasMoreElements();) {
            sb.append(words.nextElement());
            sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * Interactive test driver. Loads the dictionary from the file named by
     * the first argument (or from the default internal list when no argument
     * is given), then reads words from standard input and prints the pattern,
     * frequency and matching words for each until an empty line or end of
     * input.
     */
    public static void main(String args[]) {
        PatternDictionary d;
        if (args.length < 1) {
            System.out.println("Usage: java PatternDictionary filename");
            d = new PatternDictionary();
        } else {
            d = new PatternDictionary(args[0]);
        }

        System.out.println("The pattern dictionary has " + d.nWords() + " words  and "
                           + d.size() + " patterns");

        try {
            BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
            System.out.print("Search for >>");
            String word = input.readLine();
            // readLine() returns null at end of input, so test for that before the length.
            while (word != null && word.length() != 0) {
                String pattern = makePattern(word);
                if (d.containsPattern(pattern)) {
                    System.out.println(pattern + " " + d.getFrequency(word) + " " + d.getWordList(word));
                } else {
                    System.out.println(word + " with pattern " + pattern + " is not found");
                }
                System.out.print("Search for >>");
                word = input.readLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}
