'''
Created on Nov 3, 2010

@author: rcd
'''
import string
import FileUtilities
import HTMLWriter
    

def read_common_words(filename):
    """
    Build the set of common words stored in the file named filename.

    The file is expected to hold one word per line; every word is lower cased
      so that later comparisons are case-insensitive, and collecting them in a
      set silently drops any duplicates.
    """
    source = FileUtilities.get_file(filename)
    commonWords = set()
    for entry in FileUtilities.get_words(source):
        commonWords.add(entry.lower())
    return commonWords


def sanitize_word(word):
    """
    Given a string representing a single word, return a cleaned up version of that word:
      - the word is lower cased for common comparison
      - any leading or trailing punctuation is removed
      - any leading or trailing digits are removed
    Characters inside the word are left alone, so "don't" stays "don't".
    A word made up only of punctuation and/or digits becomes the empty string.
    """
    # Strip both character classes in a single pass so that interleaved
    # prefixes/suffixes such as "(1)word2." are fully removed.
    return word.lower().strip(string.punctuation + string.digits)


def word_is_taggable(word, commonWords):
    """
    Given a string that represents a word and a set of strings that should not appear
      in the final word cloud, return True only if the word exists (i.e., has 1 or more
      characters) and it is not one of the common words.
    The comparison is exact, so callers should pass an already lower cased word if
      commonWords holds lower cased entries.
    """
    # bool() guarantees a real True/False even when word is the empty string.
    return bool(word) and word not in commonWords


def count_words(filename, commonWords):
    """
    Given a string representing the name of a file of text and a set of strings that
      represent common words that should not be counted, return a list of (word, count)
      tuples, containing the number of times each non-common word occurs in the file.
    Note, each word in the file is sanitized (via sanitize_word) before being tested
      with word_is_taggable and counted, so the returned words are the sanitized forms.
    """
    wordCounts = {}
    source = FileUtilities.get_file(filename)
    for word in FileUtilities.get_words(source):
        cleaned = sanitize_word(word)
        if word_is_taggable(cleaned, commonWords):
            wordCounts[cleaned] = wordCounts.get(cleaned, 0) + 1
    # Materialize the items so callers always receive a real list, as documented,
    # rather than a dictionary view.
    return list(wordCounts.items())


def top_words(wordList, numToKeep):
    """
    Given a list of (word, count) tuples and a number of the most occurring ones to
      keep, return the numToKeep (word, count) tuples that have the highest frequency
      (i.e., the highest associated count), ordered from most to least frequent.
    If fewer than numToKeep tuples are given, all of them are returned; if numToKeep
      is zero or negative, an empty list is returned.
    Words with equal counts keep their relative order from wordList.
    """
    if numToKeep <= 0:
        # A negative slice bound would drop items from the end instead of keeping none.
        return []
    ranked = sorted(wordList, key=lambda pair: pair[1], reverse=True)
    return ranked[:numToKeep]


def size_words(wordList, numDivisions):
    """
    Given a list of (word, count) tuples and a number of divisions to group them into,
      return a list of (word, size) tuples, where size represents the font size in
      which the word will be displayed.
    Thus the higher the word's count, the bigger its size should be. All sizes are
      between 10 and 48, with sizes evenly divided between those values.
    For example, if numDivisions is 6, then the words are sized based on their
      counts at one of [10, 17, 24, 31, 38, 45]. Thus if the most frequently occurring
      word has a count of 90, then a word that occurs at most 15 times has a
      size of 10, a word occurring 16 to 30 times has a size of 17, and so on
      all the way up to a count of 76 to 90 that has a size of 45.
    Counts are expected to be integers. The order of the words is preserved.
    An empty wordList yields an empty list; with a single division every word gets
      the minimum size. Because the step between sizes is a whole number, asking for
      more divisions than there are whole sizes in the range also collapses every
      word to the minimum size.
    Raises ValueError if numDivisions is less than 1.
    """
    MIN_SIZE = 10
    MAX_SIZE = 48
    if numDivisions < 1:
        raise ValueError("numDivisions must be at least 1")
    # Copy first: the input is traversed twice and may be a one-shot iterable.
    pairs = list(wordList)
    if not pairs:
        return []
    # Whole-number gap between consecutive sizes, chosen so that the largest size
    # never exceeds MAX_SIZE (e.g. 38 // 5 == 7 for six divisions).
    if numDivisions > 1:
        step = (MAX_SIZE - MIN_SIZE) // (numDivisions - 1)
    else:
        step = 0
    maxCount = max(count for (_, count) in pairs)
    sized = []
    for (word, count) in pairs:
        if maxCount > 0:
            # Equivalent to ceil(count * numDivisions / maxCount) - 1 in integer
            # arithmetic, so a count exactly on a boundary stays in the lower group.
            division = (count * numDivisions - 1) // maxCount
            division = min(max(division, 0), numDivisions - 1)
        else:
            division = 0
        sized.append((word, MIN_SIZE + step * division))
    return sized


def print_words(filename, words):
    """
    Given a string representing the name of a file to write and a list of (word, size)
      tuples, write a word cloud to the file containing all the words in alphabetical
      order in a font sized to their associated value.
    Any existing file with that name is overwritten.
    """
    # The with-block guarantees the file is flushed and closed, even if one of the
    # HTMLWriter calls raises part way through.
    with open(filename, "w") as output:
        HTMLWriter.start(output)
        for (word, size) in sorted(words):
            HTMLWriter.format_sized_word(output, word, size)
        HTMLWriter.finish(output)

def make_cloud():
    """
    Run the whole word cloud pipeline for the 'little_brother' text: read the common
      words, count the remaining words in the text, keep the 100 most frequent, size
      them into 6 divisions, and write the resulting HTML cloud next to the text.
    """
    bookName = 'little_brother'
    textPath = '../data/' + bookName + '.txt'
    cloudPath = '../data/' + bookName + '_cloud.html'

    ignored = read_common_words('common.txt')
    counted = count_words(textPath, ignored)
    mostFrequent = top_words(counted, 100)
    sized = size_words(mostFrequent, 6)
    print_words(cloudPath, sized)


# Only build the cloud when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    make_cloud()