'''
Created on Nov 3, 2010

@author: rcd
'''
import string
import FileUtilities
import HTMLWriter
    

def readCommonWords (filename):
    """
    Load the common-word list stored in the file called filename.

    The file is opened with FileUtilities.getFile and split into words with
      FileUtilities.getWords (the file is expected to hold one word per line).
    Every word is lower cased so later comparisons ignore case, and the result
      is a set, so a word listed more than once is only kept once.
    """
    common = set()
    for entry in FileUtilities.getWords(FileUtilities.getFile(filename)):
        common.add(entry.lower())
    return common


def cleanWord (word):
    """
    Normalize a single word so that different spellings of it compare equal:
      - punctuation at the start and at the end of the word is dropped
        (punctuation inside the word, such as an apostrophe, is kept)
      - the remaining text is converted to lower case
    Returns the normalized string, which is empty if the word held nothing but
      punctuation.
    """
    trimmed = word.strip(string.punctuation)
    return trimmed.lower()


def isTaggable (word, commonWords):
    """
    Decide whether a word may appear in the final word cloud.

    word is a string and commonWords is a collection of strings that must not
      appear in the cloud.
    Returns True only if the word exists (i.e., has 1 or more characters) and
      it is not one of the common words; otherwise returns False.
    Note, the comparison is exact, so both the word and the common words should
      already be lower cased.
    """
    # bool() makes an empty word yield False instead of the empty string itself
    return bool(word) and word not in commonWords


def countWords (filename, commonWords):
    """
    Count how often each word occurs in the text file called filename.

    Every word read from the file is cleaned with cleanWord before it is
      counted, and words rejected by isTaggable(word, commonWords) are left out.
    Returns a list of [count, word] pairs, one per distinct remaining word, in
      no guaranteed order. The count comes first so that sorting the pairs
      orders them by frequency.
    """
    # imported here so the function does not rely on a file-level import
    from collections import Counter

    f = FileUtilities.getFile(filename)
    # a single pass over the words, rather than one list.count() scan per word
    tallies = Counter(cleanWord(w) for w in FileUtilities.getWords(f))
    return [ [ count, word ] for (word, count) in tallies.items()
             if isTaggable(word, commonWords) ]


def topWords (wordList, numToKeep):
    """
    Pick the most frequent entries out of wordList.

    The entries are ordered from largest to smallest using their normal
      comparison, so for the result to be ranked by frequency each entry must
      start with its count, as in the [count, word] pairs built by countWords.
    Returns a new list holding the first numToKeep entries of that ordering;
      wordList itself is left untouched.
    """
    ranked = list(wordList)
    ranked.sort(reverse=True)
    return ranked[:numToKeep]


def sizeWords (wordList, numDivisions):
    """
    Given a list of (count, word) tuples and a number of divisions to group them into,
      return a list of (word, size) tuples, in the same order, where size represents
      the font size in which the word will be displayed.
    Thus the higher the word's count, the bigger its size should be. All sizes are
      between 10 and 48, with the sizes evenly divided between those values using a
      whole-number step.
    For example, if numDivisions is 6, then the words are sized based on their
      counts at one of [10, 17, 24, 31, 38, 45]. Thus if the most frequently occurring
      word has a count of 90, then a word that occurs at most 15 times has a
      size of 10, a word occurring 16 to 30 times has a size of 17, and so on
      all the way up to a count of 76 to 90 that has a size of 45.
    Special cases:
      - an empty wordList gives an empty list
      - a numDivisions below 1 is treated as 1, so every word gets size 10
      - with more than 39 divisions the whole-number step is 0, so every word
        gets size 10
    """
    MIN_SIZE = 10
    MAX_SIZE = 48
    pairs = list(wordList)
    if not pairs:
        return []
    divisions = max(1, numDivisions)
    # whole-number gap between neighboring sizes, e.g. (48 - 10) // 5 == 7
    if divisions > 1:
        step = (MAX_SIZE - MIN_SIZE) // (divisions - 1)
    else:
        step = 0
    maxCount = max(count for (count, word) in pairs)
    sized = []
    for (count, word) in pairs:
        if maxCount > 0:
            # ceil(count * divisions / maxCount) - 1 in integer arithmetic, so a
            #   count on a group's upper boundary stays in that group
            group = (count * divisions - 1) // maxCount
            group = min(max(group, 0), divisions - 1)
        else:
            group = 0
        sized.append((word, MIN_SIZE + group * step))
    return sized


def printWords (filename, words):
    """
    Given a string representing the name of a file to write and a list of
      (word, size) tuples, write a word cloud to the file containing all the
      words in alphabetical order.
    The page is opened with HTMLWriter.start and closed with HTMLWriter.finish.
    Note, each word is currently written as plain text, so its size is not yet
      applied (see the TODO below).
    The file is always closed when the function ends, even if writing fails.
    """
    # the with block closes the file, so buffered output is flushed to disk
    with open(filename, "w") as output:
        HTMLWriter.start(output)
        for (word, size) in sorted(words):
            # TODO: use HTMLWriter to format each word
            output.write(word)
        HTMLWriter.finish(output)


def makeCloud (filename):
    """
    Build a word cloud page for a text file, given its name without extension.

    The words to ignore are read from 'common.txt', the text is read from
      filename + '.txt', the 100 highest ranked words are kept and sized using
      6 divisions, and the page is written to filename + '_cloud.html'.
    """
    stopWords = readCommonWords('common.txt')
    ranked = topWords(countWords(filename + '.txt', stopWords), 100)
    printWords(filename + '_cloud.html', sizeWords(ranked, 6))


# NOTE: this call is at module level, so it runs whenever the file is executed
#   or imported; it reads 'common.txt' and 'alice.txt' and writes
#   'alice_cloud.html'.
makeCloud('alice')