'''
Created on Mar 5, 2014

@author: rcd
'''
def readFileAsWords(filename):
    """
    Open the given file, read all the words in it, and close the file.

    The whole file is read at once and split on runs of whitespace
    (str.split with no arguments), so punctuation stays attached to the
    words and an empty file yields an empty list.

    filename -- path of the text file to read
    Returns a list of the whitespace-separated words, in file order.
    Raises whatever open() raises if the file cannot be opened.
    """
    # The with-block guarantees the file is closed even if read() fails.
    with open(filename) as wordFile:
        return wordFile.read().split()


def cleanWord(word):
    """
    Return word reduced to its alphabetic characters, in lower case.

    Every character for which str.isalpha() is false (punctuation,
    digits, whitespace, ...) is dropped; a word with no letters at all
    becomes the empty string.
    """
    letters = [ch for ch in word if ch.isalpha()]
    return ''.join(letters).lower()


def countWords(wordList):
    """
    Count how many times each cleaned word appears in wordList.

    Every entry is first normalised with cleanWord (non-letters removed,
    lower-cased), so 'Alice,' and 'alice' are counted as the same word.
    Entries that clean to the empty string are counted under ''.

    wordList -- list of strings
    Returns a list of tuples (count, word), one per distinct cleaned
    word.  The count comes first so that sorting the tuples orders them
    by frequency.  The order of the returned list is not significant.
    """
    # Function-scope import: the file has no top-level import section.
    from collections import Counter

    # Single pass over the words instead of one list.count() scan per
    # distinct word.
    tallies = Counter(cleanWord(w) for w in wordList)
    return [(count, word) for word, count in tallies.items()]


def topWords(wordCountList, topN):
    """
    Return the first topN items of wordCountList in descending order.

    Items are compared with their natural ordering, so for the
    (count, word) tuples produced by countWords the most frequent words
    come first, with ties broken by the word itself.  The input list is
    left unmodified.
    """
    ranked = list(wordCountList)
    ranked.sort(reverse=True)
    return ranked[:topN]



# Demo: print some word statistics for the text in 'alice.txt'.
words = readFileAsWords('alice.txt')
# Total number of raw tokens, then number of distinct raw tokens.
print(len(words))
print(len(set(words)))
print(words[5000:5500])
# Same sample after stripping non-letters and lower-casing.
words = [ cleanWord(w) for w in words ]
print(len(set(words)))
print(words[5000:5500])
wordCounts = countWords(words)
print(len(wordCounts))
# Rank the (count, word) tuples, not the raw word list, so that the
# output is the 20 most frequent words.
print(topWords(wordCounts, 20))

