'''
Created on Oct 7, 2013

@author: rcd
'''
import string
from collections import Counter

# Read the entire text of 'Alice in Wonderland' as a list of
# whitespace-separated tokens. The `with` block guarantees the file is
# closed even if the read raises.
with open("alice.txt") as f:
    words = f.read().split()

# What exactly is a word? Here: a token with leading/trailing
# punctuation stripped and folded to lower case. Punctuation inside a
# token (e.g. the apostrophe in "don't") is left alone.
words = [s.strip(string.punctuation).lower() for s in words]
# Tokens that were nothing but punctuation are now empty strings; drop them.
words = [s for s in words if s]

# Tally every word in a single pass over the text (calling
# words.count() once per unique word would rescan the whole list each time).
counts = Counter(words)

# how many total words are there?
print("total words = " + str(len(words)))
# how many unique words?
print("unique words = " + str(len(counts)))

# How many times does each word occur in the text?
# This is a list of lists, where each sublist is [count, word] for
# each unique word in the text.
wordCounts = [[count, word] for word, count in counts.items()]
print(wordCounts)

# Now find the top N occurring words in the text. Since the count is
# the first element of each sublist, sorting orders by count; words
# with equal counts are ordered by the word itself (descending here
# because of reverse=True).
n = 15
wordCounts = sorted(wordCounts, reverse=True)[:n]
print(wordCounts)

# Now "bucket" each word so we can size them for a web page.
# This also reverses the order of each sublist so it becomes
# [word, bucket]. Floor division keeps the bucket a whole number on
# both Python 2 and Python 3 (plain `/` yields a float on Python 3).
size = 200
wordCounts = [[word, count // size] for count, word in wordCounts]
print(wordCounts)

# For printing, sort alphabetically: the first element of each
# sublist is now a string.
wordCounts = sorted(wordCounts)
print(wordCounts)