'''
Created on Oct 7, 2013

@author: rcd
'''
import string

# get entire text of 'Alice in Wonderland' as a list of words
f = open("alice.txt")
words = f.read().split()
f.close()

# what exactly is a word?
words = [ s.strip(string.punctuation).lower() for s in words ]
# apparently some words were just punctuation!
words = [ s for s in words if len(s) > 0 ]

# how many total words are there?
print("total words = " + str(len(words)))
# how many unique words?
print("unique words = " + str(len(set(words))))

# how many times does each word occur in the text?
# note this results in a list of lists, where each sublist
# is [ count, word ] for each unique word in the text
wordCounts = [ [words.count(w), w] for w in set(words) ]
print(wordCounts)

# note, the above algorithm is VERY slow (which is fine in
# this course), so most CompSci professionals would prefer
# the following code to do the SAME thing
d = {}
for w in words:
    if w in d:
        d[w] += 1
    else: 
        d[w] = 1
# note, same number of items as the set above
print(len(d.items()))
# creates a list of tuples of the form (key, value)
# in no particular order (like a set)
print(d.items())
# can get just the keys or just the values
print(d.keys())
# these lists are "in parallel" so the indices correspond
print(d.values())

# and, some sorting options
# basic sort, smallest to largest starting with first item in sub-list
wordCounts = sorted(wordCounts)
print(wordCounts)

# sort by calling "key" function on each element (in this case len)
# for this example, this is silly because every element is a list with len == 2
wordCounts = sorted(wordCounts, key=len)
print(wordCounts)

# sort from smallest to largest starting with second item, then first
# note, itemgetter is a function that gets the given items for comparison
#   instead of the entire list
# note, attrgetter can be used for named elements 
#   (i.e., namedtuples and dictionaries)
import operator
#from operator import itemgetter
wordCounts = sorted(wordCounts, key=operator.itemgetter(1, 0))
print(wordCounts)

# define our own specific comparison function for sorting:
#   takes two elements from sequence (in this case each is [ number, string ])
#   returns negative number if A is less than B, positive if A is greater than B
#   and 0 if they are equal
def compare (listA, listB):
    if listA[1] == listB[1]:
        return listA[0] - listB[0]
    elif listA[1] < listB[1]:
        return -1
    else:
        return 1
# sort based on the comparison function given
sorted(wordCounts, cmp=compare)
