'''
Created on Oct 16, 2012

@author: ola
'''
import time, bisect, string, urllib

def clean(word):
    '''
    remove punctuation from beginning and end of word,
    convert to lower case, return such a cleaned word
    '''
    for i in range(len(word)):
        if not word[i] in string.punctuation:
            break
    j = len(word)-1
    for j in range(len(word)-1,0,-1):
        if not word[j] in string.punctuation:
            break
    return word[i:j+1].lower()

def linear(words):
    '''
    words is a list of strings, return a list of
    pairs, each pair is [w,c] where c is the count of 
    how many times w occurs in words
    '''
    data = []
    for w in words:
        found = False
        for elt in data:
            if elt[0] == w:
                elt[1] += 1
                found = True
                break
        if not found:
            data.append([w,1])
    return data

def binary(words):
    '''
    words is a list of strings, return a list of
    pairs, each pair is [w,c] where c is the count of 
    how many times w occurs in words
    '''
    data = []
    for w in words:
        elt = [w,1]
        index = bisect.bisect_left(data, elt)
        if index == len(data):
            data.append(elt)
        elif data[index][0] != w:
            data.insert(index,elt)
        else:
            data[index][1] += 1
    return data

def dictionary(words):
    '''
    words is a list of strings, return a list of
    pairs, each pair is [w,c] where c is the count of 
    how many times w occurs in words
    '''
    d = {}
    for w in words:
        if w not in d:
            d[w] = 1
        else:
            d[w] += 1
    return [[w,d[w]] for w in d]

def analyze(words,func):
    start = time.time()
    data = func(words)
    end = time.time()
    print str(func),
    print "total: %d diff: %d, time: %4.3f" % (len(words),len(data),end-start)
    return data


def tradeoff(datasource):
    if datasource.startswith("http"):
        f = urllib.urlopen(datasource)
    else:
        f = open(datasource)
    words = f.read().split()
    words = [clean(w) for w in words]
    funcs = [linear, binary,dictionary]
    f.close()
    alldata = []
    for f in funcs:
        print "running on",str(f)
        alldata.append(sorted(analyze(words,f)))
    x = alldata[0]
    for y in range(1,len(alldata)):
        if x != alldata[y]:
            print "differ at %d on %d words" % (y,len(alldata[y]))
        

if __name__ == '__main__':
    url = "http://www.cs.duke.edu/csed/data/poe.txt"
    f = "/data/melville4.txt"
    tradeoff(f)