'''
Created on Oct 28, 2013

@author: rcd
'''
# general function for reading all three IMDB movie files
def readFile (filename, skipHeader=True):
    """
    given a filename that is formatted as tab separated values on each line
    return a list of lists of strings, where each line is represented
      as a list of separate string elements
    leading and trailing whitespace is removed from each line before it is
      split, and lines that contain only a newline are skipped
    parameters:
      filename is the path of the file to read
      skipHeader says whether to throw away the first remaining row
        (default True, so by default the header row is ignored)
    raises whatever error open() raises if the file cannot be opened
    """
    # the file is opened with the platform default text encoding
    # NOTE(review): the data may contain special (non-ASCII) characters --
    #   confirm the default encoding decodes these files correctly
    # using "with" closes the file even if reading it raises an error,
    #   so we never take up extra system resources
    with open(filename) as f:
        # len(line) > 1 skips lines that hold nothing but the newline character
        rows = [ line.strip().split('\t') for line in f if len(line) > 1 ]
    # throw away the header row of the data only when asked to
    if skipHeader:
        return rows[1:]
    return rows


# general function for turning all three IMDB movie data into a dictionary
def processData (data, result, titleIndex=1, yearIndex=2):
    """
    given a list of lists of strings representing movie data,
    add each movie to the given dictionary and return that dictionary, where
      key is a tuple of the movie's title and year (whitespace stripped)
      value is a list of the remaining information about the movie
    if a key is already in the dictionary, the new information is added to
      the end of its existing list, so calling this function several times
      with the same dictionary merges the data for each movie
    parameters:
      data is the list of rows, each a list of strings
      result is a dictionary in which to add this data (modified in place)
      titleIndex is the index of the title in each list of movie information
      yearIndex is the index of the year in each list of movie information
    note, both indices are expected to be non-negative
    raises IndexError if a row is too short to hold the title or the year
    """
    keyIndices = (titleIndex, yearIndex)
    for d in data:
        # create key by using the given indices to find the title and year
        key = (d[titleIndex].strip(), d[yearIndex].strip())
        # create value from every element that is not part of the key,
        #   keeping the elements in their original order
        value = [ item for (i, item) in enumerate(d) if i not in keyIndices ]
        # add data to dictionary, starting a new list the first time a key is seen
        result.setdefault(key, []).extend(value)
    return result


# use this function to print the results of your functions
def printData (data):
    """
    prints one line holding the number of elements in the given collection,
      a tab, and then the collection itself exactly as str() shows it
    note, the contents are printed as given, they are not reordered
    """
    pieces = [ str(len(data)), str(data) ]
    print('\t'.join(pieces))


def bothTopRatedAndGrossing (movies):
    """
    given a dictionary of movie key to list of movie information,
    return a new dictionary in the same format that keeps only the movies
      that are both top rated and top grossing
    the given dictionary is not modified
    """
    # a movie is treated as being in both lists when its information has
    #   every field: 7 for cast + 2 for rating + 2 for gross
    fieldsWhenInBoth = 11
    return { title: info for (title, info) in movies.items()
             if len(info) == fieldsWhenInBoth }


def uniqueDirectors (movies):
    """
    given a dictionary of movie key to list of movie information,
    return a list of strings, each director's name appearing only once,
      sorted alphabetically
    raises IndexError if any movie information list has fewer than two elements
    """
    # the director is read from the second spot in each movie information list
    names = { info[1] for info in movies.values() }
    return sorted(names)


def directorsOfMostMovies (movies, count):
    """
    return list of tuples, (count, name), sorted by count from most to least of
      the directors and how many movies they directed
    directors that directed the same number of movies are listed
      alphabetically by name so the result is always in the same order
    note, only return the first "count" directors
      (an empty list if "count" is zero or negative)
    note, the director is read from the second spot in each movie information list
    raises IndexError if any movie information list has fewer than two elements
    """
    # tally how many of the given movies each director is credited with
    tally = {}
    for info in movies.values():
        director = info[1]
        tally[director] = tally.get(director, 0) + 1
    # negate the total so that larger totals sort first while names still sort A to Z
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    # max() guards against a negative count slicing from the wrong end
    return [ (total, name) for (name, total) in ranked[:max(count, 0)] ]


def castFilmography (movies, minAppearances):
    """
    return list of lists, [ name, (title, year), (title, year), ... ],
      sorted alphabetically by name, of
      the cast members that appeared in at least minAppearances movies
    each list starts with the cast member's name and is followed by the
      sorted keys of every given movie that lists that cast member
    note, this assumes each movie information list is laid out as
      [rank, director, actor1, actor2, actor3, actor4, actor5, ...]
      so the cast is read from the third through seventh spots
    """
    filmsByName = {}
    for (key, info) in movies.items():
        # set() so a name repeated within one movie counts as one appearance
        for name in set(info[2:7]):
            filmsByName.setdefault(name, []).append(key)
    return [ [name] + sorted(filmsByName[name])
             for name in sorted(filmsByName)
             if len(filmsByName[name]) >= minAppearances ]


def uniqueCastMembers (movies, minAppearances):
    """
    return list of strings, names of cast members, sorted alphabetically
      that appeared in at least minAppearances of the given movies
    each name appears only once in the result
    note, this assumes each movie information list is laid out as
      [rank, director, actor1, actor2, actor3, actor4, actor5, ...]
      so the cast is read from the third through seventh spots
    """
    appearances = {}
    for info in movies.values():
        # set() so a name repeated within one movie counts as one appearance
        for name in set(info[2:7]):
            appearances[name] = appearances.get(name, 0) + 1
    return sorted([ name for (name, total) in appearances.items()
                    if total >= minAppearances ])


def mostHighlyRatedCastMembers (movies, ratedMovies, count, minAppearances):
    """
    return list of tuples, (average rating, name), sorted from greatest to least by rating
      of the top count cast members that appeared in at least minAppearances movies
    parameters:
      movies is a dictionary of movie key to movie information list
      ratedMovies is a dictionary of movie key to list whose second spot is
        the movie's rating as a string
      count is the most cast members to return (none if zero or negative)
      minAppearances is the fewest rated movies a cast member must be in
    only movies whose key is in both dictionaries are used, so both the
      average and the number of appearances are based on rated movies only
    cast members with the same average are listed alphabetically by name
    note, this assumes each movie information list is laid out as
      [rank, director, actor1, actor2, actor3, actor4, actor5, ...]
      so the cast is read from the third through seventh spots
    raises ValueError if a rating cannot be converted to a number
    """
    ratingsByName = {}
    for (key, info) in movies.items():
        # movies without a rating cannot contribute to anyone's average
        if key not in ratedMovies:
            continue
        rating = float(ratedMovies[key][1])
        # set() so a name repeated within one movie counts as one appearance
        for name in set(info[2:7]):
            ratingsByName.setdefault(name, []).append(rating)
    averages = [ (sum(scores) / len(scores), name)
                 for (name, scores) in ratingsByName.items()
                 if len(scores) >= minAppearances ]
    # negate the average so that higher ratings sort first while names still sort A to Z
    averages.sort(key=lambda pair: (-pair[0], pair[1]))
    # max() guards against a negative count slicing from the wrong end
    return averages[:max(count, 0)]


# read data from files
# note, this file is described as not having a header row
# NOTE(review): readFile throws away the first row of every file it reads, so
#   if this file really has no header row its first movie is being lost -- confirm
cast = readFile("imdb_movies_cast.txt")
# note, these files have a header row
rated = readFile("imdb_movies_toprated.txt")
gross = readFile("imdb_movies_gross.txt")
# verify data that was read
#printData(cast)
#printData(rated)
#printData(gross)

# create separate dictionaries of just information from a specific file, where
#  key is a tuple of strings: (title, year)
#  values are a list of strings: [rank, director, actor1, actor2, actor3, actor4, actor5]
castMovies = processData(cast, {})
#  values are a list of strings: [rank, rating]
ratedMovies = processData(rated, {})
#  values is a list of strings: [rank, profits]
grossMovies = processData(gross, {})
# verify data that was read
#printData(castMovies)
#printData(ratedMovies)
#printData(grossMovies)

# combine results of processing data multiple times into ONE dictionary, where
#  key is a tuple of strings: (title, year)
#  values are a list of strings: [rank, director, actor1, actor2, actor3, actor4, actor5, X ]
#    where the last two or four elements are the rankings, rating, and/or profits
movies = {}
processData(cast, movies)
processData(rated, movies)
processData(gross, movies)
print("All movie data:")
printData(movies)

# answers to questions
print('Which movies are both top rated and top grossing?')
topMovies = bothTopRatedAndGrossing(movies)
printData(topMovies)

print('Who directed the movies that are either top rated or top grossing?')
directors = uniqueDirectors(movies)
printData(directors)

print('Who directed the movies that are both top rated and top grossing?')
directors = uniqueDirectors(topMovies)
printData(directors)

print('Top 10, by count, who directed the most movies that are either top rated or top grossing?')
prolificDirectors = directorsOfMostMovies(movies, 10)
printData(prolificDirectors)

print('Top 5, by count, who directed the most movies that are both top rated and top grossing?')
prolificDirectors = directorsOfMostMovies(topMovies, 5)
printData(prolificDirectors)

print('Who acted in at least 3 movies that are either top rated or top grossing?')
actors = uniqueCastMembers(movies, 3)
printData(actors)

print('Top 10, by count, who acted in the most movies that are either top rated or top grossing?')
# tally the movies each cast member appears in, reading the cast from the
#   actor1 through actor5 spots of the movie information list described above
appearances = {}
for info in movies.values():
    # set() so a name repeated within one movie counts as one appearance
    for name in set(info[2:7]):
        appearances[name] = appearances.get(name, 0) + 1
# order by count from most to least, breaking ties alphabetically by name
prolificActors = sorted([ (total, name) for (name, total) in appearances.items() ],
                        key=lambda pair: (-pair[0], pair[1]))[:10]
printData(prolificActors)

print('Who directed and also starred in any movies that are either top rated or top grossing?')
# people credited as the director of some movie and as a cast member of some
#   movie (not necessarily the same one)
directorAndActors = sorted(set(uniqueDirectors(movies)) & set(uniqueCastMembers(movies, 1)))
printData(directorAndActors)

print('Top 20, by rating, who acted in any movies whose average rating is the highest?')
topActors = mostHighlyRatedCastMembers(movies, ratedMovies, 20, 1)
printData(topActors)

print('Top 1, by rating, who acted in at least 4 movies whose average rating is the highest?')
topActors = mostHighlyRatedCastMembers(movies, ratedMovies, 1, 4)
printData(topActors)

