'''
Created on Oct 28, 2013

@author: rcd
'''
import csv

# general function for reading all three IMDB movie files
def readCSVfile (filename, headerRows):
    """
    Read a file formatted as CSV, comma-separated-values, into a list of rows.

    filename is the path of the CSV file to read
    headerRows is the number of leading rows to leave out of the result

    returns a list of lists of strings, where each line of the file is
      represented as a list of its separate string elements, without the
      first headerRows rows
    """
    # the csv module expects a text-mode file opened with newline='' so that
    #   it can interpret line endings (including those inside quoted fields)
    #   itself; a binary-mode file makes csv.reader fail on Python 3
    # the with-statement closes the file even if reading or parsing fails,
    #   so we never take up extra system resources
    with open(filename, newline='') as f:
        # convert reader to a list so the data outlives the open file
        result = list(csv.reader(f, delimiter=',', quotechar='"'))
    # throw away the header row(s) of the data
    return result[headerRows:]


# general function for turning all three IMDB movie data into a dictionary
def processCSVdata (data, titleIndex, yearIndex, result):
    """
    given a list of lists of strings representing movie data,
    return a dictionary where
      key is a tuple of the movie's title and year (both stripped of
        surrounding whitespace)
      value is a list of the remaining information about the movie
    this function is parameterized in three additional ways:
      titleIndex is the index of the title in each list of movie information
      yearIndex is the index of the year in each list of movie information
      result is a dictionary in which to add this data; it is modified in
        place and also returned

    if a key is already present in result, the new information is appended
      to the end of the existing list for that key
    empty rows (such as those produced by blank lines in a CSV file) are
      skipped
    raises IndexError if a non-empty row is too short to contain the title
      or year
    """
    for d in data:
        # a blank line in a CSV file is read as an empty list; it describes
        #   no movie, so there is nothing to record
        if not d:
            continue
        # create key by using the given indices to find the title and year
        key = (d[titleIndex].strip(), d[yearIndex].strip())
        # create value from every element except the title and the year;
        #   the indices are normalized so negative ones name the same spots,
        #   and the title and year need not be next to each other
        keyPositions = (titleIndex % len(d), yearIndex % len(d))
        value = [ field for (i, field) in enumerate(d)
                  if i not in keyPositions ]
        # add data to dictionary, extending any information already there
        result.setdefault(key, []).extend(value)
    return result


# use this function to print the results of your functions
def printData (data):
    """
    prints a single line made of the length of the given collection,
      a tab, and the collection's string form
    note, the contents are printed in whatever order str() gives them
    """
    size = len(data)
    print('\t'.join([str(size), str(data)]))


def bothTopRatedAndGrossing (movies):
    """
    given a dictionary of movie information lists,
    return a new dictionary holding only those entries whose information
      list has exactly 10 elements
    """
    # note, only movies that have all fields are in both
    #   (6 for cast + 2 for rating + 2 for gross)
    completeLength = 6 + 2 + 2
    return { key: info for (key, info) in movies.items()
             if len(info) == completeLength }


def uniqueDirectors (movies):
    """
    given a dictionary of movie information lists,
    return a sorted list, without duplicates, of the first element of
      every information list
    """
    # note, director guaranteed to be first spot in movie information list
    names = set()
    for info in movies.values():
        names.add(info[0])
    ordered = list(names)
    ordered.sort()
    return ordered


def directorsOfMostMovies (movies, count):
    """
    return list of tuples, (count, name), sorted by count from most to least
      of the directors and how many movies they directed
    note, only return the first "count" directors

    movies is a dictionary whose values are movie information lists; the
      director is taken to be the first element of each list
    count is the maximum number of directors to return; zero or a negative
      number gives an empty list
    directors with the same number of movies are ordered by name so that
      the result is the same from run to run
    """
    if count <= 0:
        return []
    # tally how many movies each director has
    tally = {}
    for info in movies.values():
        # an empty information list names no director, so it is not counted
        if info:
            tally[info[0]] = tally.get(info[0], 0) + 1
    # most movies first; negating the number sorts it from high to low
    #   while the name still breaks ties from A to Z
    ranked = sorted([ (number, name) for (name, number) in tally.items() ],
                    key=lambda pair: (-pair[0], pair[1]))
    return ranked[:count]


# read data from files (the names are relative paths, so the files are looked
#   for in the directory the script is run from)
# note, this file does not have a header row, so no rows are skipped
cast = readCSVfile("imdb_movies_cast.txt", 0)
# note, these files have a header row, so the first row of each is skipped
rated = readCSVfile("imdb_movies_toprated.txt", 1)
gross = readCSVfile("imdb_movies_gross.txt", 1)
# verify data that was read (uncomment to print the raw rows)
#printData(cast)
#printData(rated)
#printData(gross)

# create separate dictionaries of just information from a specific file, where
#  key is a tuple of strings: (title, year)
# note, the cast file has the title and year in its first two columns, the
#  other two files have them in their second and third columns
#  values are a list of strings: [director,actor1,actor2,actor3,actor4,actor5]
castMovies = processCSVdata(cast, 0, 1, {})
#  values are a list of strings: [rank, rating]
ratedMovies = processCSVdata(rated, 1, 2, {})
#  values are a list of strings: [rank, profits]
grossMovies = processCSVdata(gross, 1, 2, {})
# note, these three dictionaries are not used by the code below
# verify data that was read (uncomment to print each dictionary)
#printData(castMovies)
#printData(ratedMovies)
#printData(grossMovies)

# combine results of processing data multiple times into ONE dictionary, where
#  key is a tuple of strings: (title, year)
#  values are list of strings: [director,actor1,actor2,actor3,actor4,actor5,X]
#    where the last two or four elements are rankings, rating, and/or profits
# note, the order of these three calls matters: each call appends to the end
#  of whatever list is already stored for a movie, so the cast data must be
#  added first for the director to be in the first spot
# note, a movie found only in the rated or gross file would still get an
#  entry here, just without any cast data at the front of its list
movies = {}
processCSVdata(cast, 0, 1, movies)
processCSVdata(rated, 1, 2, movies)
processCSVdata(gross, 1, 2, movies)
print("All movie data:")
printData(movies)

# answers to questions
# each question is printed as a prompt, followed by its answer when one has
#  been computed
print('Which movies are both top rated and top grossing?')
topMovies = bothTopRatedAndGrossing(movies)
printData(topMovies)

print('Who directed the movies that are either top rated or top grossing?')
directors = uniqueDirectors(movies)
printData(directors)

# note, this reuses the name directors, replacing the previous answer
print('Who directed the movies that are both top rated and top grossing?')
directors = uniqueDirectors(topMovies)
printData(directors)

print('Top 10, by count, who directed the most movies that are either top rated or top grossing?')
topDirectors = directorsOfMostMovies(movies, 10)
printData(topDirectors)


# the remaining questions only print their prompt; no answer is computed yet
print('Top 5, by count, who directed the most movies that are both top rated and top grossing?')


print('Top 20, by gross, who directed the movies that grossed the most money?')


print('Who acted in at least 3 movies that are either top rated or top grossing?')


print('Top 10, by count, who acted in the most movies that are either top rated or top grossing?')


print('Top 5, by count, who acted in the most movies that are both top rated and top grossing?')


# NOTE(review): 'hightest' in the prompt below looks like a typo for 'highest';
#  it is left as-is here because it is part of the program's output
print('Top 20, by rating, who acted in movies whose average rating is the hightest?')


# placeholders for two additional questions still to be written
print('YOUR QUESTION #1')


print('YOUR QUESTION #2')