## Explore Movie Dataset

In [1]:
import os
import pandas as pd
import settings
import etl

%matplotlib inline

%load_ext watermark
%watermark -d -t -v -m -p pea,pandas

2017-06-29 08:29:53 

CPython 3.6.1
IPython 6.1.0

pea 0.0.7
pandas 0.20.2

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 7
machine    : AMD64
processor  : Intel64 Family 6 Model 42 Stepping 7, GenuineIntel
CPU cores  : 8
interpreter: 64bit


In [2]:
# Instantiate the project's etl.Data helper and call load(); the cells below
# read the movie table from `data.movie`.
# NOTE(review): where load() reads from is defined in the etl/settings
# modules, which are not shown here.
data = etl.Data()
data.load()

## Available Columns

In [3]:
# Column labels of the movie table.
data.movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

### Add Calculations to etl

In [4]:
# Per-column dtypes, to see which columns are numeric before deriving new ones.
data.movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [5]:
# Net result per title: gross minus budget. The result is NaN wherever either
# input is missing.
# NOTE(review): the largest budgets in the ranking below look like they are
# recorded in a non-USD currency while gross is much smaller; confirm the units
# before interpreting `net` for those titles.
data.movie['net'] = data.movie['gross'].sub(data.movie['budget'])

In [6]:
# Ten largest budgets with their gross and net. Selecting the five columns
# before sorting avoids reordering the whole frame, and head(10) keeps the cell
# from rendering every row of the dataset.
(
    data.movie[['movie_title', 'title_year', 'budget', 'gross', 'net']]
    .sort_values('budget', ascending=False)
    .head(10)
)

Unnamed: 0,movie_title,title_year,budget,gross,net
2988,The Host,2006.0,1.221550e+10,2201412.0,-1.221330e+10
3859,Lady Vengeance,2005.0,4.200000e+09,211667.0,-4.199788e+09
3005,Fateless,2005.0,2.500000e+09,195888.0,-2.499804e+09
2323,Princess Mononoke,1997.0,2.400000e+09,2298191.0,-2.397702e+09
2334,Steamboy,2004.0,2.127520e+09,410388.0,-2.127110e+09
3423,Akira,1988.0,1.100000e+09,439162.0,-1.099561e+09
4542,Godzilla 2000,1999.0,1.000000e+09,10037390.0,-9.899626e+08
3075,Kabhi Alvida Naa Kehna,2006.0,7.000000e+08,3275443.0,-6.967246e+08
3851,Tango,1998.0,7.000000e+08,1687311.0,-6.983127e+08
3273,Kites,2010.0,6.000000e+08,1602466.0,-5.983975e+08


## Plotting with IPlotter

This example uses my own branch of IPlotter, which builds the chart dictionary directly from a pandas DataFrame. That is much less verbose, but the same charts can also be produced with the current version on PyPI.

In [7]:
from iplotter import C3Plotter

In [8]:
# C3 plotter instance reused by the C3 chart cells below.
c3 = C3Plotter()

### Time series of mean gross, net and budget by year

In [9]:
# Yearly mean of gross, net and budget, matching the section heading (the
# previous version aggregated with min()). The three columns are selected
# before aggregating so that text columns are never passed to mean().
# Years in which a column has no values produce NaN, which is plotted as 0.
plot_data = (
    data.movie
    .groupby('title_year')[['gross', 'net', 'budget']]
    .mean()
    .fillna(0)
)
c3.plot(plot_data, zoom=True)

In [10]:
# Mean IMDB score per country. Selecting `imdb_score` before aggregating avoids
# averaging every other column only to discard the result, and avoids the
# TypeError newer pandas raises when mean() meets non-numeric columns.
country_group = data.movie.groupby('country')['imdb_score'].mean()

# Parallel lists for the choropleth: `countries[i]` is the label for `values[i]`.
values = country_group.tolist()
countries = country_group.index.tolist()

In [11]:
from iplotter import PlotlyPlotter
from IPython.display import HTML

plotly = PlotlyPlotter()

# NOTE(review): this second C3 plotter appears unused in the visible cells
# (`c3` is used instead); kept in case something else refers to it.
c3_plotter = C3Plotter()

# Choropleth trace: one entry per country, coloured by the mean IMDB score
# computed in the previous cell (`countries` / `values` are parallel lists).
plotly_chart = [{
    "type": 'choropleth',
    # Locations are matched by country name, not ISO code.
    "locationmode": 'country names',
    "locations": countries,
    "z": values,
    "zmin": 0,
    "zmax": max(values),
    "colorscale": [
        [0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
        [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
        [0.8, 'rgb(117,107,177)'], [1, 'rgb(84,39,143)']
    ],
    "colorbar": {
        # z holds mean imdb_score, not a movie count.
        "title": 'Mean IMDB score',
        "thickness": 10
    },
    "marker": {
        "line": {
            "color": 'rgb(255,255,255)',
            "width": 2
        }
    }
}]

plotly_layout = {
    "title": 'Mean IMDB Score by Country',
    "geo": {
        # 'country names' is a locationmode value, not a geo scope.
        "scope": 'world',
    }
}

# Pass the layout along with the trace; previously it was built but never used,
# so the title and geo settings were silently dropped.
country_plot = plotly.plot(data=plotly_chart, layout=plotly_layout)

### Mean IMDB Score by Country

{{ country_plot }}

In [12]:
# IMDB score for every title, labelled by that title's budget (NaN budgets are
# kept as index labels).
data.movie.set_index('budget')['imdb_score']

budget
 237000000.0    7.9
 300000000.0    7.1
 245000000.0    6.8
 250000000.0    8.5
NaN             7.1
 263700000.0    6.6
 258000000.0    6.2
 260000000.0    7.8
 250000000.0    7.5
 250000000.0    7.5
 250000000.0    6.9
 209000000.0    6.1
 200000000.0    6.7
 225000000.0    7.3
 215000000.0    6.5
 225000000.0    7.2
 225000000.0    6.6
 220000000.0    8.1
 250000000.0    6.7
 225000000.0    6.8
 250000000.0    7.5
 230000000.0    7.0
 200000000.0    6.7
 225000000.0    7.9
 180000000.0    6.1
 207000000.0    7.2
 200000000.0    7.7
 250000000.0    8.2
 209000000.0    5.9
 150000000.0    7.0
               ... 
 24000.0        7.0
NaN             6.3
 23000.0        7.1
 25000.0        4.8
 22000.0        3.3
 20000.0        6.9
NaN             4.6
 17350.0        3.0
 15000.0        6.6
 15000.0        7.4
 15000.0        6.2
 20000.0        4.0
 10000.0        6.1
 4500.0         6.9
 10000.0        7.5
 10000.0        6.7
 1000000.0      7.4
NaN             6.1
 200000.0    

In [13]:
# Net (gross - budget) for every title, indexed by the director's Facebook
# likes. The variable was previously called `score_by_budget`, which matched
# neither the index nor the plotted column.
net_by_director_likes = data.movie.set_index('director_facebook_likes')[['net']]
c3.plot(net_by_director_likes, kind='scatter', zoom=True)

In [14]:
from ipywidgets import interact, interactive, fixed, interact_manual

In [15]:
def f(country):
    """Scatter-plot per-director totals for a single country.

    Filters ``data.movie`` to the rows whose ``country`` equals ``country``,
    sums ``director_facebook_likes`` and ``gross`` per ``director_name``, and
    draws summed gross against summed likes. Used as the callback passed to
    ``ipywidgets.interact``.

    Parameters
    ----------
    country : str
        Value to match against the ``country`` column.

    Notes
    -----
    ``director_facebook_likes`` is summed once per movie row, so directors
    with several titles accumulate their like count multiple times.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported pyplot before the first call.
    import matplotlib.pyplot as plt

    country_movies = data.movie[data.movie['country'] == country]
    director_totals = country_movies.groupby('director_name').agg(
        {'director_facebook_likes': 'sum', 'gross': 'sum'}
    )
    ax = director_totals.plot(
        kind='scatter', x='director_facebook_likes', y='gross'
    )
    # Title the figure so it stands alone when the dropdown selection changes.
    ax.set_title('Gross vs. director Facebook likes: {}'.format(country))
    plt.show()

In [16]:
import matplotlib.pyplot as plt

In [17]:
# Dropdown of the distinct, non-null countries in order of first appearance;
# the trailing semicolon suppresses the widget's repr.
interact(
    f,
    country=data.movie['country'].dropna().unique().tolist(),
);