Explore Movie Dataset

In [1]:
import os
import pandas as pd
import settings
import etl

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Record the environment this notebook was run in: date (-d), time (-t),
# Python/IPython versions (-v), machine details (-m), and the versions of the
# listed packages (-p).
%load_ext watermark
%watermark -d -t -v -m -p pea,pandas
2017-06-26 18:57:49 

CPython 3.6.1
IPython 6.1.0

pea 0.0.7
pandas 0.20.2

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 7
machine    : AMD64
processor  : Intel64 Family 6 Model 42 Stepping 7, GenuineIntel
CPU cores  : 8
interpreter: 64bit
In [2]:
# Create the project's data container from the local `etl` module and call its
# loader. Later cells read the movie table from `data.movie`.
# NOTE(review): presumably `load()` reads the raw movie dataset from disk --
# confirm against etl.py.
data = etl.Data()
data.load()

Available Columns

In [3]:
# Show the column labels of the movie table so the available fields are visible.
data.movie.columns
Out[3]:
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

Plotting with IPlotter

This example uses my own branch of IPlotter, which builds the chart dictionary directly from a pandas DataFrame. This is much less verbose, but the same charts can also be produced with the current version on PyPI.

In [5]:
from iplotter import C3Plotter
In [6]:
# C3 plotter instance used by the C3 chart cell below (`c3.plot(...)`).
c3 = C3Plotter()

Time series of mean gross by title year

In [46]:
# Mean gross per `title_year`, kept as a one-column DataFrame for the plotter.
# `gross` is selected *before* aggregating so that only that column is
# averaged; calling `.mean()` on the whole grouped frame also averages every
# other column and raises TypeError on non-numeric columns in pandas >= 2.0.
# Years whose mean is NaN (no gross figures) are plotted as 0.
plot_data = data.movie.groupby('title_year')[['gross']].mean().fillna(0)
c3.plot(plot_data, zoom=True)
Out[46]:
In [25]:
# Number of movies per country.
# `size()` counts rows in each group. The previous `count()['duration']`
# counted only rows with a non-null duration, so movies with a missing
# duration were silently left out of their country's total.
country_group = data.movie.groupby('country').size()

# Plain Python lists (same order) for embedding in the chart definition.
counts = country_group.tolist()
countries = country_group.index.tolist()
In [47]:
from iplotter import PlotlyPlotter
from IPython.display import HTML

plotly = PlotlyPlotter()

# Choropleth trace: one entry per country, shaded by the number of movies.
# `countries` and `counts` come from the per-country aggregation cell above
# and must be in the same order.
plotly_chart = [{
    "type": 'choropleth',
    # Locations are matched by country name.
    "locationmode": 'country names',
    "locations": countries,
    "z": counts,
    "zmin": 0,
    # default=0 keeps the cell from raising if no countries were found.
    "zmax": max(counts, default=0),
    # Light-to-dark purple ramp over the normalised [0, 1] range.
    "colorscale": [
        [0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
        [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
        [0.8, 'rgb(117,107,177)'], [1, 'rgb(84,39,143)']
    ],
    "colorbar": {
        "title": 'Count',
        "thickness": 10
    },
    # White outlines between countries.
    "marker": {
        "line": {
            "color": 'rgb(255,255,255)',
            "width": 2
        }
    }
}]

plotly_layout = {
    "title": 'Movie Counts by Country',
    "geo": {
        # 'country names' is a `locationmode` value, not a geo scope; the
        # data covers countries worldwide, so use the 'world' scope.
        "scope": 'world',
    }
}

# Pass the layout along with the trace; previously it was built but never
# used, so the title and geo settings were dropped.
# NOTE(review): assumes this IPlotter branch accepts `layout=` like the
# upstream PlotlyPlotter.plot -- confirm.
# The result is only stored here; evaluate `country_plot` in a cell to
# display the chart.
country_plot = plotly.plot(data=plotly_chart, layout=plotly_layout)

Movies by Country

In [ ]: