diff --git a/notebooks/Explore Movie Dataset.html b/notebooks/Explore Movie Dataset.html new file mode 100644 index 0000000..3cc537b --- /dev/null +++ b/notebooks/Explore Movie Dataset.html @@ -0,0 +1,721 @@ + +
+
+
+
+

Explore Movie Dataset

+
+
+
+
+
+
In [1]:
+
+
+
import os
+import pandas as pd
+import settings
+import etl
+
+%matplotlib inline
+
+%load_ext watermark
+%watermark -d -t -v -m -p pea,pandas
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
2017-06-26 18:57:49 
+
+CPython 3.6.1
+IPython 6.1.0
+
+pea 0.0.7
+pandas 0.20.2
+
+compiler   : MSC v.1900 64 bit (AMD64)
+system     : Windows
+release    : 7
+machine    : AMD64
+processor  : Intel64 Family 6 Model 42 Stepping 7, GenuineIntel
+CPU cores  : 8
+interpreter: 64bit
+
+
+
+ +
+
+ +
+
+
+
In [2]:
+
+
+
data = etl.Data()  # data container provided by the imported etl module
+data.load()  # presumably loads the dataset that later cells access as data.movie (confirm in etl)
+
+ +
+
+
+ +
+
+
+
+
+

Available Columns

+
+
+
+
+
+
In [3]:
+
+
+
data.movie.columns  # show the column labels available on data.movie
+
+ +
+
+
+ +
+
+ + +
+ +
Out[3]:
+ + + + +
+
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
+       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
+       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
+       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
+       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
+       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
+       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
+       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
+      dtype='object')
+
+ +
+ +
+
+ +
+
+
+
+
+

Plotting with IPlotter

This example uses my own branch of IPlotter, which builds the chart dictionary directly from a pandas DataFrame. That is much less verbose, but the same plots can also be produced with the current version on PyPI.

+ +
+
+
+
+
+
In [5]:
+
+
+
from iplotter import C3Plotter
+
+ +
+
+
+ +
+
+
+
In [6]:
+
+
+
c3 = C3Plotter()  # C3 plotter instance used by the time-series cell below
+
+ +
+
+
+ +
+
+
+
+
+

Timeseries of mean gross

+
+
+
+
+
+
In [46]:
+
+
+
plot_data = data.movie.groupby(['title_year']).mean()[['gross']].fillna(0)
+c3.plot(plot_data, zoom=True)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[46]:
+ + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [25]:
+
+
+
country_group = data.movie.groupby('country').count()['duration']
+counts = country_group.values.tolist()
+countries = country_group.index.values.tolist()
+
+ +
+
+
+ +
+
+
+
In [47]:
+
+
+
from iplotter import PlotlyPlotter
+from IPython.display import HTML
+
+plotly = PlotlyPlotter()
+
+c3_plotter = C3Plotter()
+
+plotly_chart = [{
+    "type": 'choropleth',
+    "locationmode": 'country names',
+    "locations": countries,
+    "z": counts,
+    "zmin": 0,
+    "zmax": max(counts),
+    "colorscale": [
+        [0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
+        [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
+        [0.8, 'rgb(117,107,177)'], [1, 'rgb(84,39,143)']
+    ],
+    "colorbar": {
+        "title": 'Count',
+        "thickness": 10
+    },
+    "marker": {
+        "line": {
+            "color": 'rgb(255,255,255)',
+            "width": 2
+        }
+    }
+}]
+
+plotly_layout = {
+    "title": 'Movie Counts by Country',
+    "geo": {
+        "scope": 'country names',
+    }
+}
+
+
+
+country_plot = plotly.plot(data=plotly_chart)
+
+ +
+
+
+ +
+
+
+
+
+

Movies by Country

+
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+ +