diff --git a/notebooks/Explore Movie Dataset.ipynb b/notebooks/Explore Movie Dataset.ipynb new file mode 100644 index 0000000..fc35c86 --- /dev/null +++ b/notebooks/Explore Movie Dataset.ipynb @@ -0,0 +1,503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore Movie Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-06-26 18:57:49 \n", + "\n", + "CPython 3.6.1\n", + "IPython 6.1.0\n", + "\n", + "pea 0.0.7\n", + "pandas 0.20.2\n", + "\n", + "compiler : MSC v.1900 64 bit (AMD64)\n", + "system : Windows\n", + "release : 7\n", + "machine : AMD64\n", + "processor : Intel64 Family 6 Model 42 Stepping 7, GenuineIntel\n", + "CPU cores : 8\n", + "interpreter: 64bit\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "import settings\n", + "import etl\n", + "\n", + "%matplotlib inline\n", + "\n", + "%load_ext watermark\n", + "%watermark -d -t -v -m -p pea,pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = etl.Data()\n", + "data.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Available Columns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',\n", + " 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',\n", + " 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',\n", + " 'movie_title', 'num_voted_users', 'cast_total_facebook_likes',\n", + " 'actor_3_name', 'facenumber_in_poster', 'plot_keywords',\n", + " 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',\n", + " 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',\n", + " 'imdb_score', 'aspect_ratio', 
'movie_facebook_likes'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.movie.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## plotting with IPlotter\n", + "\n", + "This example is using my own branch of IPlotter which builds the dictionary from a pandas DataFrame. Much less verbose, but can be done with the current version on PyPI." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from iplotter import C3Plotter" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "c3 = C3Plotter()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timeseries of mean gross" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "plot_data = data.movie.groupby(['title_year'])[['gross']].mean().fillna(0)  # aggregate only the plotted column; years with no gross data become 0\n", + "c3.plot(plot_data, zoom=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "country_group = data.movie.groupby('country')['duration'].count()  # count() tallies non-null durations per country\n", + "counts = country_group.values.tolist()\n", + "countries = country_group.index.values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "from iplotter import PlotlyPlotter\n", + "from IPython.display import HTML\n", + "\n", + "plotly = PlotlyPlotter()\n", + "\n", + "c3_plotter = C3Plotter()\n", + "\n", + "plotly_chart = [{\n", + " \"type\": 'choropleth',\n", + " \"locationmode\": 'country names',\n", + " \"locations\": countries,\n", + " \"z\": counts,\n", + " 
\"zmin\": 0,\n", + " \"zmax\": max(counts),\n", + " \"colorscale\": [\n", + " [0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],\n", + " [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],\n", + " [0.8, 'rgb(117,107,177)'], [1, 'rgb(84,39,143)']\n", + " ],\n", + " \"colorbar\": {\n", + " \"title\": 'Count',\n", + " \"thickness\": 10\n", + " },\n", + " \"marker\": {\n", + " \"line\": {\n", + " \"color\": 'rgb(255,255,255)',\n", + " \"width\": 2\n", + " }\n", + " }\n", + "}]\n", + "\n", + "plotly_layout = {\n", + " \"title\": 'Movie Counts by Country',\n", + " \"geo\": {\n", + " \"scope\": 'world',  # geo scope is a region name, not a locationmode\n", + " }\n", + "}\n", + "\n", + "\n", + "\n", + "country_plot = plotly.plot(data=plotly_chart, layout=plotly_layout)  # layout carries the title and geo settings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "variables": { + " country_plot ": "" + } + }, + "source": [ + "### Movies by Country\n", + "\n", + "{{ country_plot }}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}