diff --git a/requirements.txt b/requirements.txt index f473b21..051b877 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,7 @@ flask -flask-cors flask-restplus gunicorn markdown nltk pandas textblob --e git+https://github.com/WaylonWalker/iplotter.git@master#egg=iplotter \ No newline at end of file diff --git a/src/iplotter/__init__.py b/src/iplotter/__init__.py new file mode 100644 index 0000000..4cd17ca --- /dev/null +++ b/src/iplotter/__init__.py @@ -0,0 +1,8 @@ +from .base_plotter import IPlotter +# from .export import VirtualBrowser +from .c3_plotter import C3Plotter +from .plotly_plotter import PlotlyPlotter +from .chartjs_plotter import ChartJSPlotter +from .chartist_plotter import ChartistPlotter +from .google_plotter import GCPlotter +__version__ = '0.4.3' diff --git a/src/iplotter/base_plotter.py b/src/iplotter/base_plotter.py new file mode 100644 index 0000000..70c86ee --- /dev/null +++ b/src/iplotter/base_plotter.py @@ -0,0 +1,55 @@ +from abc import ABCMeta, abstractmethod +import re +import time +# from selenium import webdriver +import os + + +class IPlotter(object): + """Abstract IPlotter""" + + __metaclass__ = ABCMeta + iframe = '' + invalid_name_pattern = re.compile(r'[^a-zA-Z0-9_\-\. 
]+') + + def __init__(self): + super(IPlotter, self).__init__() + + @classmethod + def is_valid_name(cls, name): + ''' + check whether plot div id or filenname are valid + ''' + if (cls.invalid_name_pattern.search(name)): + return False + else: + return True + + @abstractmethod + def render(self): + ''' + render the data in HTML template + ''' + pass + + @abstractmethod + def plot(self): + ''' + output an iframe containing the plot in the notebook without saving + ''' + pass + + @abstractmethod + def save(self): + ''' + save the rendered html to a file in the same directory as the notebook + ''' + pass + + @abstractmethod + def plot_and_save(self): + ''' + save the rendered html to a file and return an IFrame to display the + plot in the notebook + ''' + pass diff --git a/src/iplotter/c3_plotter.py b/src/iplotter/c3_plotter.py new file mode 100644 index 0000000..cbfd12f --- /dev/null +++ b/src/iplotter/c3_plotter.py @@ -0,0 +1,386 @@ +from jinja2 import Template +from IPython.display import IFrame, HTML +import os +import json +from .base_plotter import IPlotter +import math + + +class C3Plotter(IPlotter): + """ + Class for creating c3.js charts in ipython notebook + """ + + head = ''' + + + + + + + + ''' + + template = ''' +

{{title}}

+
+ + ''' + + def __init__(self): + super(C3Plotter, self).__init__() + + def render(self, + data, + div_id="chart", + custom_css='', + title="", + head="", + y_axis_tick_format='', + secondary_y_axis_tick_format='' + , + **kwargs): + ''' + render the data in HTML template + ''' + try: + data = self.pandas_data(data, **kwargs) + except AttributeError: + pass + + if not self.is_valid_name(div_id): + raise ValueError( + "Name {} is invalid. Only letters, numbers, '_', and '-' are permitted ".format( + div_id)) + + return Template(head + self.template).render( + div_id=div_id.replace(" ", "_"), + custom_css=custom_css, + title=title, + y_axis_tick_format=y_axis_tick_format, + secondary_y_axis_tick_format=secondary_y_axis_tick_format, + data=json.dumps( + data, indent=4).replace("'", "\\'").replace('"', "'")) + + def plot_and_save(self, + data, + w=800, + h=430, + filename='chart', + subplots=False, + subplot_groups=False, + title=False, + overwrite=True): + ''' + save the rendered html to a file and returns an IFrame to display the plot in the notebook + ''' + self.save(data, filename, overwrite,) + return IFrame(filename + '.html', w, h) + + def plot(self, + data, + w=800, + h=430, + div_id='chart', + subplots=False, + subplot_groups=False, + title=False, + **kwargs): + ''' + output an iframe containing the plot in the notebook without saving + ''' + + if subplots: + if title: + if len(title) > 0: + title = title + '
' + body = '' + + if not subplot_groups: + subplot_groups = {col: [col] for col in data.columns} + + for group in subplot_groups: + body = body + (self.render(data=data[subplot_groups[group]], + div_id=str(div_id) + str(group), + head=self.head, + title=str(title) + str(group), + **kwargs + ) + ) + title='' + else: + body = self.render( + data=data, + div_id=div_id, + head=self.head, + **kwargs) + + return HTML(self.iframe.format(source=body, w=w, h=h*len(subplot_groups))) + + def update(): + pass + + def save(self, data, filename='chart', overwrite=True): + ''' + save the rendered html to a file in the same directory as the notebook + ''' + try: + data = self.pandas_data(data, **kwargs) + except AttributeError: + pass + + html = self.render(data=data, div_id=filename, head=self.head) + if overwrite: + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + if not os.path.exists(filename.replace(" ", "_") + '.html'): + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + raise IOError('File Already Exists!') + + def pandas_data(self, + df, + colors=False, + data_label_formats=False, + data_labels=False, + grid=False, + group=False, + height=300, + hue=False, + kind='line', + kinds=None, + legend=True, + mark_right=False, + point=False, + secondary_y=list(), + stacked=False, + subchart=False, + subplots=False, + tick_count=10, + value=False, + value_labels=False, + x_axis_tick_culling=False, + x_axis_type='auto', + x_tick_values=False, + xlabels=False, + xlim=False, + xregions=False, + xy_rotated=False, + ylabels=False, + ylim=False, + yregions=False, + zoom=False, + ): + ''' + create data dictionary from pandas DataFrame + + TODO: + ## Pandas Features + * proper docstring + * subplots + * layout + * height -> figsize + * use_index + * legend + * xlim (axis.x.min, axis.x.max or axis.x.extent) + * ylim + * colorbar + * table + * axis-rotation + ## Seaborn-esque features + * hue - ability to provide long 
form data + ## C3 Features + * interaction: {enabled: false} + * transition: {duration: 500} + * onrendered: function() {...} + * onmouseover/out + * data.empty.label.text + * data.selection.enabled + * data.selection.grouped + * data.selection.multiple + * data.selection.draggable + * axis.x.tick.fit + * axis.x.tick.values + * axis.x.tick.rotate + * axis.x.label + * axis.x.show + * legend.hide + * legend.position + * tooltip.show + * tooltip.grouped + * point.focus.expand.enabled + * subchart.size.height + * point.focus.expand.r + * point.select.r + * line.connectNull + + + param kind: str + * line + * spline + * step + * areas + * area-spline + * area-step + * bar + * scatter + * pie + * donut + * gauge + + param x_axis_type: str + * timeseries + * category + * numeric + + ''' + # kinds = ['line', 'spline', 'step', 'area','area-spline', 'area-step', + # 'bar', 'scatter', 'pie', 'donut', 'gauge'] + + data = { + 'size': { + 'height': height, + }, + "data": { + 'x': 'x', + 'axes': dict() + }, + 'subchart': { + 'show': subchart + }, + 'point': { + 'show': point + }, + 'grid': { + 'x': { + 'show': grid + }, + 'y': { + 'show': grid + } + }, + 'axis': { + 'rotated': xy_rotated, + 'x': {'tick': {'count': tick_count, + 'values': x_tick_values, + 'culling': dict(), + }, + }, + 'y': {'tick': {'format': ''}}, + 'y2': {'tick': {}}, + }, + 'zoom': {} + + } + if kind: + data['data']['type'] = kind + if kinds: + data['data']['types'] = kinds + + if mark_right: + df = df.rename( + columns={col: col + '(right)' for col in secondary_y}) + secondary_y = [y + '(right)' for y in secondary_y] + if hue and value: + df = df.groupby([df.index.name, hue])[value].sum().unstack() + + df = df.copy() + df['x'] = df.index + df['x'] = df['x'].astype('str').values.tolist() + + data['data']['columns'] = [[col] + df[col].values.tolist() + for col in df.columns] + # data['data']['columns'].extend([['x'] + df.index.astype('str').values.tolist()]) + for col in df.columns: + if col in secondary_y: + 
data['data']['axes'][col] = 'y2' + else: + data['data']['axes'][col] = 'y' + if len(secondary_y) > 0: + data['axis']['y2']['show'] = True + + if colors: + # repeat color palette if not long enough + colors = colors*math.ceil(len(df.columns)/len(colors)) + color_data = {} + for col, color in zip(df.columns, colors): + color_data[col] = color + data['data']['colors'] = color_data + + if x_axis_type == 'auto': + index_type = str(df.index.dtype) + + if 'date' in index_type: + data['axis']['x']['type'] = 'timeseries' + data['axis']['x']['tick']['format'] = '%Y-%m-%d' + + if 'object' in index_type or 'category' in index_type: + data['axis']['x']['type'] = 'category' + data['axis']['x']['tick']['culling'][ + 'max'] = x_axis_tick_culling + else: + if 'date' in x_axis_type or 'time' in x_axis_type: + data['axis']['x']['type'] = 'timeseries' + data['axis']['x']['tick']['format'] = '%Y-%m-%d' + + if 'categor' in x_axis_type or 'str' in x_axis_type: + data['axis']['x']['type'] = 'category' + data['axis']['x']['tick']['culling'][ + 'max'] = x_axis_tick_culling + + if xlim: + data['axis']['x']['min'] = xlim[0] + data['axis']['x']['max'] = xlim[1] + + if ylim: + data['axis']['y']['min'] = ylim[0] + data['axis']['y']['max'] = ylim[1] + + if stacked: + group = df.columns.values.tolist() + group.pop(-1) + group = [group] + + if group: + data['data']['groups'] = group + + if zoom: + data['zoom']['enabled'] = True + data['zoom']['rescale'] = True + + if xregions: + data['regions'] = [{'axis': 'x', 'start': region[ + 0], 'end':region[1]} for region in xregions] + + if yregions: + data['regions'] = [{'axis': 'y', 'start': region[ + 0], 'end':region[1]} for region in yregions] + + if xlabels: + data['grid']['x']['lines'] = [ + {'value': label[0], 'text': label[1]} for label in xlabels] + + if ylabels: + data['grid']['y']['lines'] = [ + {'value': label[0], 'text': label[1]} for label in ylabels] + + if data_labels: + if data_labels == True: + data_labels = df.drop('x', axis=1).columns + 
if data_label_formats: + data['data']['labels'] = {} + for column in data_label_formats: + data['data']['labels'][column] = data_label_formats[column] + else: + data['data']['labels'] = True + + return data diff --git a/src/iplotter/chartist_plotter.py b/src/iplotter/chartist_plotter.py new file mode 100644 index 0000000..30d9e92 --- /dev/null +++ b/src/iplotter/chartist_plotter.py @@ -0,0 +1,98 @@ +from jinja2 import Template +from IPython.display import IFrame, HTML +import os +import json +from .base_plotter import IPlotter + + +class ChartistPlotter(IPlotter): + """ + Class for creating chartist.js charts in ipython notebook + """ + + head = ''' + + + + ''' + + template = ''' +
+ + ''' + + def __init__(self): + super(ChartistPlotter, self).__init__() + + def render(self, data, chart_type, options=None, div_id="chart", head=""): + ''' + render the data in HTML template + ''' + if not self.is_valid_name(div_id): + raise ValueError( + "Name {} is invalid. Only letters, numbers, '_', and '-' are permitted ".format( + div_id)) + + return Template(head + self.template).render( + div_id=div_id.replace(" ", "_"), + data=json.dumps( + data, indent=4).replace("'", "\\'").replace('"', "'"), + chart_type=chart_type, + options=json.dumps( + options, indent=4).replace("'", "\\'").replace('"', "'")) + + def plot_and_save(self, + data, + chart_type, + options=None, + w=800, + h=420, + filename='chart', + overwrite=True): + ''' + save the rendered html to a file and return an IFrame to display the plot in the notebook + ''' + self.save(data, chart_type, options, filename, overwrite) + return IFrame(filename + '.html', w, h) + + def plot(self, data, chart_type, options=None, w=800, h=420): + ''' + output an iframe containing the plot in the notebook without saving + ''' + return HTML( + self.iframe.format( + source=self.render( + data=data, + options=options, + chart_type=chart_type, + head=self.head), + w=w, + h=h)) + + def save(self, + data, + chart_type, + options=None, + filename='chart', + overwrite=True): + ''' + save the rendered html to a file in the same directory as the notebook + ''' + html = self.render( + data=data, + chart_type=chart_type, + options=options, + div_id=filename, + head=self.head) + + if overwrite: + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + if not os.path.exists(filename.replace(" ", "_") + '.html'): + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + raise IOError('File Already Exists!') diff --git a/src/iplotter/chartjs_plotter.py b/src/iplotter/chartjs_plotter.py new file mode 100644 index 0000000..e8e3762 --- /dev/null +++ 
b/src/iplotter/chartjs_plotter.py @@ -0,0 +1,115 @@ +from jinja2 import Template +from IPython.display import IFrame, HTML +import os +import json +from .base_plotter import IPlotter + + +class ChartJSPlotter(IPlotter): + """ + Class for creating charts.js charts in ipython notebook + """ + + head = ''' + + + ''' + + template = ''' + + + ''' + + def __init__(self): + super(ChartJSPlotter, self).__init__() + + def render(self, + data, + chart_type, + options=None, + div_id="chart", + head="", + w=800, + h=420): + ''' + render the data in HTML template + ''' + if not self.is_valid_name(div_id): + raise ValueError( + "Name {} is invalid. Only letters, numbers, '_', and '-' are permitted ".format( + div_id)) + + return Template(head + self.template).render( + div_id=div_id.replace(" ", "_"), + data=json.dumps( + data, indent=4).replace("'", "\\'").replace('"', "'"), + chart_type=chart_type, + options=json.dumps( + options, indent=4).replace("'", "\\'").replace('"', "'"), + w=w, + h=h) + + def plot_and_save(self, + data, + chart_type, + options=None, + w=800, + h=420, + filename='chart', + overwrite=True): + ''' + save the rendered html to a file and return an IFrame to display the plot in the notebook + ''' + self.save(data, chart_type, options, filename, w, h, overwrite) + return IFrame(filename + '.html', w, h) + + def plot(self, data, chart_type, options=None, w=800, h=420): + ''' + output an iframe containing the plot in the notebook without saving + ''' + return HTML( + self.iframe.format( + source=self.render( + data=data, + chart_type=chart_type, + options=options, + head=self.head, + w=w, + h=h), + w=w, + h=h)) + + def save(self, + data, + chart_type, + options=None, + filename='chart', + w=800, + h=420, + overwrite=True): + ''' + save the rendered html to a file in the same directory as the notebook + ''' + html = self.render( + data=data, + chart_type=chart_type, + options=options, + div_id=filename, + head=self.head, + w=w, + h=h) + + if overwrite: + with 
open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + if not os.path.exists(filename.replace(" ", "_") + '.html'): + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + raise IOError('File Already Exists!') diff --git a/src/iplotter/export.py b/src/iplotter/export.py new file mode 100644 index 0000000..8c065b7 --- /dev/null +++ b/src/iplotter/export.py @@ -0,0 +1,35 @@ +import time +from selenium import webdriver +import os + + +class VirtualBrowser(object): + """Helper class for converting html charts to png""" + + def __init__(self, driver=webdriver.Chrome): + super(VirtualBrowser, self).__init__() + self.driver = driver() + + def __enter__(self): + return self + + def save_as_png(self, filename, width=300, height=250, render_time=1): + ''' + open saved html file in an virtual browser and save a screen shot to PNG format + ''' + self.driver.set_window_size(width, height) + self.driver.get('file://{path}/{filename}'.format( + path=os.getcwd(), filename=filename + ".html")) + time.sleep(render_time) + self.driver.save_screenshot(filename + ".png") + + def __exit__(self, type, value, traceback): + self.driver.quit() + return True + + def quit(self): + ''' + shutdown virtual browser when finished + ''' + self.driver.quit() + return True \ No newline at end of file diff --git a/src/iplotter/google_plotter.py b/src/iplotter/google_plotter.py new file mode 100644 index 0000000..2dcb163 --- /dev/null +++ b/src/iplotter/google_plotter.py @@ -0,0 +1,124 @@ +from jinja2 import Template +from IPython.display import IFrame, HTML +import os +import json +from .base_plotter import IPlotter + + +class GCPlotter(IPlotter): + """ + Class for creating Google Charts in ipython notebook + """ + head = ''' + + + ''' + + template = ''' +
+ + ''' + + def __init__(self): + super(GCPlotter, self).__init__() + + def render(self, + data, + chart_type, + chart_package='corechart', + options=None, + div_id="chart", + head=""): + ''' + render the data in HTML template + ''' + if not self.is_valid_name(div_id): + raise ValueError( + "Name {} is invalid. Only letters, numbers, '_', and '-' are permitted ".format( + div_id)) + + return Template(head + self.template).render( + div_id=div_id.replace(" ", "_"), + data=json.dumps( + data, indent=4).replace("'", "\\'").replace('"', "'"), + chart_type=chart_type, + chart_package=chart_package, + options=json.dumps( + options, indent=4).replace("'", "\\'").replace('"', "'")) + + def plot_and_save(self, + data, + chart_type, + chart_package='corechart', + options=None, + w=800, + h=420, + filename='chart', + overwrite=True): + ''' + save the rendered html to a file and return an IFrame to display the plot in the notebook + ''' + self.save(data, chart_type, chart_package, options, filename, + overwrite) + return IFrame(filename + '.html', w, h) + + def plot(self, + data, + chart_type, + chart_package='corechart', + options=None, + w=800, + h=420): + ''' + output an iframe containing the plot in the notebook without saving + ''' + return HTML( + self.iframe.format( + source=self.render( + data=data, + options=options, + chart_type=chart_type, + chart_package=chart_package, + head=self.head), + w=w, + h=h)) + + def save(self, + data, + chart_type, + chart_package='corechart', + options=None, + filename='chart', + overwrite=True): + ''' + save the rendered html to a file in the same directory as the notebook + ''' + html = self.render( + data=data, + chart_type=chart_type, + chart_package=chart_package, + options=options, + div_id=filename, + head=self.head) + + if overwrite: + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + if not os.path.exists(filename.replace(" ", "_") + '.html'): + with open(filename.replace(" ", "_") + '.html', 
'w') as f: + f.write(html) + else: + raise IOError('File Already Exists!') diff --git a/src/iplotter/plotly_plotter.py b/src/iplotter/plotly_plotter.py new file mode 100644 index 0000000..dec9677 --- /dev/null +++ b/src/iplotter/plotly_plotter.py @@ -0,0 +1,88 @@ +from jinja2 import Template +from IPython.display import IFrame, HTML +import os +import json +from .base_plotter import IPlotter + + +class PlotlyPlotter(IPlotter): + """ + Class for creating plotly.js charts in ipython notebook + """ + + head = ''' + + + + + ''' + + template = ''' +
+ + ''' + + def __init__(self): + super(PlotlyPlotter, self).__init__() + + def render(self, data, layout=None, div_id="chart", head=""): + ''' + render the data in HTML template + ''' + if not self.is_valid_name(div_id): + raise ValueError( + "Name {} is invalid. Only letters, numbers, '_', and '-' are permitted ".format( + div_id)) + + return Template(head + self.template).render( + div_id=div_id.replace(" ", "_"), + data=json.dumps( + data, indent=4).replace("'", "\\'").replace('"', "'"), + layout=json.dumps( + layout, indent=4).replace("'", "\\'").replace('"', "'")) + + def plot_and_save(self, + data, + layout=None, + w=800, + h=420, + filename='chart', + overwrite=True): + ''' + save the rendered html to a file and return an IFrame to display the plot in the notebook + ''' + self.save(data, layout, filename, overwrite) + return IFrame(filename + '.html', w, h) + + def plot(self, data, layout=None, w=800, h=420): + ''' + output an iframe containing the plot in the notebook without saving + ''' + return HTML( + self.iframe.format( + source=self.render( + data=data, + layout=layout, + head=self.head, ), + w=w, + h=h)) + + def save(self, data, layout=None, filename='chart', overwrite=True): + ''' + save the rendered html to a file in the same directory as the notebook + ''' + html = self.render( + data=data, layout=layout, div_id=filename, head=self.head) + + if overwrite: + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + if not os.path.exists(filename.replace(" ", "_") + '.html'): + with open(filename.replace(" ", "_") + '.html', 'w') as f: + f.write(html) + else: + raise IOError('File Already Exists!') diff --git a/src/pyDataVizDay.py b/src/pyDataVizDay.py index 02d7c31..ffc66f3 100644 --- a/src/pyDataVizDay.py +++ b/src/pyDataVizDay.py @@ -14,20 +14,16 @@ from collections import Counter from flask import Flask from flask import request, render_template, make_response, jsonify, Blueprint, url_for from flask_restplus import 
Resource, Api, fields, reqparse -from flask_cors import CORS, cross_origin import markdown from textblob import TextBlob import settings import etl -import palettes as pal from iplotter import C3Plotter c3 = C3Plotter() app = Flask(__name__) -# disq = Disqus(app) -CORS(app) api_blueprint = Blueprint('api', __name__, url_prefix='/api') api = Api(api_blueprint, title='pyDataVizday api', default='pyDataVizDay', diff --git a/src/static/css/custom.css b/src/static/css/custom.css index 516d831..6513687 100644 --- a/src/static/css/custom.css +++ b/src/static/css/custom.css @@ -46,12 +46,6 @@ pre { } -img.blur { - width:367; - height:459px; - -webkit-filter: blur(10px); - filter: blur(10px); -} /*@keyframes fadein{ 0% { opacity:0; } diff --git a/src/templates/Exploratory_Charts-Movie_Data-Latest.html b/src/templates/Exploratory_Charts-Movie_Data-Latest.html index 70677dd..9600691 100644 --- a/src/templates/Exploratory_Charts-Movie_Data-Latest.html +++ b/src/templates/Exploratory_Charts-Movie_Data-Latest.html @@ -1,7 +1,7 @@
-
In [1]:
+
In [2]:
import numpy as np # linear algebra
@@ -15,7 +15,7 @@
 
-
In [2]:
+
In [12]:
from subprocess import check_output
@@ -48,7 +48,7 @@
 
-
In [3]:
+
In [14]:
import os
@@ -96,7 +96,7 @@
 
-
In [20]:
+
In [7]:
f = pd.read_csv("C:/Users/alurus/GIT-Repository/VIZ Day/pyDataVizDay2/data/raw/movie_metadata.csv")
@@ -109,11 +109,11 @@
 
-
In [21]:
+
In [14]:
data=DataFrame(f)
-data.head()[:2]
+data.head()[:5]
 
@@ -126,2803 +126,7 @@
-
Out[21]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
-

2 rows × 28 columns

-
-
- -
- -
-
- -
-
-
-
In [22]:
-
-
-
X_data=data.dtypes[data.dtypes!='object'].index
-X_train=data[X_data]
-X_train.head()[:2] 
-
- -
-
-
- -
-
- - -
- -
Out[22]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
num_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_1_facebook_likesgrossnum_voted_userscast_total_facebook_likesfacenumber_in_posternum_user_for_reviewsbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0723.0178.00.0855.01000.0760505847.088620448340.03054.0237000000.02009.0936.07.91.7833000
1302.0169.0563.01000.040000.0309404152.0471220483500.01238.0300000000.02007.05000.07.12.350
-
-
- -
- -
-
- -
-
-
-
In [32]:
-
-
-
# GETTING Correllation matrix
-corr_mat=X_train.corr(method='pearson')
-plt.figure(figsize=(20,10))
-sns.heatmap(corr_mat,vmax=1,square=True,annot=True,cmap='cubehelix')
-
- -
-
-
- -
-
- - -
- -
Out[32]:
- - - - -
-
<matplotlib.axes._subplots.AxesSubplot at 0x106d77f0>
-
- -
- -
- -
- - - - -
- -
- -
- -
-
- -
-
-
-
In [19]:
-
-
-
!jupyter nbconvert Exploratory_Charts-Movie_Data-Copy2.ipynb --template basic
-
- -
-
-
- -
-
- - -
- -
- - -
-
[NbConvertApp] Converting notebook Exploratory_Charts-Movie_Data-Copy2.ipynb to html
-[NbConvertApp] Writing 206376 bytes to Exploratory_Charts-Movie_Data-Copy2.html
-
-
-
- -
-
- -
-
-
-
In [5]:
-
-
-
df = pd.read_csv('C:/Users/alurus/GIT-Repository/VIZ Day/pyDataVizDay2/data/raw/movie_metadata.csv')
-
- -
-
-
- -
-
-
-
In [6]:
-
-
-
df.head()
-
- -
-
-
- -
-
- - -
- -
Out[6]:
+
Out[14]:
@@ -3103,7 +307,2828 @@ uhbmo2gAAAAASUVORK5CYII=
-
In [7]:
+
In [10]:
+
+
+
X_data=data.dtypes[data.dtypes!='object'].index
+
+ +
+
+
+ +
+
+
+
In [11]:
+
+
+
X_train=data[X_data]
+X_train.head()[:2] 
+
+ +
+
+
+ +
+
+ + +
+ +
Out[11]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
num_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_1_facebook_likesgrossnum_voted_userscast_total_facebook_likesfacenumber_in_posternum_user_for_reviewsbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0723.0178.00.0855.01000.0760505847.088620448340.03054.0237000000.02009.0936.07.91.7833000
1302.0169.0563.01000.040000.0309404152.0471220483500.01238.0300000000.02007.05000.07.12.350
+
+
+ +
+ +
+
+ +
+
+
+
+
+

GETTING Correlation matrix

+
+
+
+
+
+
In [13]:
+
+
+
# GETTING Correlation matrix
+corr_mat=X_train.corr(method='pearson')
+plt.figure(figsize=(23,10))
+sns.heatmap(corr_mat,vmax=1,square=True,annot=True,cmap='Oranges');
+
+ +
+
+
+ +
+
+ + +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [33]:
+
+
+
df = pd.read_csv('C:/Users/alurus/GIT-Repository/VIZ Day/pyDataVizDay/data/raw/movie_metadata.csv')
+
+ +
+
+
+ +
+
+
+
In [34]:
+
+
+
df.head()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[34]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
4NaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentary...NaNNaNNaNNaNNaNNaN12.07.1NaN0
+

5 rows × 28 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+

Director Vs Share by Year

+
+
+
+
+
+
In [29]:
df['diff_gross'] = df['gross'] - df['budget']
@@ -3117,10 +3142,10 @@ uhbmo2gAAAAASUVORK5CYII=
                                       aggfunc='sum')
 
 
-fig,ax = plt.subplots(figsize=(8,6))
-sns.heatmap(director_budge_pivot['diff_gross'],vmin=0,annot=False,linewidth=.5,ax=ax,cmap='PuBu')
-plt.title('Director vs Year and diff_gross')
-plt.ylabel('Year')
+fig,ax = plt.subplots(figsize=(14,8))
+sns.heatmap(director_budge_pivot['diff_gross'],vmin=0,annot=False,linewidth=.5,ax=ax,cmap='Oranges')
+plt.title('Director vs Year and Share')
+plt.ylabel('Year');
 
@@ -3131,19 +3156,6 @@ uhbmo2gAAAAASUVORK5CYII=
-
- -
Out[7]:
- - - - -
-
<matplotlib.text.Text at 0xd419a58>
-
- -
-
@@ -3152,583 +3164,771 @@ uhbmo2gAAAAASUVORK5CYII=
-
@@ -3741,7 +3941,7 @@ OJQeAAAAAElFTkSuQmCC
-
In [8]:
+
In [18]:
data = pd.read_csv("C:/Users/alurus/GIT-Repository/VIZ Day/pyDataVizDay2/data/raw/movie_metadata.csv")
@@ -3751,15 +3951,23 @@ OJQeAAAAAElFTkSuQmCC
 
+
+
+
+
+
+

IMDB Score In Histogram

+
+
-
In [40]:
+
In [50]:
-
matplotlib.rcParams['figure.figsize'] = (9.0, 5.0)
+
matplotlib.rcParams['figure.figsize'] = (18, 9.0)
 scores = pd.DataFrame({"imdb score":data["imdb_score"]})
-scores.hist(bins=20)
+scores.hist(bins=20);
 
@@ -3770,19 +3978,6 @@ OJQeAAAAAElFTkSuQmCC
-
- -
Out[40]:
- - - - -
-
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000012C971D0>]], dtype=object)
-
- -
-
@@ -3791,140 +3986,183 @@ OJQeAAAAAElFTkSuQmCC
-
@@ -3934,10 +4172,18 @@ TkSuQmCC
+
+
+
+
+
+

IMDB Score By Count Plot

+
+
-
In [9]:
+
In [26]:
plt.figure(figsize = (18, 9))
@@ -4191,13 +4437,21 @@ n0ty44m7n1Br/WKLfgCA5fNRDgAAAKAZH+UAAAAAmhFMAAAAAM0IJgAAAIBmBBMAAABAM/8fUHwp
 
+
+
+
+
+
+

Count by Content Rating - Count Plot

+
+
-
In [10]:
+
In [21]:
-
plt.figure(figsize = (12, 9))
+
plt.figure(figsize = (11, 4))
 sns.countplot(x = 'content_rating', data = data)
 xt = plt.xticks(rotation=56)
 
@@ -4218,293 +4472,257 @@ n0ty44m7n1Br/WKLfgCA5fNRDgAAAKAZH+UAAAAAmhFMAAAAAM0IJgAAAIBmBBMAAABAM/8fUHwp
- -
-
In [ ]:
+
+
-
-
 
-
- +
+

Content Rating Vs IMDB Score - Box Plot

-
-
@@ -4886,14 +5099,23 @@ RKRo/x9J7Ry013cL/gAAAABJRU5ErkJggg==
+
+
+
+
+
+

Content Rating Vs IMDB Score - Violin Plot

+
+
-
In [14]:
+
In [33]:
plt.figure(figsize = (11, 4))
 sns.violinplot('content_rating', 'imdb_score', data = data)
+xt = plt.xticks(rotation=45)
 
@@ -4904,19 +5126,6 @@ RKRo/x9J7Ry013cL/gAAAABJRU5ErkJggg==
-
- -
Out[14]:
- - - - -
-
<matplotlib.axes._subplots.AxesSubplot at 0xd1f2828>
-
- -
-
@@ -4925,687 +5134,767 @@ RKRo/x9J7Ry013cL/gAAAABJRU5ErkJggg==
-
@@ -5615,6 +5904,64 @@ KjQVRVEURVGUQPj/AQny1vTwN90tAAAAAElFTkSuQmCC
+
+
+
+
In [45]:
+
+
+
!jupyter nbconvert Exploratory_Charts-Movie_Data-Latest_a.ipynb --template basic
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
[NbConvertApp] Converting notebook Exploratory_Charts-Movie_Data-Latest_a.ipynb to html
+[NbConvertApp] Writing 412667 bytes to Exploratory_Charts-Movie_Data-Latest_a.html
+
+
+
+ +
+
+ +
+
+
+
In [11]:
+
+
+
Pf = pd.read_csv('C:/Users/alurus/GIT-Repository/VIZ Day/pyDataVizDay/data/raw/movie_metadata.csv')
+
+ +
+
+
+ +
+
+
+
In [15]:
+
+
+
Pf.head()[:5];
+
+ +
+
+
+
diff --git a/src/templates/Exploritory.html b/src/templates/Exploritory.html index 4ffd3b3..3475ace 100644 --- a/src/templates/Exploritory.html +++ b/src/templates/Exploritory.html @@ -14,7 +14,9 @@
-
+
+ +
{{ body | safe }}
@@ -22,7 +24,8 @@
- + {% endblock %} diff --git a/src/templates/index.md b/src/templates/index.md index 5392ff9..d3beb08 100644 --- a/src/templates/index.md +++ b/src/templates/index.md @@ -10,7 +10,7 @@ This site is being used for a capability review of various visualization tools, ## Pages -### [Exploritory](/exploritory) +### [Exploritory](/Exploritory) This page is python's bread and butter in data science today. It is able to pull in nearly any type of data set imaginable, transform, aggregate, and plot very quickly. This page was built using a [jupyter](jupyter.org) notebook. This is a very powerful tool that allows us to do interactive reproducible data science with all of our data, aggregations, visualizations, and slides all in one place. @@ -29,23 +29,50 @@ app = Flask(__name__) @app.route('/') def index(): + """ + Index page built by converting index.md to html, then inserting that into the index.html template + + go to your base_url + / to view this page + """ body = markdown.markdown(render_template('index.md'), extensions=['markdown.extensions.fenced_code']) return render_template('index.html', body=body) if __name__ == '__main__': + # if this is run as the main program, run the app + # if the program is imported it will not run allowing us to reuse some components in other projects easily. app.run() - ``` +#### Javascript + +For the Enthusiast page we need to implement some javascript in order to update the page dynamically without a page refresh. We need to implement a bit of javascript that will talk to our api and change the information on the page. Currently javascript is the only client side scripting language for the browser. There are a number of languages that can be transpiled into browser ready javascript, but javascript is the only language that runs in the browser. + +I believe that modern javascript (ES6) is a much better language. It looks much closer to python than older javascript. 
Currently ES6 does not run in many browsers natively and needs to be compiled down to browser ready javascript. I am trying to keep this project very simple, and did not want a complicated build tool chain. If you are using javascript more often, or have a more complicated use case do yourself a favor and look into setting up build tools that compile ES6 to browser ready javascript. For this reason I have chosen to use jquery. It provides a simple interface into the features that I need, and runs natively in the browser. + +```javascript +$('#top').change(function(){update_kpi()}) +// ties the update function to the top div, and triggers an update any time there is a change in the top div that contains our form data + +function update_kpi() +{ + var url = '/api/score_timeseries?' + $('top').val() + var kpi_data = $.get(url); // gets the json response data from our python api + kpi_data.done(function(results) // waits for the response to come back from python + { + $('#gross').html(results.responseJSON.gross) // updates the gross kpi that sits in the gross div + }) +} +``` + ### [Slides](/slides) Since we are serving up a web app we can embrace the power and flexibility that this gives us. The slides for this event will be served along side of the visualization. This will make these slides available anywhere you have a connection to the web. These slides were written in markdown, and are very simple to write.

#### Example Markdown Code for Slides +This file will be placed into a slides.html template that has been set up to render reveal slides from markdown. The reveal.js web page has examples of how to set up the html. It will render typical markdown code as html, and will create a new slide at every ```----``` and a new fragment at every ```---``` ``` markdown - # pyDataVizDay *a python implementation for Data Viz Day* @@ -62,7 +89,6 @@ Since we are serving up a web app we can embrace the power and flexibility that * Tools Used * Other Considerations * Pros/Cons - ``` ### [api](/api/doc/#/pyDataVizDay) diff --git a/src/templates/slides.md b/src/templates/slides.md index 8bb578b..d8ae9b8 100644 --- a/src/templates/slides.md +++ b/src/templates/slides.md @@ -1,4 +1,5 @@ # pyDataVizDay +--- *a python implementation for Data Viz Day* ![python](https://s3.amazonaws.com/files.dezyre.com/images/blog/Python+for+Data+Science+vs.+Python+for+Web+Development/Python+for+Data+Science+vs+Web+Devlopment.png) @@ -6,10 +7,9 @@ ---- # Agenda - -1. Viz Walk (3 Views) +--- +1. Viz Walk (2 Views) 1. Full Web App - * Simple Web App * Exploritory Notebook * Tools Used * Other Considerations @@ -17,20 +17,18 @@ ---- -## About Me - +# About Me +--- ![profile](/static/profile_photo_sm.jpg) Waylon Walker Product Engineering - - ---- # Open The Viz - +--- [pydatavizday.herokuapp.com](pydatavizday.herokuapp.com) --- @@ -41,27 +39,63 @@ Product Engineering ---- -## Stack for this viz +# External Resources -* Python - * pandas - * flask -* javascript - * C3 - * reveal - * jquery - * jqcloud -* HTML - * Bootstrap +--- +### Enthusiast +
+
+
+ +

Python

+
    +
  • pandas
  • +
  • flask
  • +
  • flask_restplus
  • +
  • markdown
  • +
+
+
+ +

javascript

+
    +
  • C3
  • +
  • reveal
  • +
  • jquery
  • +
  • jqcloud
  • +
+
+
+ +

HTML

+
    +
  • Bootstrap
  • +
      +
+
+
+ +--- + +### Exploritory + +Python + * jupyter + * seaborn + * numpy + * pandas + * scikit-learn ---- ## Other Considerations - +--- * Jupyter Notebooks * Jupyter Dashboards * DASH (just released in mid JUNE) +* Bokeh +* Datashader --- @@ -93,7 +127,7 @@ Product Engineering ---- ## Pros of Python - +--- * Fast High Level Data Science * reusable * Powerful Web stack @@ -110,20 +144,17 @@ python has a vast ecosystem for data wrangling ``` python import pandas as pd -import glob, os -path = "C:/reports" -files = glob.glob(path + os.sep + '*_report.csv*') +raw_example = pd.read_csv('example_data.csv') -frames = [] -for file in files: - frames.append(pd.read_csv(file)) - -all_reports = (pd.concat(frames) - .dropna() - .query('DIVISION == ACCOUNTING') - ) +example = (raw_example + .groupby(['Date']) + .sum() + .resample('m') + .fillna(0) + ) +example.plot() ``` --- @@ -144,19 +175,23 @@ data.update() ### Testing -The ability to easily reuse code/datasets/plot gives us the ability to spend time making large projects more . - +Well-written tests give us the confidence to push to production without manually spending our own time testing each feature of our end product. 
``` python +import unittest +import etl + class Testdata(unittest.TestCase): """ Test suite for my dataframe data """ + def setUp(self): + self.data = etl.Data() + important_cols = ['DATE', 'PRODUCT', 'QTY'] def test_cols(self): for col in important_cols: - self.assertLess(len(data[data[col].isnull()]), 0, msg=f'column {col} has unexpected null values') - self.assertIn(col, data.columns.tolist(), msg=f'column {col} is missing - check the /data/raw/shipments.csv file to ensure logistics has not changed the data format') + self.assertIn(col, sdata.columns.tolist()) ``` @@ -195,10 +230,9 @@ else: ---- ## Cons of python - +--- * Code * Interactivity -* Speed * ML research --- @@ -212,12 +246,6 @@ else: --- -### Slow - -*slow runtime compared to statically typed languages (c, java)* - ---- - ### Latest ML algorithms are typically developed in R ---