def filter(self, start_year=None, end_year=None,
           genre=None, country=None, language=None,
           top=None, title=None, color=None):
    """
    Return a filtered copy of this data set, best rated movies first.

    Every argument is optional; a falsy value (``None``, ``''``, ``0``)
    disables that filter.  All active filters are combined with AND and
    all text filters are exact matches.  ``self`` is left untouched.

    :param start_year: keep rows whose ``title_year`` is later than
        January 1st of the year *before* ``start_year`` (int or numeric str)
    :param end_year: keep rows whose ``title_year`` is on or before
        January 1st of ``end_year`` (int or numeric str)
    :param genre: keep movies that have a row in ``self.genre`` whose
        ``genres`` value equals this
    :param country: required value of the ``country`` column
    :param language: required value of the ``language`` column
    :param top: keep only the first ``int(top)`` rows after sorting by
        ``imdb_score`` descending
    :param title: required value of the ``movie_title`` column
    :param color: required value of the ``color`` column
    :returns: a new object of the same class whose ``movie``, ``genre`` and
        ``keyword`` frames are restricted to the selected movies
    :raises ValueError: if ``start_year``, ``end_year`` or ``top`` is given
        but cannot be converted with ``int()``
    """
    # The constructor copies movie/genre/keyword from an existing instance,
    # so reassigning the attributes below never touches ``self``.
    data = type(self)(self)
    movie = data.movie

    # Year bounds are compared against date strings.
    # NOTE(review): this presumes title_year is a datetime-like column
    # holding January 1st of the release year -- confirm against load().
    if start_year:
        movie = movie[movie.title_year > f'{int(start_year) - 1}-01-01']
    if end_year:
        movie = movie[movie.title_year <= f'{int(end_year)}-01-01']

    if genre:
        # genre rows point back at movies through their 'index' column
        genre_ids = data.genre.loc[data.genre.genres == genre, 'index']
        movie = movie[movie.index.isin(genre_ids)]

    exact_matches = (
        ('country', country),
        ('language', language),
        ('movie_title', title),
        ('color', color),
    )
    for column, wanted in exact_matches:
        if wanted:
            movie = movie[movie[column] == wanted]

    movie = movie.sort_values('imdb_score', ascending=False)
    if top:
        movie = movie.head(int(top))

    # Restrict the child tables to the movies that survived.
    kept = movie.index
    data.movie = movie
    data.genre = data.genre[data.genre['index'].isin(kept)]
    data.keyword = data.keyword[data.keyword['index'].isin(kept)]

    return data
# Flask application with the REST API mounted under /api through a blueprint;
# the interactive API documentation is served at the blueprint's /doc/ path.
app = Flask(__name__)
api_blueprint = Blueprint('api', __name__, url_prefix='/api')
api = Api(
    api_blueprint,
    title='pyDataVizday api',
    description='This api is used for the pyDataVizDay visualization',
    doc='/doc/',
)
app.register_blueprint(api_blueprint)

# Optional request arguments as (name, help text).  The names are exactly the
# keyword arguments accepted by etl.Data.filter.
_QUERY_ARGUMENTS = (
    ('start_year', 'start date for data'),
    ('end_year', 'end date for data'),
    ('genre', 'movie genre'),
    ('country', 'geographical country location'),
    ('language', 'language of the movie (ex. english)'),
    ('top', 'top n titles by imdb rating'),
    ('title', 'title of the movie'),
    ('color', '"Color" or "Black and White"'),
)

parser = reqparse.RequestParser()
for _arg_name, _arg_help in _QUERY_ARGUMENTS:
    parser.add_argument(_arg_name, help=_arg_help, required=False)

# Data set shared by every request handler in this module.
data = etl.Data()


# GET /api/keywords -> JSON list of {'text': <keyword>, 'weight': <count>}
# for the 50 most frequent plot keywords among the movies selected by the
# request arguments.
# Documented with comments on purpose: flask-restplus reads Resource
# docstrings into the generated API documentation, so adding docstrings
# would change what the service publishes.
@api.route('/keywords')
@api.expect(parser)
class keywords(Resource):
    def get(self):
        args = parser.parse_args()
        # forward every declared argument to the filter by keyword
        criteria = {name: args[name] for name, _ in _QUERY_ARGUMENTS}
        selection = data.filter(**criteria)

        counts = Counter(selection.keyword.plot_keywords.values.tolist())
        words = [
            {'text': text, 'weight': weight}
            for text, weight in counts.most_common(50)
        ]

        return jsonify(words)