# Reconstructed post-patch content of src/etl.py, recovered from a
# whitespace-collapsed git diff.  The same patch also:
#   * adds a README.md time-log entry -- "**45 min** update etl":
#       - split keyword/genre into rows
#       - define update method
#       - docstrings
#       - add keyword/genre to load method
#   * adds the binary files data/processed/genre.pkl,
#     data/processed/keyword.pkl and data/processed/movie.pkl.
# The names `os`, `pd` and `settings` are used below but imported outside the
# patch hunk (file-level imports are not visible here).


class Data(object):
    # NOTE(review): only the members touched by the patch hunk are shown.
    # The hunk starts mid-class (its first context line is the `return value`
    # tail of a preceding method), so earlier members such as __init__ and
    # __str__ are presumably defined above -- confirm against the full file.

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()

    def load(self):
        """
        loads/reloads data. Can be called to update data without redefining
        a new data object.

        Sets ``self.movie``, ``self.genre`` and ``self.keyword`` from the
        pickles stored in ``settings.processed_data_dir``.
        """
        # NOTE: unpickling is only safe for trusted files; these are the
        # files written by update_data().
        processed_dir = settings.processed_data_dir
        self.movie = pd.read_pickle(os.path.join(processed_dir, 'movie.pkl'))
        self.genre = pd.read_pickle(os.path.join(processed_dir, 'genre.pkl'))
        self.keyword = pd.read_pickle(os.path.join(processed_dir, 'keyword.pkl'))

    def update_data(self):
        """
        creates processed data sets from raw data sets

        Reads ``movie_metadata.csv`` from ``settings.raw_data_dir``, adds the
        ``net`` and ``profitable`` columns, converts ``title_year`` to a
        timestamp (January 1st of that year) and writes ``movie.pkl``,
        ``genre.pkl`` and ``keyword.pkl`` to ``settings.processed_data_dir``.

        This method only needs to be run when the raw dataset gets updated.
        It does not touch the attributes of this object; call ``load()``
        afterwards to pick up the regenerated files.
        """
        movie = pd.read_csv(os.path.join(settings.raw_data_dir, 'movie_metadata.csv'))
        movie['net'] = movie['gross'] - movie['budget']
        # 1 when gross exceeds budget, otherwise 0.  A missing net also gives
        # 0 because the comparison NaN > 0 is False.
        movie['profitable'] = (movie['net'] > 0).astype('int64')
        movie['title_year'] = pd.to_datetime(
            {'year': movie['title_year'], 'month': 1, 'day': 1})

        processed_dir = settings.processed_data_dir
        movie.to_pickle(os.path.join(processed_dir, 'movie.pkl'))
        generate_genre(movie).to_pickle(os.path.join(processed_dir, 'genre.pkl'))
        generate_keyword(movie).to_pickle(os.path.join(processed_dir, 'keyword.pkl'))


def _split_column_into_rows(movie, column, sep='|'):
    """
    explodes a delimiter-separated text column into one row per item

    movie: DataFrame of movie Data
    column: name of the column holding ``sep``-separated text
    sep: separator between the items of one cell
    returns: DataFrame with columns ``index`` (label of the originating
             row in ``movie``) and ``column`` (one item per row), numbered
             0..n-1.  Missing cells are treated as an empty string and
             therefore yield a single row with an empty item.
    """
    labels = []
    items = []
    for label, cell in zip(movie.index, movie[column].fillna('')):
        # str() keeps a stray non-text cell from aborting the whole run.
        parts = str(cell).split(sep)
        labels.extend([label] * len(parts))
        items.extend(parts)
    # Passing `columns` fixes the column order and gives a well-formed
    # (empty) frame when `movie` has no rows.
    return pd.DataFrame({'index': labels, column: items},
                        columns=['index', column])


def generate_genre(movie):
    """
    splits genres into rows

    movie: DataFrame of movie Data (needs a ``genres`` column of
           '|'-separated genre names)
    returns: returns DataFrame of index and genre
    """
    return _split_column_into_rows(movie, 'genres')


def generate_keyword(movie):
    """
    splits keywords into rows

    movie: DataFrame of movie Data (needs a ``plot_keywords`` column of
           '|'-separated keywords)
    returns: returns DataFrame of index and keyword; a movie without
             keywords contributes one row with an empty keyword
    """
    return _split_column_into_rows(movie, 'plot_keywords')