updated(etl)
uses processed pickle files for performance. Splits genre and keyword into separate rows.
This commit is contained in:
parent
2ecdb43007
commit
84084f3b96
5 changed files with 72 additions and 1 deletions
|
|
@ -69,4 +69,9 @@ This is the wireframe that the team has been given to replicate in python using
|
||||||
* **10 min** setup heroku repo and build environment
|
* **10 min** setup heroku repo and build environment
|
||||||
* **30 min** debug initial heroku deployment
|
* **30 min** debug initial heroku deployment
|
||||||
* **15 min** added reveal slide template
|
* **15 min** added reveal slide template
|
||||||
|
* **45 min** update etl
|
||||||
|
* split keyword/genre into rows
|
||||||
|
* define update method
|
||||||
|
* docstrings
|
||||||
|
* add keyword/genre to load method
|
||||||
|
|
||||||
|
|
|
||||||
BIN
data/processed/genre.pkl
Normal file
BIN
data/processed/genre.pkl
Normal file
Binary file not shown.
BIN
data/processed/keyword.pkl
Normal file
BIN
data/processed/keyword.pkl
Normal file
Binary file not shown.
BIN
data/processed/movie.pkl
Normal file
BIN
data/processed/movie.pkl
Normal file
Binary file not shown.
68
src/etl.py
68
src/etl.py
|
|
@ -26,10 +26,76 @@ class Data(object):
|
||||||
|
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
def __repr__(self):
    """Return the same text that ``__str__`` produces for this object."""
    text = self.__str__()
    return text
|
||||||
|
|
||||||
def load(self):
    """
    Load (or reload) the processed data sets onto this object.

    Reads ``movie.pkl``, ``genre.pkl`` and ``keyword.pkl`` from
    ``settings.processed_data_dir`` and stores them as ``self.movie``,
    ``self.genre`` and ``self.keyword``. Can be called again to refresh
    the data without creating a new data object.
    """
    processed_dir = settings.processed_data_dir
    # Attribute names double as the pickle file stems.
    for name in ('movie', 'genre', 'keyword'):
        pickle_path = os.path.join(processed_dir, name + '.pkl')
        setattr(self, name, pd.read_pickle(pickle_path))
|
||||||
|
|
||||||
|
def update_data(self):
    """
    Build the processed data sets from the raw data set.

    Reads ``movie_metadata.csv`` from ``settings.raw_data_dir``, adds the
    derived ``net`` and ``profitable`` columns, converts ``title_year`` to
    a timestamp on January 1st of that year, and writes ``movie.pkl``,
    ``genre.pkl`` and ``keyword.pkl`` to ``settings.processed_data_dir``.

    This method only needs to be run when the raw data set is updated.
    """
    raw_path = os.path.join(settings.raw_data_dir, 'movie_metadata.csv')
    out_dir = settings.processed_data_dir

    movie = pd.read_csv(raw_path)

    # Net result and a 0/1 flag marking strictly positive net.
    movie['net'] = movie['gross'] - movie['budget']
    movie['profitable'] = (movie['net'] > 0).astype('int64')

    # Assemble a datetime from the year, pinned to January 1st.
    movie['title_year'] = pd.to_datetime(
        {'year': movie['title_year'], 'month': 1, 'day': 1}
    )
    movie.to_pickle(os.path.join(out_dir, 'movie.pkl'))

    # One row per (movie index, genre).
    generate_genre(movie).to_pickle(os.path.join(out_dir, 'genre.pkl'))

    # One row per (movie index, keyword).
    generate_keyword(movie).to_pickle(os.path.join(out_dir, 'keyword.pkl'))
|
||||||
|
|
||||||
|
|
||||||
|
def generate_genre(movie):
    """
    Split the pipe-separated ``genres`` column into one row per genre.

    movie: DataFrame of movie data with a ``genres`` column whose values
        look like ``'Action|Comedy'``. The frame's index is exposed via
        ``reset_index()`` and is expected to appear as a column named
        ``index`` (i.e. an unnamed index).
    returns: DataFrame with columns ``index`` (the movie's index value)
        and ``genres`` (a single genre), one row per movie/genre pair,
        with a fresh 0..n-1 row index.

    Rows whose ``genres`` value is missing (NaN/None) contribute no output
    rows instead of raising. An empty input yields an empty frame with the
    same two columns.
    """
    genres = movie.reset_index()[['index', 'genres']]

    # Collect flat lists and build a single DataFrame at the end; this
    # avoids one temporary DataFrame per movie and works for empty input
    # (pd.concat raises on an empty list of frames).
    indices = []
    values = []
    for index, raw in zip(genres['index'], genres['genres']):
        if pd.isna(raw):
            # No genre information for this movie.
            continue
        parts = str(raw).split('|')
        indices.extend([index] * len(parts))
        values.extend(parts)

    return pd.DataFrame(
        {'index': indices, 'genres': values},
        columns=['index', 'genres'],
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def generate_keyword(movie):
    """
    Split the pipe-separated ``plot_keywords`` column into one row per keyword.

    movie: DataFrame of movie data with a ``plot_keywords`` column whose
        values look like ``'hero|villain'``. The frame's index is exposed
        via ``reset_index()`` and is expected to appear as a column named
        ``index`` (i.e. an unnamed index).
    returns: DataFrame with columns ``index`` (the movie's index value)
        and ``plot_keywords`` (a single keyword), one row per
        movie/keyword pair, with a fresh 0..n-1 row index.

    Missing keyword values are replaced by ``''`` before splitting, so such
    a movie yields exactly one row with an empty keyword. Non-string values
    are converted with ``str()`` before splitting. An empty input yields an
    empty frame with the same two columns.
    """
    keywords = movie.reset_index()[['index', 'plot_keywords']].fillna('')

    # Collect flat lists and build a single DataFrame at the end; this
    # avoids one temporary DataFrame per movie and works for empty input
    # (pd.concat raises on an empty list of frames).
    indices = []
    values = []
    for index, raw in zip(keywords['index'], keywords['plot_keywords']):
        # str() guards against non-string cells, so no row can silently
        # inherit the previous row's keywords.
        parts = str(raw).split('|')
        indices.extend([index] * len(parts))
        values.extend(parts)

    return pd.DataFrame(
        {'index': indices, 'plot_keywords': values},
        columns=['index', 'plot_keywords'],
    )
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue