updated(etl)
Uses processed pickle files for performance. Splits genre and keyword values into separate rows.
This commit is contained in:
parent
2ecdb43007
commit
84084f3b96
5 changed files with 72 additions and 1 deletions
|
|
@ -69,4 +69,9 @@ This is the wireframe that the team has been given to replicate in python using
|
|||
* **10 min** setup heroku repo and build environment
|
||||
* **30 min** debug initial heroku deployment
|
||||
* **15 min** added reveal slide template
|
||||
* **45 min** update etl
|
||||
* split keyword/genre into rows
|
||||
* define update method
|
||||
* docstrings
|
||||
* add keyword/genre to load method
|
||||
|
||||
|
|
|
|||
BIN
data/processed/genre.pkl
Normal file
BIN
data/processed/genre.pkl
Normal file
Binary file not shown.
BIN
data/processed/keyword.pkl
Normal file
BIN
data/processed/keyword.pkl
Normal file
Binary file not shown.
BIN
data/processed/movie.pkl
Normal file
BIN
data/processed/movie.pkl
Normal file
Binary file not shown.
68
src/etl.py
68
src/etl.py
|
|
@ -26,10 +26,76 @@ class Data(object):
|
|||
|
||||
return value
|
||||
|
||||
def __repr__(self):
    """Mirror the human-readable form: the repr is the same text as str()."""
    return str(self)
|
||||
|
||||
def load(self):
    """
    Load/reload the processed data sets from pickle files.

    Can be called on an existing instance to refresh ``self.movie``,
    ``self.genre`` and ``self.keyword`` without constructing a new Data
    object. Reading the pre-built pickles (written by ``update_data``)
    avoids re-parsing the raw CSV on every load.
    """
    # NOTE(review): removed the redundant read_csv of the raw CSV that
    # immediately got overwritten by the pickle load below — it was dead
    # work and defeated the point of caching processed pickles.
    self.movie = pd.read_pickle(os.path.join(settings.processed_data_dir, 'movie.pkl'))
    self.genre = pd.read_pickle(os.path.join(settings.processed_data_dir, 'genre.pkl'))
    self.keyword = pd.read_pickle(os.path.join(settings.processed_data_dir, 'keyword.pkl'))
|
||||
|
||||
def update_data(self):
    """
    Rebuild the processed pickle files from the raw CSV.

    This only needs to be run when the raw dataset is updated; normal
    startup should use ``load``, which reads the pickles written here.
    """
    raw = pd.read_csv(os.path.join(settings.raw_data_dir, 'movie_metadata.csv'))

    # Derived columns: net earnings and a 0/1 profitability flag.
    raw['net'] = raw['gross'] - raw['budget']
    raw['profitable'] = 0
    raw.loc[raw['net'] > 0, 'profitable'] = 1

    # Promote the bare release year to a proper datetime (Jan 1 of that year).
    raw.title_year = pd.to_datetime({'year': raw.title_year, 'month': 1, 'day': 1})
    raw.to_pickle(os.path.join(settings.processed_data_dir, 'movie.pkl'))

    # Exploded one-row-per-value tables, persisted alongside the movie table.
    generate_genre(raw).to_pickle(os.path.join(settings.processed_data_dir, 'genre.pkl'))
    generate_keyword(raw).to_pickle(os.path.join(settings.processed_data_dir, 'keyword.pkl'))
|
||||
|
||||
|
||||
def generate_genre(movie):
    """
    Split the pipe-delimited ``genres`` column into one row per genre.

    movie: DataFrame of movie data containing a 'genres' column
    returns: DataFrame with columns ['index', 'genres'] — one row per
        (movie index, genre) pair; the index column repeats the movie's
        positional index for each of its genres.
    """
    # fillna('') mirrors generate_keyword: a missing genre yields a single
    # empty-string row instead of crashing on NaN.split.
    genres = movie.reset_index()[['index', 'genres']].fillna('')

    # Accumulate plain lists and build ONE DataFrame at the end — the old
    # per-row DataFrame + pd.concat pattern allocated a frame per movie.
    indices = []
    values = []
    for _, row in genres.iterrows():
        for g in row['genres'].split('|'):
            indices.append(row['index'])
            values.append(g)

    return pd.DataFrame({'index': indices, 'genres': values})[['index', 'genres']]
|
||||
|
||||
|
||||
|
||||
def generate_keyword(movie):
|
||||
"""
|
||||
splits keywords into rows
|
||||
|
||||
movie: DataFrame of movie Data
|
||||
returns: returns DataFrame of index and keyword
|
||||
"""
|
||||
|
||||
keywords = movie.reset_index()[['index', 'plot_keywords']].fillna('')
|
||||
frames = list()
|
||||
for row in keywords.iterrows():
|
||||
try:
|
||||
row_keywords = row[1].plot_keywords.split('|')
|
||||
except:
|
||||
print(row[1].plot_keywords)
|
||||
index = row[1]['index']
|
||||
frames.append(pd.DataFrame({'index':[index]*len(row_keywords), 'plot_keywords': row_keywords}))
|
||||
|
||||
keyword = pd.concat(frames).reset_index(drop=True)[['index', 'plot_keywords']]
|
||||
return keyword
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue