commit 24e604ab32b4d20ed4b5713e26d73b78a3acf1f7 Author: WaylonWalker Date: Fri Feb 28 08:16:44 2020 -0600 init diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..003c131 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,6 @@ +[report] +fail_under=0 +show_missing=True +exclude_lines = + pragma: no cover + raise NotImplementedError diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dd4627a --- /dev/null +++ b/.gitignore @@ -0,0 +1,157 @@ +########################## +# KEDRO PROJECT + +# ignore all local configuration +conf/local/** +!conf/local/.gitkeep + +# ignore potentially sensitive credentials files +conf/**/*credentials* + +# ignore everything in the following folders +data/** +logs/** + +# except their sub-folders +!data/**/ +!logs/**/ + +# also keep all .gitkeep files +!.gitkeep + +# keep also the example dataset +!data/01_raw/iris.csv + + +########################## +# Common files + +# IntelliJ +.idea/ +*.iml +out/ +.idea_modules/ + +### macOS +*.DS_Store +.AppleDouble +.LSOverride +.Trashes + +# Vim +*~ +.*.swo +.*.swp + +# emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc + +# JIRA plugin +atlassian-ide-plugin.xml + +# C extensions +*.so + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +.ipython/profile_default/history.sqlite +.ipython/profile_default/startup/README + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/.ipython/profile_default/startup/00-kedro-init.py b/.ipython/profile_default/startup/00-kedro-init.py new file mode 100644 index 0000000..259e5e5 --- /dev/null +++ b/.ipython/profile_default/startup/00-kedro-init.py @@ -0,0 +1,58 @@ +import logging.config +import os +import sys +from pathlib import Path + +from IPython.core.magic import register_line_magic + +# Find the project root (./../../../) +startup_error = None +project_path = Path(__file__).parents[3].resolve() + + +@register_line_magic +def reload_kedro(path, line=None): + """"Line magic which reloads all Kedro default variables.""" + global startup_error + global context + global catalog + + try: + import kedro.config.default_logger + from kedro.context import KEDRO_ENV_VAR, load_context + from kedro.cli.jupyter import collect_line_magic + except ImportError: + logging.error( + "Kedro appears not to be installed in your current environment " + "or your current IPython session was not started in a valid Kedro project." 
+ ) + raise + + try: + path = path or project_path + logging.debug("Loading the context from %s", str(path)) + + context = load_context(path, env=os.getenv(KEDRO_ENV_VAR)) + catalog = context.catalog + + # remove cached user modules + package_name = context.__module__.split(".")[0] + to_remove = [mod for mod in sys.modules if mod.startswith(package_name)] + for module in to_remove: + del sys.modules[module] + + logging.info("** Kedro project %s", str(context.project_name)) + logging.info("Defined global variable `context` and `catalog`") + + for line_magic in collect_line_magic(): + register_line_magic(line_magic) + logging.info("Registered line magic `%s`", line_magic.__name__) + except Exception as err: + startup_error = err + logging.exception( + "Kedro's ipython session startup script failed:\n%s", str(err) + ) + raise err + + +reload_kedro(project_path) diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..590d4b2 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,7 @@ +[settings] +multi_line_output=3 +include_trailing_comma=True +force_grid_wrap=0 +use_parentheses=True +line_length=88 +known_third_party=kedro diff --git a/.kedro.yml b/.kedro.yml new file mode 100644 index 0000000..488c7d4 --- /dev/null +++ b/.kedro.yml @@ -0,0 +1 @@ +context_path: default_kedro_157.run.ProjectContext diff --git a/README.md b/README.md new file mode 100644 index 0000000..48a6aee --- /dev/null +++ b/README.md @@ -0,0 +1,137 @@ +# Default Kedro 157 + +## Overview + +This is your new Kedro project, which was generated using `Kedro 0.15.7` by running: + +``` +kedro new +``` + +Take a look at the [documentation](https://kedro.readthedocs.io) to get started. + +## Rules and guidelines + +In order to get the best out of the template: + * Please don't remove any lines from the `.gitignore` file provided + * Make sure your results can be reproduced by following a data engineering convention, e.g. 
the one we suggest [here](https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention) + * Don't commit any data to your repository + * Don't commit any credentials or local configuration to your repository + * Keep all credentials or local configuration in `conf/local/` + +## Installing dependencies + +Dependencies should be declared in `src/requirements.txt` for pip installation and `src/environment.yml` for conda installation. + +To install them, run: + +``` +kedro install +``` + +## Running Kedro + +You can run your Kedro project with: + +``` +kedro run +``` + +## Testing Kedro + +Have a look at the file `src/tests/test_run.py` for instructions on how to write your tests. You can run your tests with the following command: + +``` +kedro test +``` + +To configure the coverage threshold, please have a look at the file `.coveragerc`. + + +### Working with Kedro from notebooks + +In order to use notebooks in your Kedro project, you need to install Jupyter: + +``` +pip install jupyter +``` + +For using Jupyter Lab, you need to install it: + +``` +pip install jupyterlab +``` + +After installing Jupyter, you can start a local notebook server: + +``` +kedro jupyter notebook +``` + +You can also start Jupyter Lab: + +``` +kedro jupyter lab +``` + +And if you want to run an IPython session: + +``` +kedro ipython +``` + +Running Jupyter or IPython this way provides the following variables in +scope: `context`, `catalog` and `startup_error`. + +#### Converting notebook cells to nodes in a Kedro project + +Once you are happy with a notebook, you may want to move your code over into the Kedro project structure for the next stage in your development. This is done through a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#cell-tags) and Kedro CLI commands. 
+ +By adding the `node` tag to a cell and running the command below, the cell's source code will be copied over to a Python file within `src/<package_name>/nodes/`. +``` +kedro jupyter convert <filepath_to_my_notebook> +``` +> *Note:* The name of the Python file matches the name of the original notebook. + +Alternatively, you may want to transform all your notebooks in one go. To this end, you can run the following command to convert all notebook files found in the project root directory and under any of its sub-folders. +``` +kedro jupyter convert --all +``` + +#### Ignoring notebook output cells in `git` + +In order to automatically strip out all output cell contents before committing to `git`, you can run `kedro activate-nbstripout`. This will add a hook in `.git/config` which will run `nbstripout` before anything is committed to `git`. + +> *Note:* Your output cells will be left intact locally. + +## Package the project + +In order to package the project's Python code in `.egg` and / or a `.wheel` file, you can run: + +``` +kedro package +``` + +After running that, you can find the two packages in `src/dist/`. + +## Building API documentation + +To build API docs for your code using Sphinx, run: + +``` +kedro build-docs +``` + +See your documentation by opening `docs/build/html/index.html`. + +## Building the project requirements + +To generate or update the dependency requirements for your project, run: + +``` +kedro build-reqs +``` + +This will copy the contents of `src/requirements.txt` into a new file `src/requirements.in` which will be used as the source for `pip-compile`. You can see the output of the resolution by opening `src/requirements.txt`. + +After this, if you'd like to update your project requirements, please update `src/requirements.in` and re-run `kedro build-reqs`. diff --git a/conf/README.md b/conf/README.md new file mode 100644 index 0000000..bc9d103 --- /dev/null +++ b/conf/README.md @@ -0,0 +1,26 @@ +# What is this for? 
+ +This folder should be used to store configuration files used by Kedro or by separate tools. + +This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the section titled **Instructions**. + +## Local configuration + +The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). + +> *Note:* Please do not check in any local configuration to version control. + +## Base configuration + +The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. + +WARNING: Please do not put access credentials in the base configuration folder. + +# Instructions + + + + + +# Find out more +You can find out more about configuration from the [user guide documentation](https://kedro.readthedocs.io/en/stable/04_user_guide/03_configuration.html). diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml new file mode 100644 index 0000000..0ecf6f2 --- /dev/null +++ b/conf/base/catalog.yml @@ -0,0 +1,50 @@ +# Here you can define all your data sets by using simple YAML syntax. 
+# +# Documentation for this file format can be found in "The Data Catalog" +# Link: https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html +# +# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS +# +# An example data set definition can look as follows: +# +#bikes: +# type: pandas.CSVDataSet +# filepath: "data/01_raw/bikes.csv" +# +#weather: +# type: spark.SparkDataSet +# filepath: s3a://your_bucket/data/01_raw/weather* +# file_format: csv +# credentials: dev_s3 +# load_args: +# header: True +# inferSchema: True +# save_args: +# sep: '|' +# header: True +# +#scooters: +# type: pandas.SQLTableDataSet +# credentials: scooters_credentials +# table_name: scooters +# load_args: +# index_col: ['name'] +# columns: ['name', 'gear'] +# save_args: +# if_exists: 'replace' +# # if_exists: 'fail' +# # if_exists: 'append' +# +# The Data Catalog supports being able to reference the same file using two different DataSet implementations +# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: +# https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html + +# +# This is a data set used by the "Hello World" example pipeline provided with the project +# template. Please feel free to remove it once you remove the example pipeline. 
+ +example_iris_data: + type: CSVLocalDataSet + filepath: data/01_raw/iris.csv + + diff --git a/conf/base/logging.yml b/conf/base/logging.yml new file mode 100644 index 0000000..3689418 --- /dev/null +++ b/conf/base/logging.yml @@ -0,0 +1,66 @@ +version: 1 +disable_existing_loggers: False +formatters: + simple: + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + json_formatter: + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + class: pythonjsonlogger.jsonlogger.JsonFormatter + +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + + info_file_handler: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: simple + filename: logs/info.log + maxBytes: 10485760 # 10MB + backupCount: 20 + encoding: utf8 + delay: True + + error_file_handler: + class: logging.handlers.RotatingFileHandler + level: ERROR + formatter: simple + filename: logs/errors.log + maxBytes: 10485760 # 10MB + backupCount: 20 + encoding: utf8 + delay: True + + journal_file_handler: + class: kedro.versioning.journal.JournalFileHandler + level: INFO + base_dir: logs/journals + formatter: json_formatter + +loggers: + anyconfig: + level: WARNING + handlers: [console, info_file_handler, error_file_handler] + propagate: no + + kedro.io: + level: INFO + handlers: [console, info_file_handler, error_file_handler] + propagate: no + + kedro.pipeline: + level: INFO + handlers: [console, info_file_handler, error_file_handler] + propagate: no + + kedro.journal: + level: INFO + handlers: [journal_file_handler] + propagate: no + +root: + level: INFO + handlers: [console, info_file_handler, error_file_handler] diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml new file mode 100644 index 0000000..9e9f8b6 --- /dev/null +++ b/conf/base/parameters.yml @@ -0,0 +1,8 @@ + +# Parameters for the example pipeline. 
Feel free to delete these once you +# remove the example pipeline from pipeline.py and the example nodes in +# `src/pipelines/` +example_test_data_ratio: 0.2 +example_num_train_iter: 10000 +example_learning_rate: 0.01 + diff --git a/conf/local/.gitkeep b/conf/local/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/01_raw/.gitkeep b/data/01_raw/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/01_raw/iris.csv b/data/01_raw/iris.csv new file mode 100644 index 0000000..ba0ebd2 --- /dev/null +++ b/data/01_raw/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,0.2,setosa +4.9,3.0,1.4,0.2,setosa +4.7,3.2,1.3,0.2,setosa +4.6,3.1,1.5,0.2,setosa +5.0,3.6,1.4,0.2,setosa +5.4,3.9,1.7,0.4,setosa +4.6,3.4,1.4,0.3,setosa +5.0,3.4,1.5,0.2,setosa +4.4,2.9,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.4,3.7,1.5,0.2,setosa +4.8,3.4,1.6,0.2,setosa +4.8,3.0,1.4,0.1,setosa +4.3,3.0,1.1,0.1,setosa +5.8,4.0,1.2,0.2,setosa +5.7,4.4,1.5,0.4,setosa +5.4,3.9,1.3,0.4,setosa +5.1,3.5,1.4,0.3,setosa +5.7,3.8,1.7,0.3,setosa +5.1,3.8,1.5,0.3,setosa +5.4,3.4,1.7,0.2,setosa +5.1,3.7,1.5,0.4,setosa +4.6,3.6,1.0,0.2,setosa +5.1,3.3,1.7,0.5,setosa +4.8,3.4,1.9,0.2,setosa +5.0,3.0,1.6,0.2,setosa +5.0,3.4,1.6,0.4,setosa +5.2,3.5,1.5,0.2,setosa +5.2,3.4,1.4,0.2,setosa +4.7,3.2,1.6,0.2,setosa +4.8,3.1,1.6,0.2,setosa +5.4,3.4,1.5,0.4,setosa +5.2,4.1,1.5,0.1,setosa +5.5,4.2,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.0,3.2,1.2,0.2,setosa +5.5,3.5,1.3,0.2,setosa +4.9,3.1,1.5,0.1,setosa +4.4,3.0,1.3,0.2,setosa +5.1,3.4,1.5,0.2,setosa +5.0,3.5,1.3,0.3,setosa +4.5,2.3,1.3,0.3,setosa +4.4,3.2,1.3,0.2,setosa +5.0,3.5,1.6,0.6,setosa +5.1,3.8,1.9,0.4,setosa +4.8,3.0,1.4,0.3,setosa +5.1,3.8,1.6,0.2,setosa +4.6,3.2,1.4,0.2,setosa +5.3,3.7,1.5,0.2,setosa +5.0,3.3,1.4,0.2,setosa +7.0,3.2,4.7,1.4,versicolor +6.4,3.2,4.5,1.5,versicolor +6.9,3.1,4.9,1.5,versicolor +5.5,2.3,4.0,1.3,versicolor +6.5,2.8,4.6,1.5,versicolor 
+5.7,2.8,4.5,1.3,versicolor +6.3,3.3,4.7,1.6,versicolor +4.9,2.4,3.3,1.0,versicolor +6.6,2.9,4.6,1.3,versicolor +5.2,2.7,3.9,1.4,versicolor +5.0,2.0,3.5,1.0,versicolor +5.9,3.0,4.2,1.5,versicolor +6.0,2.2,4.0,1.0,versicolor +6.1,2.9,4.7,1.4,versicolor +5.6,2.9,3.6,1.3,versicolor +6.7,3.1,4.4,1.4,versicolor +5.6,3.0,4.5,1.5,versicolor +5.8,2.7,4.1,1.0,versicolor +6.2,2.2,4.5,1.5,versicolor +5.6,2.5,3.9,1.1,versicolor +5.9,3.2,4.8,1.8,versicolor +6.1,2.8,4.0,1.3,versicolor +6.3,2.5,4.9,1.5,versicolor +6.1,2.8,4.7,1.2,versicolor +6.4,2.9,4.3,1.3,versicolor +6.6,3.0,4.4,1.4,versicolor +6.8,2.8,4.8,1.4,versicolor +6.7,3.0,5.0,1.7,versicolor +6.0,2.9,4.5,1.5,versicolor +5.7,2.6,3.5,1.0,versicolor +5.5,2.4,3.8,1.1,versicolor +5.5,2.4,3.7,1.0,versicolor +5.8,2.7,3.9,1.2,versicolor +6.0,2.7,5.1,1.6,versicolor +5.4,3.0,4.5,1.5,versicolor +6.0,3.4,4.5,1.6,versicolor +6.7,3.1,4.7,1.5,versicolor +6.3,2.3,4.4,1.3,versicolor +5.6,3.0,4.1,1.3,versicolor +5.5,2.5,4.0,1.3,versicolor +5.5,2.6,4.4,1.2,versicolor +6.1,3.0,4.6,1.4,versicolor +5.8,2.6,4.0,1.2,versicolor +5.0,2.3,3.3,1.0,versicolor +5.6,2.7,4.2,1.3,versicolor +5.7,3.0,4.2,1.2,versicolor +5.7,2.9,4.2,1.3,versicolor +6.2,2.9,4.3,1.3,versicolor +5.1,2.5,3.0,1.1,versicolor +5.7,2.8,4.1,1.3,versicolor +6.3,3.3,6.0,2.5,virginica +5.8,2.7,5.1,1.9,virginica +7.1,3.0,5.9,2.1,virginica +6.3,2.9,5.6,1.8,virginica +6.5,3.0,5.8,2.2,virginica +7.6,3.0,6.6,2.1,virginica +4.9,2.5,4.5,1.7,virginica +7.3,2.9,6.3,1.8,virginica +6.7,2.5,5.8,1.8,virginica +7.2,3.6,6.1,2.5,virginica +6.5,3.2,5.1,2.0,virginica +6.4,2.7,5.3,1.9,virginica +6.8,3.0,5.5,2.1,virginica +5.7,2.5,5.0,2.0,virginica +5.8,2.8,5.1,2.4,virginica +6.4,3.2,5.3,2.3,virginica +6.5,3.0,5.5,1.8,virginica +7.7,3.8,6.7,2.2,virginica +7.7,2.6,6.9,2.3,virginica +6.0,2.2,5.0,1.5,virginica +6.9,3.2,5.7,2.3,virginica +5.6,2.8,4.9,2.0,virginica +7.7,2.8,6.7,2.0,virginica +6.3,2.7,4.9,1.8,virginica +6.7,3.3,5.7,2.1,virginica +7.2,3.2,6.0,1.8,virginica +6.2,2.8,4.8,1.8,virginica 
+6.1,3.0,4.9,1.8,virginica +6.4,2.8,5.6,2.1,virginica +7.2,3.0,5.8,1.6,virginica +7.4,2.8,6.1,1.9,virginica +7.9,3.8,6.4,2.0,virginica +6.4,2.8,5.6,2.2,virginica +6.3,2.8,5.1,1.5,virginica +6.1,2.6,5.6,1.4,virginica +7.7,3.0,6.1,2.3,virginica +6.3,3.4,5.6,2.4,virginica +6.4,3.1,5.5,1.8,virginica +6.0,3.0,4.8,1.8,virginica +6.9,3.1,5.4,2.1,virginica +6.7,3.1,5.6,2.4,virginica +6.9,3.1,5.1,2.3,virginica +5.8,2.7,5.1,1.9,virginica +6.8,3.2,5.9,2.3,virginica +6.7,3.3,5.7,2.5,virginica +6.7,3.0,5.2,2.3,virginica +6.3,2.5,5.0,1.9,virginica +6.5,3.0,5.2,2.0,virginica +6.2,3.4,5.4,2.3,virginica +5.9,3.0,5.1,1.8,virginica diff --git a/data/02_intermediate/.gitkeep b/data/02_intermediate/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/03_primary/.gitkeep b/data/03_primary/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/04_features/.gitkeep b/data/04_features/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/05_model_input/.gitkeep b/data/05_model_input/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/06_models/.gitkeep b/data/06_models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/07_model_output/.gitkeep b/data/07_model_output/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/08_reporting/.gitkeep b/data/08_reporting/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..8b375f9 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +# default_kedro_157 documentation build +# configuration file, created by sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import re + +from kedro.cli.utils import find_stylesheets +from recommonmark.transform import AutoStructify + +from default_kedro_157 import __version__ as release + +# -- Project information ----------------------------------------------------- + +project = "default_kedro_157" +copyright = "2020, QuantumBlack Visual Analytics Limited" +author = "QuantumBlack" + +# The short X.Y version. +version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx.ext.doctest", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.mathjax", + "nbsphinx", + "recommonmark", + "sphinx_copybutton", +] + +# enable autosummary plugin (table of contents for modules/classes/class +# methods) +autosummary_generate = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = ["_build", "**.ipynb_checkpoints"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = {"collapse_navigation": False, "style_external_links": True} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +html_show_sourcelink = False + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "default_kedro_157doc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ( + master_doc, + "default_kedro_157.tex", + "default_kedro_157 Documentation", + "QuantumBlack", + "manual", + ) +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ( + master_doc, + "default_kedro_157", + "default_kedro_157 Documentation", + [author], + 1, + ) +] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "default_kedro_157", + "default_kedro_157 Documentation", + author, + "default_kedro_157", + "Project default_kedro_157 codebase.", + "Data-Science", + ) +] + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + +# -- Extension configuration ------------------------------------------------- + +# nbsphinx_prolog = """ +# see here for prolog/epilog details: +# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html +# """ + +# -- NBconvert kernel config ------------------------------------------------- +nbsphinx_kernel_name = "python3" + + +def remove_arrows_in_examples(lines): + for i, line in enumerate(lines): + lines[i] = line.replace(">>>", "") + + +def autodoc_process_docstring(app, what, name, obj, options, lines): + remove_arrows_in_examples(lines) + + +def skip(app, what, name, obj, skip, options): + if name == "__init__": + return False + return skip + + +def setup(app): + app.connect("autodoc-process-docstring", autodoc_process_docstring) + app.connect("autodoc-skip-member", skip) + # add Kedro stylesheets + for stylesheet in find_stylesheets(): + app.add_stylesheet(stylesheet) + # enable rendering RST tables in Markdown + app.add_config_value("recommonmark_config", {"enable_eval_rst": True}, True) + app.add_transform(AutoStructify) diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..f327240 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,19 @@ +.. default_kedro_157 documentation master file, created by sphinx-quickstart. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to project's default_kedro_157 API docs! +============================================= + +.. 
toctree:: + :maxdepth: 4 + + modules + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/kedro_cli.py b/kedro_cli.py new file mode 100644 index 0000000..6ced0dd --- /dev/null +++ b/kedro_cli.py @@ -0,0 +1,624 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Command line tools for manipulating a Kedro project. 
+Intended to be invoked via `kedro`.""" +import os +import re +import shutil +import subprocess +import sys +import webbrowser +from collections import Counter +from glob import iglob +from itertools import chain +from pathlib import Path +from typing import Any, Dict, Iterable, List, Tuple + +import anyconfig +import click +from click import secho, style +from kedro.cli import main as kedro_main +from kedro.cli.utils import ( + KedroCliError, + call, + export_nodes, + forward_command, + python_call, +) +from kedro.context import KEDRO_ENV_VAR, load_context +from kedro.runner import SequentialRunner +from kedro.utils import load_obj + +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + +# get our package onto the python path +PROJ_PATH = Path(__file__).resolve().parent +os.environ["IPYTHONDIR"] = str(PROJ_PATH / ".ipython") + + +NO_DEPENDENCY_MESSAGE = """{0} is not installed. Please make sure {0} is in +src/requirements.txt and run `kedro install`.""" + +TAG_ARG_HELP = """Construct the pipeline using only nodes which have this tag +attached. Option can be used multiple times, what results in a +pipeline constructed from nodes having any of those tags.""" + +PIPELINE_ARG_HELP = """Name of the modular pipeline to run. +If not set, the project pipeline is run by default.""" + +ENV_ARG_HELP = """Run the pipeline in a configured environment. If not specified, +pipeline will run using environment `local`.""" + +NODE_ARG_HELP = """Run only nodes with specified names.""" + +FROM_NODES_HELP = """A list of node names which should be used as a starting point.""" + +TO_NODES_HELP = """A list of node names which should be used as an end point.""" + +FROM_INPUTS_HELP = ( + """A list of dataset names which should be used as a starting point.""" +) + +PARALLEL_ARG_HELP = """Run the pipeline using the `ParallelRunner`. +If not specified, use the `SequentialRunner`. 
This flag cannot be used together +with --runner.""" + +OPEN_ARG_HELP = """Open the documentation in your default browser after building.""" + +RUNNER_ARG_HELP = """Specify a runner that you want to run the pipeline with. +This option cannot be used together with --parallel.""" + +CONVERT_ALL_HELP = """Extract the nodes from all notebooks in the Kedro project directory, +including sub-folders.""" + +OVERWRITE_HELP = """If Python file already exists for the equivalent notebook, +overwrite its contents.""" + +LOAD_VERSION_HELP = """Specify a particular dataset version (timestamp) for loading.""" + +CONFIG_FILE_HELP = """Specify a YAML configuration file to load the run +command arguments from. If command line arguments are provided, they will +override the loaded ones.""" + +PARAMS_ARG_HELP = """Specify extra parameters that you want to pass +to the context initializer. Items must be separated by comma, keys - by colon, +example: param1:value1,param2:value2. Each parameter is split by the first comma, +so parameter values are allowed to contain colons, parameter keys are not.""" + +JUPYTER_IP_HELP = "IP address of the Jupyter server." +JUPYTER_ALL_KERNELS_HELP = "Display all available Python kernels." +JUPYTER_IDLE_TIMEOUT_HELP = """When a notebook is closed, Jupyter server will +terminate its kernel after so many seconds of inactivity. 
This does not affect +any open notebooks.""" + + +def _split_string(ctx, param, value): + return [item for item in value.split(",") if item] + + +def _try_convert_to_numeric(value): + try: + value = float(value) + except ValueError: + return value + return int(value) if value.is_integer() else value + + +def _split_params(ctx, param, value): + if isinstance(value, dict): + return value + result = {} + for item in _split_string(ctx, param, value): + item = item.split(":", 1) + if len(item) != 2: + ctx.fail( + "Invalid format of `{}` option: Item `{}` must contain a key and " + "a value separated by `:`.".format(param.name, item[0]) + ) + key = item[0].strip() + if not key: + ctx.fail( + "Invalid format of `{}` option: Parameter key cannot be " + "an empty string.".format(param.name) + ) + value = item[1].strip() + result[key] = _try_convert_to_numeric(value) + return result + + +def _reformat_load_versions(ctx, param, value) -> Dict[str, str]: + """Reformat data structure from tuple to dictionary for `load-version`. + E.g ('dataset1:time1', 'dataset2:time2') -> {"dataset1": "time1", "dataset2": "time2"}. + """ + load_version_separator = ":" + load_versions_dict = {} + + for load_version in value: + load_version_list = load_version.split(load_version_separator, 1) + if len(load_version_list) != 2: + raise ValueError( + "Expected the form of `load_version` to be " + "`dataset_name:YYYY-MM-DDThh.mm.ss.sssZ`," + "found {} instead".format(load_version) + ) + load_versions_dict[load_version_list[0]] = load_version_list[1] + + return load_versions_dict + + +def _config_file_callback(ctx, param, value): + """Config file callback, that replaces command line options with config file + values. If command line options are passed, they override config file values. 
+ """ + ctx.default_map = ctx.default_map or {} + section = ctx.info_name + + if value: + config = anyconfig.load(value)[section] + ctx.default_map.update(config) + + return value + + +def _get_values_as_tuple(values: Iterable[str]) -> Tuple[str]: + return tuple(chain.from_iterable(value.split(",") for value in values)) + + +@click.group(context_settings=CONTEXT_SETTINGS, name=__file__) +def cli(): + """Command line tools for manipulating a Kedro project.""" + + +@cli.command() +@click.option( + "--from-inputs", type=str, default="", help=FROM_INPUTS_HELP, callback=_split_string +) +@click.option( + "--from-nodes", type=str, default="", help=FROM_NODES_HELP, callback=_split_string +) +@click.option( + "--to-nodes", type=str, default="", help=TO_NODES_HELP, callback=_split_string +) +@click.option("--node", "-n", "node_names", type=str, multiple=True, help=NODE_ARG_HELP) +@click.option( + "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP +) +@click.option("--parallel", "-p", is_flag=True, multiple=False, help=PARALLEL_ARG_HELP) +@click.option( + "--env", + "-e", + type=str, + default=None, + multiple=False, + envvar=KEDRO_ENV_VAR, + help=ENV_ARG_HELP, +) +@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP) +@click.option( + "--load-version", + "-lv", + type=str, + multiple=True, + help=LOAD_VERSION_HELP, + callback=_reformat_load_versions, +) +@click.option("--pipeline", type=str, default=None, help=PIPELINE_ARG_HELP) +@click.option( + "--config", + "-c", + type=click.Path(exists=True, dir_okay=False, resolve_path=True), + help=CONFIG_FILE_HELP, + callback=_config_file_callback, +) +@click.option( + "--params", type=str, default="", help=PARAMS_ARG_HELP, callback=_split_params +) +def run( + tag, + env, + parallel, + runner, + node_names, + to_nodes, + from_nodes, + from_inputs, + load_version, + pipeline, + config, + params, +): + """Run the pipeline.""" + if parallel and runner: + raise KedroCliError( + "Both 
--parallel and --runner options cannot be used together. " + "Please use either --parallel or --runner." + ) + if parallel: + runner = "ParallelRunner" + runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner + + tag = _get_values_as_tuple(tag) if tag else tag + node_names = _get_values_as_tuple(node_names) if node_names else node_names + + context = load_context(Path.cwd(), env=env, extra_params=params) + context.run( + tags=tag, + runner=runner_class(), + node_names=node_names, + from_nodes=from_nodes, + to_nodes=to_nodes, + from_inputs=from_inputs, + load_versions=load_version, + pipeline_name=pipeline, + ) + + +@forward_command(cli, forward_help=True) +def test(args): + """Run the test suite.""" + try: + import pytest # pylint: disable=unused-import + except ImportError: + raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("pytest")) + else: + python_call("pytest", args) + + +@cli.command() +@click.argument("files", type=click.Path(exists=True), nargs=-1) +def lint(files): + """Run flake8, isort and (on Python >=3.6) black.""" + # pylint: disable=unused-import + if not files: + files = ("src/tests", "src/default_kedro_157") + + try: + import flake8 + import isort + except ImportError as exc: + raise KedroCliError(NO_DEPENDENCY_MESSAGE.format(exc.name)) + + python_call("flake8", ("--max-line-length=88",) + files) + python_call("isort", ("-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=88") + files) + + if sys.version_info[:2] >= (3, 6): + try: + import black + except ImportError: + raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("black")) + python_call("black", files) + + +@cli.command() +def install(): + """Install project dependencies from both requirements.txt + and environment.yml (optional).""" + + if (Path.cwd() / "src" / "environment.yml").is_file(): + call(["conda", "install", "--file", "src/environment.yml", "--yes"]) + + pip_command = ["install", "-U", "-r", "src/requirements.txt"] + + if os.name == "posix": + python_call("pip", 
pip_command) + else: + command = [sys.executable, "-m", "pip"] + pip_command + subprocess.Popen(command, creationflags=subprocess.CREATE_NEW_CONSOLE) + + +@forward_command(cli, forward_help=True) +def ipython(args): + """Open IPython with project specific variables loaded.""" + if "-h" not in args and "--help" not in args: + ipython_message() + call(["ipython"] + list(args)) + + +@cli.command() +def package(): + """Package the project as a Python egg and wheel.""" + call([sys.executable, "setup.py", "clean", "--all", "bdist_egg"], cwd="src") + call([sys.executable, "setup.py", "clean", "--all", "bdist_wheel"], cwd="src") + + +@cli.command("build-docs") +@click.option( + "--open", + "-o", + "open_docs", + is_flag=True, + multiple=False, + default=False, + help=OPEN_ARG_HELP, +) +def build_docs(open_docs): + """Build the project documentation.""" + python_call("pip", ["install", "src/[docs]"]) + python_call("pip", ["install", "-r", "src/requirements.txt"]) + python_call( + "ipykernel", ["install", "--user", "--name=default_kedro_157"] + ) + shutil.rmtree("docs/build", ignore_errors=True) + call( + [ + "sphinx-apidoc", + "--module-first", + "-o", + "docs/source", + "src/default_kedro_157", + ] + ) + call(["sphinx-build", "-M", "html", "docs/source", "docs/build", "-a"]) + if open_docs: + docs_page = (Path.cwd() / "docs" / "build" / "html" / "index.html").as_uri() + secho("Opening {}".format(docs_page)) + webbrowser.open(docs_page) + + +@cli.command("build-reqs") +def build_reqs(): + """Build the project dependency requirements.""" + requirements_path = Path.cwd() / "src" / "requirements.in" + if not requirements_path.is_file(): + secho("No requirements.in found. Copying contents from requirements.txt...") + contents = (Path.cwd() / "src" / "requirements.txt").read_text() + requirements_path.write_text(contents) + python_call("piptools", ["compile", str(requirements_path)]) + secho( + ( + "Requirements built! 
Please update requirements.in " + "if you'd like to make a change in your project's dependencies, " + "and re-run build-reqs to generate the new requirements.txt." + ) + ) + + +@cli.command("activate-nbstripout") +def activate_nbstripout(): + """Install the nbstripout git hook to automatically clean notebooks.""" + secho( + ( + "Notebook output cells will be automatically cleared before committing" + " to git." + ), + fg="yellow", + ) + + try: + import nbstripout # pylint: disable=unused-import + except ImportError: + raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("nbstripout")) + + try: + res = subprocess.run( + ["git", "rev-parse", "--git-dir"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + if res.returncode: + raise KedroCliError("Not a git repository. Run `git init` first.") + except FileNotFoundError: + raise KedroCliError("Git executable not found. Install Git first.") + + call(["nbstripout", "--install"]) + + +def _build_jupyter_command( + base: str, ip: str, all_kernels: bool, args: Iterable[str], idle_timeout: int +) -> List[str]: + cmd = [ + base, + "--ip", + ip, + "--MappingKernelManager.cull_idle_timeout={}".format(idle_timeout), + "--MappingKernelManager.cull_interval={}".format(idle_timeout), + ] + + if not all_kernels: + project_name = "Default Kedro 157" + kernel_name = re.sub(r"[^\w]+", "", project_name).strip() or "Kedro" + + cmd += [ + "--NotebookApp.kernel_spec_manager_class=" + "kedro.cli.jupyter.SingleKernelSpecManager", + "--KernelSpecManager.default_kernel_name='{}'".format(kernel_name), + ] + + return cmd + list(args) + + +def _build_jupyter_env(kedro_env: str) -> Dict[str, Any]: + """Build the environment dictionary that gets injected into the subprocess running + Jupyter. Since the subprocess has access only to the environment variables passed + in, we need to copy the current environment and add ``KEDRO_ENV_VAR``. 
+ """ + if not kedro_env: + return {} + jupyter_env = os.environ.copy() + jupyter_env[KEDRO_ENV_VAR] = kedro_env + return {"env": jupyter_env} + + +@cli.group() +def jupyter(): + """Open Jupyter Notebook / Lab with project specific variables loaded, or + convert notebooks into Kedro code. + """ + + +@forward_command(jupyter, "notebook", forward_help=True) +@click.option("--ip", type=str, default="127.0.0.1", help=JUPYTER_IP_HELP) +@click.option( + "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP +) +@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP) +@click.option( + "--env", + "-e", + type=str, + default=None, + multiple=False, + envvar=KEDRO_ENV_VAR, + help=ENV_ARG_HELP, +) +def jupyter_notebook(ip, all_kernels, env, idle_timeout, args): + """Open Jupyter Notebook with project specific variables loaded.""" + if "-h" not in args and "--help" not in args: + ipython_message(all_kernels) + + arguments = _build_jupyter_command( + "notebook", ip=ip, all_kernels=all_kernels, args=args, idle_timeout=idle_timeout + ) + + python_call_kwargs = _build_jupyter_env(env) + python_call("jupyter", arguments, **python_call_kwargs) + + +@forward_command(jupyter, "lab", forward_help=True) +@click.option("--ip", type=str, default="127.0.0.1", help=JUPYTER_IP_HELP) +@click.option( + "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP +) +@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP) +@click.option( + "--env", + "-e", + type=str, + default=None, + multiple=False, + envvar=KEDRO_ENV_VAR, + help=ENV_ARG_HELP, +) +def jupyter_lab(ip, all_kernels, env, idle_timeout, args): + """Open Jupyter Lab with project specific variables loaded.""" + if "-h" not in args and "--help" not in args: + ipython_message(all_kernels) + + arguments = _build_jupyter_command( + "lab", ip=ip, all_kernels=all_kernels, args=args, idle_timeout=idle_timeout + ) + + python_call_kwargs = 
_build_jupyter_env(env) + python_call("jupyter", arguments, **python_call_kwargs) + + +@jupyter.command("convert") +@click.option("--all", "all_flag", is_flag=True, help=CONVERT_ALL_HELP) +@click.option("-y", "overwrite_flag", is_flag=True, help=OVERWRITE_HELP) +@click.argument( + "filepath", + type=click.Path(exists=True, dir_okay=False, resolve_path=True), + required=False, + nargs=-1, +) +def convert_notebook(all_flag, overwrite_flag, filepath): + """Convert selected or all notebooks found in a Kedro project + to Kedro code, by exporting code from the appropriately-tagged cells: + Cells tagged as `node` will be copied over to a Python file matching + the name of the notebook, under `src//nodes`. + *Note*: Make sure your notebooks have unique names! + FILEPATH: Path(s) to exact notebook file(s) to be converted. Both + relative and absolute paths are accepted. + Should not be provided if --all flag is already present. + """ + context = load_context(Path.cwd()) + + if not filepath and not all_flag: + secho( + "Please specify a notebook filepath " + "or add '--all' to convert all notebooks." + ) + sys.exit(1) + + kedro_project_path = context.project_path + kedro_package_name = "default_kedro_157" + + if all_flag: + # pathlib glob does not ignore hidden directories, + # whereas Python glob does, which is more useful in + # ensuring checkpoints will not be included + pattern = kedro_project_path / "**" / "*.ipynb" + notebooks = sorted(Path(p) for p in iglob(str(pattern), recursive=True)) + else: + notebooks = [Path(f) for f in filepath] + + counter = Counter(n.stem for n in notebooks) + non_unique_names = [name for name, counts in counter.items() if counts > 1] + if non_unique_names: + raise KedroCliError( + "Found non-unique notebook names! 
" + "Please rename the following: {}".format(", ".join(non_unique_names)) + ) + + for notebook in notebooks: + secho("Converting notebook '{}'...".format(str(notebook))) + output_path = ( + kedro_project_path + / "src" + / kedro_package_name + / "nodes" + / "{}.py".format(notebook.stem) + ) + + if output_path.is_file(): + overwrite = overwrite_flag or click.confirm( + "Output file {} already exists. Overwrite?".format(str(output_path)), + default=False, + ) + if overwrite: + export_nodes(notebook, output_path) + else: + export_nodes(notebook, output_path) + + secho("Done!") + + +def ipython_message(all_kernels=True): + """Show a message saying how we have configured the IPython env.""" + ipy_vars = ["startup_error", "context"] + secho("-" * 79, fg="cyan") + secho("Starting a Kedro session with the following variables in scope") + secho(", ".join(ipy_vars), fg="green") + secho( + "Use the line magic {} to refresh them".format( + style("%reload_kedro", fg="green") + ) + ) + secho("or to see the error message if they are undefined") + + if not all_kernels: + secho("The choice of kernels is limited to the default one.", fg="yellow") + secho("(restart with --all-kernels to get access to others)", fg="yellow") + + secho("-" * 79, fg="cyan") + + +if __name__ == "__main__": + os.chdir(str(PROJ_PATH)) + kedro_main() diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/logs/journals/.gitkeep b/logs/journals/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8f0a267 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[tool:pytest] +addopts=--cov-report term-missing + --cov src/default_kedro_157 -ra diff --git a/src/default_kedro_157/__init__.py b/src/default_kedro_157/__init__.py new file mode 100644 index 0000000..894d411 --- /dev/null +++ 
b/src/default_kedro_157/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Default Kedro 157 +""" + +__version__ = "0.1" diff --git a/src/default_kedro_157/nodes/__init__.py b/src/default_kedro_157/nodes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/default_kedro_157/pipeline.py b/src/default_kedro_157/pipeline.py new file mode 100644 index 0000000..2c617a1 --- /dev/null +++ b/src/default_kedro_157/pipeline.py @@ -0,0 +1,68 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Construction of the master pipeline. 
+""" + +from typing import Dict +from kedro.pipeline import Pipeline + + + +########################################################################### +# Here you can find an example pipeline, made of two modular pipelines. +# +# Delete this when you start working on your own Kedro project as +# well as pipelines/data_science AND pipelines/data_engineering +# ------------------------------------------------------------------------- + +from default_kedro_157.pipelines import data_engineering as de +from default_kedro_157.pipelines import data_science as ds + + +def create_pipelines(**kwargs) -> Dict[str, Pipeline]: + """Create the project's pipeline. + + Args: + kwargs: Ignore any additional arguments added in the future. + + Returns: + A mapping from a pipeline name to a ``Pipeline`` object. + + """ + + + data_engineering_pipeline = de.create_pipeline() + data_science_pipeline = ds.create_pipeline() + + return { + "de": data_engineering_pipeline, + "ds": data_science_pipeline, + "__default__": data_engineering_pipeline + data_science_pipeline, + } + diff --git a/src/default_kedro_157/pipelines/__init__.py b/src/default_kedro_157/pipelines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/default_kedro_157/pipelines/data_engineering/README.md b/src/default_kedro_157/pipelines/data_engineering/README.md new file mode 100644 index 0000000..50bab30 --- /dev/null +++ b/src/default_kedro_157/pipelines/data_engineering/README.md @@ -0,0 +1,53 @@ +# Data Engineering pipeline + +> *Note:* This `README.md` was generated using `Kedro 0.15.7` for illustration purposes. Please modify it according to your pipeline structure and contents. 
+ +## Overview + +This modular pipeline splits the incoming data into the train and test subsets (`split_data` node) + +## Pipeline inputs + +### `example_iris_data` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | Input data to split into train and test sets | + +### `params:example_test_data_ratio` + +| | | +| ---- | ------------------ | +| Type | `float` | +| Description | The split ratio parameter that identifies what percentage of rows goes to the test set | + +## Pipeline outputs + +### `example_train_x` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing train set features | + +### `example_train_y` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing train set one-hot encoded target variable | + +### `example_test_x` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing test set features | + +### `example_test_y` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing test set one-hot encoded target variable | diff --git a/src/default_kedro_157/pipelines/data_engineering/__init__.py b/src/default_kedro_157/pipelines/data_engineering/__init__.py new file mode 100644 index 0000000..afca3d4 --- /dev/null +++ b/src/default_kedro_157/pipelines/data_engineering/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT.
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example code for the nodes in the example pipeline. This code is meant +just for illustrating basic Kedro features. + +PLEASE DELETE THIS FILE ONCE YOU START WORKING ON YOUR OWN PROJECT! +""" + +from .pipeline import create_pipeline # NOQA diff --git a/src/default_kedro_157/pipelines/data_engineering/nodes.py b/src/default_kedro_157/pipelines/data_engineering/nodes.py new file mode 100644 index 0000000..52a891b --- /dev/null +++ b/src/default_kedro_157/pipelines/data_engineering/nodes.py @@ -0,0 +1,78 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example code for the nodes in the example pipeline. This code is meant +just for illustrating basic Kedro features. + +PLEASE DELETE THIS FILE ONCE YOU START WORKING ON YOUR OWN PROJECT! +""" + +from typing import Any, Dict + +import pandas as pd + + +def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> Dict[str, Any]: + """Node for splitting the classical Iris data set into training and test + sets, each split into features and labels. + The split ratio parameter is taken from conf/project/parameters.yml. + The data and the parameters will be loaded and provided to your function + automatically when the pipeline is executed and it is time to run this node. 
+ """ + data.columns = [ + "sepal_length", + "sepal_width", + "petal_length", + "petal_width", + "target", + ] + classes = sorted(data["target"].unique()) + # One-hot encoding for the target variable + data = pd.get_dummies(data, columns=["target"], prefix="", prefix_sep="") + + # Shuffle all the data + data = data.sample(frac=1).reset_index(drop=True) + + # Split to training and testing data + n = data.shape[0] + n_test = int(n * example_test_data_ratio) + training_data = data.iloc[n_test:, :].reset_index(drop=True) + test_data = data.iloc[:n_test, :].reset_index(drop=True) + + # Split the data to features and labels + train_data_x = training_data.loc[:, "sepal_length":"petal_width"] + train_data_y = training_data[classes] + test_data_x = test_data.loc[:, "sepal_length":"petal_width"] + test_data_y = test_data[classes] + + # When returning many variables, it is a good practice to give them names: + return dict( + train_x=train_data_x, + train_y=train_data_y, + test_x=test_data_x, + test_y=test_data_y, + ) diff --git a/src/default_kedro_157/pipelines/data_engineering/pipeline.py b/src/default_kedro_157/pipelines/data_engineering/pipeline.py new file mode 100644 index 0000000..ffb8ca8 --- /dev/null +++ b/src/default_kedro_157/pipelines/data_engineering/pipeline.py @@ -0,0 +1,54 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Example code for the nodes in the example pipeline. This code is meant +just for illustrating basic Kedro features. + +Delete this when you start working on your own Kedro project. +""" + +from kedro.pipeline import Pipeline, node + +from .nodes import split_data + + +def create_pipeline(**kwargs): + return Pipeline( + [ + node( + split_data, + ["example_iris_data", "params:example_test_data_ratio"], + dict( + train_x="example_train_x", + train_y="example_train_y", + test_x="example_test_x", + test_y="example_test_y", + ), + ) + ] + ) diff --git a/src/default_kedro_157/pipelines/data_science/README.md b/src/default_kedro_157/pipelines/data_science/README.md new file mode 100644 index 0000000..210d5c2 --- /dev/null +++ b/src/default_kedro_157/pipelines/data_science/README.md @@ -0,0 +1,58 @@ +# Data Science pipeline + +> *Note:* This `README.md` was generated using `Kedro 0.15.7` for illustration purposes. Please modify it according to your pipeline structure and contents. + +## Overview + +This modular pipeline: +1. 
trains a simple multi-class logistic regression model (`train_model` node) +2. makes predictions given a trained model from (1) and a test set (`predict` node) +3. reports the model accuracy on a test set (`report_accuracy` node) + + +## Pipeline inputs + +### `example_train_x` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing train set features | + +### `example_train_y` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing train set one-hot encoded target variable | + +### `example_test_x` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing test set features | + +### `example_test_y` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing test set one-hot encoded target variable | + +### `parameters` + +| | | +| ---- | ------------------ | +| Type | `dict` | +| Description | Project parameter dictionary that must contain the following keys: `example_num_train_iter` (number of model training iterations), `example_learning_rate` (learning rate for gradient descent) | + + +## Pipeline outputs + +### `example_model` + +| | | +| ---- | ------------------ | +| Type | `numpy.ndarray` | +| Description | Example logistic regression model | diff --git a/src/default_kedro_157/pipelines/data_science/__init__.py b/src/default_kedro_157/pipelines/data_science/__init__.py new file mode 100644 index 0000000..afca3d4 --- /dev/null +++ b/src/default_kedro_157/pipelines/data_science/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example code for the nodes in the example pipeline. This code is meant +just for illustrating basic Kedro features. + +PLEASE DELETE THIS FILE ONCE YOU START WORKING ON YOUR OWN PROJECT! +""" + +from .pipeline import create_pipeline # NOQA diff --git a/src/default_kedro_157/pipelines/data_science/nodes.py b/src/default_kedro_157/pipelines/data_science/nodes.py new file mode 100644 index 0000000..a90c487 --- /dev/null +++ b/src/default_kedro_157/pipelines/data_science/nodes.py @@ -0,0 +1,109 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Example code for the nodes in the example pipeline. This code is meant +just for illustrating basic Kedro features. + +Delete this when you start working on your own Kedro project. +""" +# pylint: disable=invalid-name + +import logging +from typing import Any, Dict + +import numpy as np +import pandas as pd + + +def train_model( + train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] +) -> np.ndarray: + """Node for training a simple multi-class logistic regression model. The + number of training iterations as well as the learning rate are taken from + conf/project/parameters.yml. All of the data as well as the parameters + will be provided to this function at the time of execution. 
+ """ + num_iter = parameters["example_num_train_iter"] + lr = parameters["example_learning_rate"] + X = train_x.to_numpy() + Y = train_y.to_numpy() + + # Add bias to the features + bias = np.ones((X.shape[0], 1)) + X = np.concatenate((bias, X), axis=1) + + weights = [] + # Train one model for each class in Y + for k in range(Y.shape[1]): + # Initialise weights + theta = np.zeros(X.shape[1]) + y = Y[:, k] + for _ in range(num_iter): + z = np.dot(X, theta) + h = _sigmoid(z) + gradient = np.dot(X.T, (h - y)) / y.size + theta -= lr * gradient + # Save the weights for each model + weights.append(theta) + + # Return a joint multi-class model with weights for all classes + return np.vstack(weights).transpose() + + +def predict(model: np.ndarray, test_x: pd.DataFrame) -> np.ndarray: + """Node for making predictions given a pre-trained model and a test set. + """ + X = test_x.to_numpy() + + # Add bias to the features + bias = np.ones((X.shape[0], 1)) + X = np.concatenate((bias, X), axis=1) + + # Predict "probabilities" for each class + result = _sigmoid(np.dot(X, model)) + + # Return the index of the class with max probability for all samples + return np.argmax(result, axis=1) + + +def report_accuracy(predictions: np.ndarray, test_y: pd.DataFrame) -> None: + """Node for reporting the accuracy of the predictions performed by the + previous node. Notice that this function has no outputs, except logging. 
+ """ + # Get true class index + target = np.argmax(test_y.to_numpy(), axis=1) + # Calculate accuracy of predictions + accuracy = np.sum(predictions == target) / target.shape[0] + # Log the accuracy of the model + log = logging.getLogger(__name__) + log.info("Model accuracy on test set: %0.2f%%", accuracy * 100) + + +def _sigmoid(z): + """A helper sigmoid function used by the training and the scoring nodes.""" + return 1 / (1 + np.exp(-z)) diff --git a/src/default_kedro_157/pipelines/data_science/pipeline.py b/src/default_kedro_157/pipelines/data_science/pipeline.py new file mode 100644 index 0000000..ba1003f --- /dev/null +++ b/src/default_kedro_157/pipelines/data_science/pipeline.py @@ -0,0 +1,55 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Example code for the nodes in the example pipeline. This code is meant +just for illustrating basic Kedro features. + +Delete this when you start working on your own Kedro project. +""" + +from kedro.pipeline import Pipeline, node + +from .nodes import predict, report_accuracy, train_model + + +def create_pipeline(**kwargs): + return Pipeline( + [ + node( + train_model, + ["example_train_x", "example_train_y", "parameters"], + "example_model", + ), + node( + predict, + dict(model="example_model", test_x="example_test_x"), + "example_predictions", + ), + node(report_accuracy, ["example_predictions", "example_test_y"], None), + ] + ) diff --git a/src/default_kedro_157/run.py b/src/default_kedro_157/run.py new file mode 100644 index 0000000..036a507 --- /dev/null +++ b/src/default_kedro_157/run.py @@ -0,0 +1,61 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Application entry point.""" +from pathlib import Path +from typing import Dict + +from kedro.context import KedroContext, load_context +from kedro.pipeline import Pipeline + +from default_kedro_157.pipeline import create_pipelines + + +class ProjectContext(KedroContext): + """Users can override the remaining methods from the parent class here, + or create new ones (e.g. 
as required by plugins) + """ + + project_name = "Default Kedro 157" + project_version = "0.15.7" + + def _get_pipelines(self) -> Dict[str, Pipeline]: + return create_pipelines() + + +def run_package(): + # entry point for running pip-install projects + # using `` command + project_context = load_context(Path.cwd()) + project_context.run() + + +if __name__ == "__main__": + # entry point for running pip-installed projects + # using `python -m .run` command + run_package() diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..b49f80a --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,11 @@ +ipython>=7.0.0, <8.0 +jupyter>=1.0.0, <2.0 +jupyter_client>=5.1.0, <6.0 +jupyterlab==0.31.1 +kedro==0.15.7 +kedro-viz>=3.1.0, <4.0 +nbstripout==0.3.3 +pytest-cov>=2.5, <3.0 +pytest-mock>=1.7.1,<2.0 +pytest>=3.4, <4.0 +wheel==0.32.2 diff --git a/src/setup.py b/src/setup.py new file mode 100644 index 0000000..567cafb --- /dev/null +++ b/src/setup.py @@ -0,0 +1,67 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. 
The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +from setuptools import find_packages, setup + +entry_point = ( + "default-kedro-157 = default_kedro_157.run:run_package" +) + + +# get the dependencies and installs +with open("requirements.txt", "r", encoding="utf-8") as f: + # Make sure we strip all comments and options (e.g "--extra-index-url") + # that arise from a modified pip.conf file that configure global options + # when running kedro build-reqs + requires = [] + for line in f: + req = line.split("#", 1)[0].strip() + if req and not req.startswith("--"): + requires.append(req) + +setup( + name="default_kedro_157", + version="0.1", + packages=find_packages(exclude=["tests"]), + entry_points={"console_scripts": [entry_point]}, + install_requires=requires, + extras_require={ + "docs": [ + "sphinx>=1.6.3, <2.0", + "sphinx_rtd_theme==0.4.1", + "nbsphinx==0.3.4", + "nbstripout==0.3.3", + "recommonmark==0.5.0", + "sphinx-autodoc-typehints==1.6.0", + "sphinx_copybutton==0.2.5", + "jupyter_client>=5.1.0, <6.0", + "tornado>=4.2, <6.0", + "ipykernel>=4.8.1, <5.0", + ] + }, +) diff --git a/src/tests/__init__.py b/src/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/test_run.py b/src/tests/test_run.py new file mode 100644 index 0000000..70ad804 --- /dev/null +++ b/src/tests/test_run.py @@ -0,0 +1,55 @@ +# Copyright 2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module contains an example test. + +Tests should be placed in ``src/tests``, in modules that mirror your +project's structure, and in files named test_*.py. They are simply functions +named ``test_*`` which test a unit of logic. + +To run the tests, run ``kedro test``. +""" +from pathlib import Path + +import pytest + +from default_kedro_157.run import ProjectContext + + +@pytest.fixture +def project_context(): + return ProjectContext(str(Path.cwd())) + + +class TestProjectContext: + def test_project_name(self, project_context): + assert project_context.project_name == "Default Kedro 157" + + def test_project_version(self, project_context): + assert project_context.project_version == "0.15.7"