From d691b968f30b96b2bb15236450023e6e1f8a60aa Mon Sep 17 00:00:00 2001
From: Takieddine Kadiri
Date: Sun, 11 Oct 2020 15:21:13 +0200
Subject: [PATCH 1/3] FIX #66 #64 #54 #30 #31 #72 #29 #62 - context access -
 kedro 0.16.5 - hook auto registration

---
 .isort.cfg                                    |   2 +-
 CHANGELOG.md                                  |   6 +
 kedro_mlflow/framework/cli/cli.py             | 142 +++++++++---------
 .../framework/context/mlflow_context.py       |  19 +--
 kedro_mlflow/framework/hooks/node_hook.py     |  44 +++++-
 kedro_mlflow/framework/hooks/pipeline_hook.py |  15 +-
 kedro_mlflow/template/project/run.py          |  16 +-
 requirements/requirements.txt                 |   2 +-
 setup.py                                      |  20 +--
 tests/conftest.py                             |  21 ++-
 tests/dummy_run.py                            |  13 ++
 tests/framework/cli/test_cli.py               |  70 ++++-----
 .../framework/context/test_mlflow_context.py  |   5 +-
 tests/framework/hooks/test_node_hook.py       |  75 +++++++--
 tests/framework/hooks/test_pipeline_hook.py   |  22 +--
 tests/mlflow/test_kedro_pipeline_model.py     |   3 +
 .../template/project/test_template_pyfiles.py | 109 --------------
 17 files changed, 303 insertions(+), 281 deletions(-)
 create mode 100644 tests/dummy_run.py
 delete mode 100644 tests/template/project/test_template_pyfiles.py

diff --git a/.isort.cfg b/.isort.cfg
index 6058eb54..bc9982cd 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -7,4 +7,4 @@ line_length=88
 ensure_newline_before_comments=True
 sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
 known_first_party=kedro_mlflow
-known_third_party=black,click,cookiecutter,flake8,isort,jinja2,kedro,mlflow,pandas,pytest,pytest_lazyfixture,setuptools,yaml
+known_third_party=click,cookiecutter,jinja2,kedro,mlflow,pandas,pytest,pytest_lazyfixture,setuptools,yaml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 93720f17..0a9ea15f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,12 +4,15 @@

 ### Added

+- kedro-mlflow hooks are now auto-registered ([#29](https://github.com/Galileo-Galilei/kedro-mlflow/issues/29))
 - Add dataset ``MlflowMetricsDataSet`` for metrics logging ([#9](https://github.com/Galileo-Galilei/kedro-mlflow/issues/9)) and update documentation for metrics.
 - Add CI workflow `create-release` to ensure release consistency and up-to-date CHANGELOG. ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
 - Add templates for issues and pull requests ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))

 ### Fixed

+- `get_mlflow_config` now uses the kedro project `config_loader` to get configs ([#66](https://github.com/Galileo-Galilei/kedro-mlflow/issues/66), [#64](https://github.com/Galileo-Galilei/kedro-mlflow/issues/64), [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54), [#30](https://github.com/Galileo-Galilei/kedro-mlflow/issues/30), [#31](https://github.com/Galileo-Galilei/kedro-mlflow/issues/31), [#72](https://github.com/Galileo-Galilei/kedro-mlflow/issues/72))
+- Fix the tests broken by the kedro 0.16.5 update ([#62](https://github.com/Galileo-Galilei/kedro-mlflow/issues/62))
 - Versioned datasets artifacts logging are handled correctly ([#41](https://github.com/Galileo-Galilei/kedro-mlflow/issues/41))
 - MlflowDataSet handles correctly datasets which are inherited from AbstractDataSet ([#45](https://github.com/Galileo-Galilei/kedro-mlflow/issues/45))
 - Change the test in `_generate_kedro_command` to accept both empty `Iterable`s (default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
@@ -21,6 +24,9 @@

 ### Changed

+- Bump the kedro requirement to <=0.16.5
+- `kedro mlflow init` no longer modifies the `run.py` file
+- `MlflowNodeHook` now has a `before_pipeline_run` hook so it can access the project run parameters
 - Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for the upcoming hook auto-discovery in a future release and it enables having multiple `PipelineML` in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making the `conda_env` path absolute for airflow support.
 - `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
 - Rename CI workflow to `test` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
diff --git a/kedro_mlflow/framework/cli/cli.py b/kedro_mlflow/framework/cli/cli.py
index c58f11a7..bb502b54 100644
--- a/kedro_mlflow/framework/cli/cli.py
+++ b/kedro_mlflow/framework/cli/cli.py
@@ -1,16 +1,12 @@
-import os
 import subprocess
 from pathlib import Path

 import click
-from kedro import __file__ as KEDRO_PATH
+from kedro.framework.context import get_static_project_data, load_context

-from kedro_mlflow.framework.cli.cli_utils import (
-    render_jinja_template,
-    write_jinja_template,
-)
+from kedro_mlflow.framework.cli.cli_utils import write_jinja_template
 from kedro_mlflow.framework.context import get_mlflow_config
-from kedro_mlflow.utils import _already_updated, _get_project_globals, _is_kedro_project
+from kedro_mlflow.utils import _already_updated, _is_kedro_project

 TEMPLATE_FOLDER_PATH = Path(__file__).parent.parent.parent / "template" / "project"
@@ -87,80 +83,86 @@ def init(force, silent):

     # get constants
     project_path = Path().cwd()
-    project_globals = _get_project_globals()
+    context = load_context(project_path)
+    # project_globals = _get_project_globals()
+    project_globals = get_static_project_data(project_path)

     # mlflow.yml is just a static file,
     # but the name of the experiment is set to be the same as the project
+    conf_root = context.CONF_ROOT
     mlflow_yml = "mlflow.yml"
     write_jinja_template(
         src=TEMPLATE_FOLDER_PATH / mlflow_yml,
         is_cookiecutter=False,
-        dst=project_path / "conf" / "base" / mlflow_yml,
-        python_package=project_globals["python_package"],
+        dst=project_path / conf_root / "base" / mlflow_yml,
+        python_package=project_globals["package_name"],
     )
     if not silent:
         click.secho(
-            click.style("'conf/base/mlflow.yml' successfully updated.", fg="green")
+            click.style(
+                f"'{conf_root}/base/mlflow.yml' successfully updated.", fg="green"
+            )
         )

     # make a check whether the project run.py is strictly identical to the template
     # if yes, replace the script by the template silently
     # if no, raise a warning and send a message to INSERT_DOC_URL
-    flag_erase_runpy = force
-    runpy_project_path = (
-        project_path
-        / "src"
-        / (Path(project_globals["context_path"]).parent.as_posix() + ".py")
-    )
-    if not force:
-        kedro_path = Path(KEDRO_PATH).parent
-        runpy_template_path = (
-            kedro_path
-            / "templates"
-            / "project"
-            / "{{ cookiecutter.repo_name }}"
-            / "src"
-            / "{{ cookiecutter.python_package }}"
-            / "run.py"
-        )
-        kedro_runpy_template = render_jinja_template(
-            src=runpy_template_path,
-            is_cookiecutter=True,
-            python_package=project_globals["python_package"],
-            project_name=project_globals["project_name"],
-            kedro_version=project_globals["kedro_version"],
-        )
-
-        with open(runpy_project_path, mode="r") as file_handler:
-            kedro_runpy_project = file_handler.read()
-
-        # beware : black formatting could change slightly this test which is very strict
-        if kedro_runpy_project == kedro_runpy_template:
-            flag_erase_runpy = True
-
-    if flag_erase_runpy:
-        os.remove(runpy_project_path)
-        write_jinja_template(
-            src=TEMPLATE_FOLDER_PATH / "run.py",
-            dst=runpy_project_path,
-            is_cookiecutter=True,
-            python_package=project_globals["python_package"],
-            project_name=project_globals["project_name"],
-            kedro_version=project_globals["kedro_version"],
-        )
-        if not silent:
-            click.secho(click.style("'run.py' successfully updated", fg="green"))
-    else:
-        click.secho(
-            click.style(
-                "You have modified your 'run.py' since project creation.\n"
-                + "In order to use kedro-mlflow, you must either:\n"
-                + "  - set up your run.py with the following instructions :\n"
-                + "INSERT_DOC_URL\n"
-                + "  - call the following command:\n"
-                + "$ kedro mlflow init --force",
-                fg="yellow",
-            )
-        )
+    # flag_erase_runpy = force
+    # runpy_project_path = (
+    #     project_path
+    #     / "src"
+    #     / project_globals["package_name"]
+    #     / "run.py"
+    # )
+    # if not force:
+    #     kedro_path = Path(KEDRO_PATH).parent
+    #     runpy_template_path = (
+    #         kedro_path
+    #         / "templates"
+    #         / "project"
+    #         / "{{ cookiecutter.repo_name }}"
+    #         / "src"
+    #         / "{{ cookiecutter.python_package }}"
+    #         / "run.py"
+    #     )
+    #     kedro_runpy_template = render_jinja_template(
+    #         src=runpy_template_path,
+    #         is_cookiecutter=True,
+    #         python_package=project_globals["package_name"],
+    #         project_name=project_globals["project_name"],
+    #         kedro_version=project_globals["project_version"],
+    #     )
+
+    #     with open(runpy_project_path, mode="r") as file_handler:
+    #         kedro_runpy_project = file_handler.read()
+
+    #     # beware : black formatting could change slightly this test which is very strict
+    #     if kedro_runpy_project == kedro_runpy_template:
+    #         flag_erase_runpy = True
+
+    #     if flag_erase_runpy:
+    #         os.remove(runpy_project_path)
+    #         write_jinja_template(
+    #             src=TEMPLATE_FOLDER_PATH / "run.py",
+    #             dst=runpy_project_path,
+    #             is_cookiecutter=True,
+    #             python_package=project_globals["package_name"],
+    #             project_name=project_globals["project_name"],
+    #             kedro_version=project_globals["project_version"],
+    #         )
+    #         if not silent:
+    #             click.secho(click.style("'run.py' successfully updated", fg="green"))
+    #     else:
+    #         click.secho(
+    #             click.style(
+    #                 "You have modified your 'run.py' since project creation.\n"
+    #                 + "In order to use kedro-mlflow, you must either:\n"
+    #                 + "  - set up your run.py with the following instructions :\n"
+    #                 + "INSERT_DOC_URL\n"
+    #                 + "  - call the following command:\n"
+    #                 + "$ kedro mlflow init --force",
+    #                 fg="yellow",
+    #             )
+    #         )


 @mlflow_commands.command()
@@ -185,7 +187,11 @@ def ui(project_path, env):
     """

     # the context must contain the self.mlflow attributes with the mlflow configuration
-    mlflow_conf = get_mlflow_config(project_path=project_path, env=env)
+    if not project_path:
+        project_path = Path().cwd()
+
+    context = load_context(project_path=project_path, env=env)
+    mlflow_conf = get_mlflow_config(context)

     # call mlflow ui with specific options
     # TODO : add more options for ui
diff --git a/kedro_mlflow/framework/context/mlflow_context.py b/kedro_mlflow/framework/context/mlflow_context.py
index 98bf8e93..8ed91bae 100644
--- a/kedro_mlflow/framework/context/mlflow_context.py
+++ b/kedro_mlflow/framework/context/mlflow_context.py
@@ -1,6 +1,4 @@
-from pathlib import Path
-
-from kedro.config import ConfigLoader
+from kedro.framework.context import KedroContext

 from kedro_mlflow.framework.context.config import KedroMlflowConfig
@@ -8,16 +6,9 @@
 # this could be a read-only property in the context
 # with a @property decorator
 # but for consistency with hook system, it is an external function
-def get_mlflow_config(project_path=None, env="local"):
-    if project_path is None:
-        project_path = Path.cwd()
-    project_path = Path(project_path)
-    conf_paths = [
-        str(project_path / "conf" / "base"),
-        str(project_path / "conf" / env),
-    ]
-    config_loader = ConfigLoader(conf_paths=conf_paths)
-    conf_mlflow_yml = config_loader.get("mlflow*", "mlflow*/**")
-    conf_mlflow = KedroMlflowConfig(project_path=project_path)
+def get_mlflow_config(context: KedroContext):
+
+    conf_mlflow_yml = context.config_loader.get("mlflow*", "mlflow*/**")
+    conf_mlflow = KedroMlflowConfig(context.project_path)
     conf_mlflow.from_dict(conf_mlflow_yml)
     return conf_mlflow
diff --git a/kedro_mlflow/framework/hooks/node_hook.py b/kedro_mlflow/framework/hooks/node_hook.py
index 95585df4..0bd354ac 100644
--- a/kedro_mlflow/framework/hooks/node_hook.py
+++ b/kedro_mlflow/framework/hooks/node_hook.py
@@ -1,8 +1,10 @@
 from typing import Any, Dict

 import mlflow
+from kedro.framework.context import load_context
 from kedro.framework.hooks import hook_impl
 from kedro.io import DataCatalog
+from kedro.pipeline import Pipeline
 from kedro.pipeline.node import Node

 from kedro_mlflow.framework.context import get_mlflow_config
@@ -10,7 +12,44 @@

 class MlflowNodeHook:
     def __init__(self):
-        config = get_mlflow_config()
+        self.context = None
+        self.flatten = False
+        self.recursive = True
+        self.sep = "."
+
+    @hook_impl
+    def before_pipeline_run(
+        self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
+    ) -> None:
+        """Hook to be invoked before a pipeline runs.
+        Args:
+            run_params: The params needed for the given run.
+                Should be identical to the data logged by Journal.
+                # @fixme: this needs to be modelled explicitly as code, instead of comment
+                Schema: {
+                    "run_id": str,
+                    "project_path": str,
+                    "env": str,
+                    "kedro_version": str,
+                    "tags": Optional[List[str]],
+                    "from_nodes": Optional[List[str]],
+                    "to_nodes": Optional[List[str]],
+                    "node_names": Optional[List[str]],
+                    "from_inputs": Optional[List[str]],
+                    "load_versions": Optional[List[str]],
+                    "pipeline_name": str,
+                    "extra_params": Optional[Dict[str, Any]],
+                }
+            pipeline: The ``Pipeline`` that will be run.
+            catalog: The ``DataCatalog`` to be used during the run.
+        """
+
+        self.context = load_context(
+            project_path=run_params["project_path"],
+            env=run_params["env"],
+            extra_params=run_params["extra_params"],
+        )
+        config = get_mlflow_config(self.context)
         self.flatten = config.node_hook_opts["flatten_dict_params"]
         self.recursive = config.node_hook_opts["recursive"]
         self.sep = config.node_hook_opts["sep"]
@@ -51,6 +90,9 @@ def before_node_run(
         mlflow.log_params(params_inputs)


+mlflow_node_hooks = MlflowNodeHook()
+
+
 def flatten_dict(d, recursive: bool = True, sep="."):
     def expand(key, value):
         if isinstance(value, dict):
diff --git a/kedro_mlflow/framework/hooks/pipeline_hook.py b/kedro_mlflow/framework/hooks/pipeline_hook.py
index 7c8f3ba3..cc7a029e 100644
--- a/kedro_mlflow/framework/hooks/pipeline_hook.py
+++ b/kedro_mlflow/framework/hooks/pipeline_hook.py
@@ -4,6 +4,7 @@

 import mlflow
 import yaml
+from kedro.framework.context import load_context
 from kedro.framework.hooks import hook_impl
 from kedro.io import DataCatalog
 from kedro.pipeline import Pipeline
@@ -17,6 +18,9 @@


 class MlflowPipelineHook:
+    def __init__(self):
+        self.context = None
+
     @hook_impl
     def after_catalog_created(
         self,
@@ -62,9 +66,13 @@ def before_pipeline_run(
             pipeline: The ``Pipeline`` that will be run.
             catalog: The ``DataCatalog`` to be used during the run.
         """
-        mlflow_conf = get_mlflow_config(
-            project_path=run_params["project_path"], env=run_params["env"]
+        context = load_context(
+            project_path=run_params["project_path"],
+            env=run_params["env"],
+            extra_params=run_params["extra_params"],
         )
+
+        mlflow_conf = get_mlflow_config(context)
         mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)
         # TODO : if the pipeline fails, we need to be able to stop the mlflow run
         # cannot figure out how to do this within hooks
@@ -177,6 +185,9 @@ def on_pipeline_error(
         mlflow.end_run()


+mlflow_pipeline_hooks = MlflowPipelineHook()
+
+
 def _generate_kedro_command(
     tags, node_names, from_nodes, to_nodes, from_inputs, load_versions, pipeline_name
 ):
diff --git a/kedro_mlflow/template/project/run.py b/kedro_mlflow/template/project/run.py
index 6e8d3228..0e70577f 100644
--- a/kedro_mlflow/template/project/run.py
+++ b/kedro_mlflow/template/project/run.py
@@ -28,34 +28,20 @@
 """Application entry point."""
 from pathlib import Path
-from typing import Dict

 from kedro.framework.context import KedroContext, load_package_context
-from kedro.pipeline import Pipeline
-
-from {{ cookiecutter.python_package }}.pipeline import create_pipelines
-
 from kedro_mlflow.framework.hooks import MlflowNodeHook, MlflowPipelineHook

-
 class ProjectContext(KedroContext):
     """Users can override the remaining methods from the parent class here,
     or create new ones (e.g. as required by plugins)
     """

-    project_name = "{{ cookiecutter.project_name }}"
-    # `project_version` is the version of kedro used to generate the project
-    project_version = "{{ cookiecutter.kedro_version }}"
-    package_name = "{{ cookiecutter.python_package }}"
     hooks = (
         MlflowNodeHook(),
         MlflowPipelineHook(),
     )

-    def _get_pipelines(self) -> Dict[str, Pipeline]:
-        return create_pipelines()
-
-
 def run_package():
     # Entry point for running a Kedro project packaged with `kedro package`
     # using `python -m .run` command.
@@ -66,4 +52,4 @@ def run_package():


 if __name__ == "__main__":
-    run_package()
+    run_package()
\ No newline at end of file
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 6562d31a..96f51986 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,2 +1,2 @@
 mlflow>=1.0.0, <2.0.0
-kedro>=0.16.0, <=0.16.4 # 0.16.5 breaks pipeline_ml, template and hooks test
+kedro>=0.16.0, <=0.16.5
diff --git a/setup.py b/setup.py
index 50d90029..6ad794cc 100644
--- a/setup.py
+++ b/setup.py
@@ -1,20 +1,12 @@
 import pathlib
-import re

 from setuptools import find_packages, setup

+from kedro_mlflow import __version__ as VERSION
+
 NAME = "kedro_mlflow"
 HERE = pathlib.Path(__file__).parent

-# get package version
-with open((HERE / NAME / "__init__.py").as_posix(), encoding="utf-8") as file_handler:
-    result = re.search(r'__version__ *= *["\']([^"\']+)', file_handler.read())
-
-    if not result:
-        raise ValueError("Can't find the version in kedro/__init__.py")
-
-    VERSION = result.group(1)
-

 def _parse_requirements(path, encoding="utf-8"):
     with open(path, mode="r", encoding=encoding) as file_handler:
@@ -51,10 +43,14 @@ def _parse_requirements(path, encoding="utf-8"):
     author="Galileo-Galilei",
     entry_points={
         "kedro.project_commands": [
-            "kedro_mlflow = kedro_mlflow.framework.cli.cli:commands"
+            "kedro_mlflow = kedro_mlflow.framework.cli.cli:commands",
         ],
         "kedro.global_commands": [
-            "kedro_mlflow = kedro_mlflow.framework.cli.cli:commands"
+            "kedro_mlflow = kedro_mlflow.framework.cli.cli:commands",
+        ],
+        "kedro.hooks": [
+            "mlflow_pipeline_hooks = kedro_mlflow.framework.hooks.pipeline_hook:mlflow_pipeline_hooks",
+            "mlflow_node_hooks = kedro_mlflow.framework.hooks.node_hook:mlflow_node_hooks",
         ],
     },
     zip_safe=False,
diff --git a/tests/conftest.py b/tests/conftest.py
index d4a6d529..503e3a44 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,6 @@
+import os
 from pathlib import Path
+from shutil import copyfile
 from typing import Dict

 import pytest
@@ -49,7 +51,24 @@ def config_dir(tmp_path):
     credentials = tmp_path / "conf" / env / "credentials.yml"
     logging = tmp_path / "conf" / env / "logging.yml"
     parameters = tmp_path / "conf" / env / "parameters.yml"
+    kedro_yaml = tmp_path / ".kedro.yml"
     _write_yaml(catalog, dict())
     _write_yaml(parameters, dict())
     _write_yaml(credentials, dict())
-    _write_yaml(logging, _get_local_logging_config())
+    _write_yaml(logging, _get_local_logging_config())
+
+    _write_yaml(
+        kedro_yaml,
+        dict(
+            {
+                "context_path": "dummy_package.run.ProjectContext",
+                "project_name": "dummy_package",
+                "project_version": "0.16.5",
+                "package_name": "dummy_package",
+            }
+        ),
+    )
+
+    os.mkdir(tmp_path / "src")
+    os.mkdir(tmp_path / "src" / "dummy_package")
+    copyfile("tests/dummy_run.py", tmp_path / "src" / "dummy_package" / "run.py")
diff --git a/tests/dummy_run.py b/tests/dummy_run.py
new file mode 100644
index 00000000..882beb6d
--- /dev/null
+++ b/tests/dummy_run.py
@@ -0,0 +1,13 @@
+"""Application entry point."""
+from kedro.framework.context import KedroContext
+
+
+class ProjectContext(KedroContext):
+    """Users can override the remaining methods from the parent class here,
+    or create new ones (e.g. as required by plugins)
+    """
+
+    project_name = "dummy_package"
+    # `project_version` is the version of kedro used to generate the project
+    project_version = "0.16.5"
+    package_name = "dummy_package"
diff --git a/tests/framework/cli/test_cli.py b/tests/framework/cli/test_cli.py
index 637f3fdb..dd0e9da8 100644
--- a/tests/framework/cli/test_cli.py
+++ b/tests/framework/cli/test_cli.py
@@ -100,41 +100,41 @@ def test_cli_init(monkeypatch, tmp_path, kedro_project):
     assert "'conf/base/mlflow.yml' successfully updated." in result.output
     assert (project_path / "conf" / "base" / "mlflow.yml").is_file()

-    # check runpy file
-    assert "'run.py' successfully updated" in result.output
-    with open(project_path / "src" / "fake_project" / "run.py", "r") as file_handler:
-        runpy_txt = file_handler.read()
-        assert "MlflowPipelineHook" in runpy_txt
-        assert "MlflowNodeHook" in runpy_txt
-
-
-def test_cli_init_with_modified_runpy(monkeypatch, tmp_path, kedro_project):
-    # "kedro_project" is a pytest.fixture declared in conftest
-    project_path = tmp_path / "fake-project"
-    monkeypatch.chdir(project_path)
-
-    # modify run.py
-    with open(project_path / "src" / "fake_project" / "run.py", "a") as file_handler:
-        file_handler.write("\n # hello")
-
-    cli_runner = CliRunner()
-    result = cli_runner.invoke(cli_init)
-
-    # FIRST TEST:
-    # the command should have executed propery
-    assert result.exit_code == 0
-
-    # mlflow.yml file must be updated whatsoever
-    assert "'conf/base/mlflow.yml' successfully updated." in result.output
-    assert (project_path / "conf" / "base" / "mlflow.yml").is_file()
-
-    # the call should raise a warning and
-    # runpy file must NOT be updated
-    assert "You have modified your 'run.py' since project creation" in result.output
-    with open(project_path / "src" / "fake_project" / "run.py", "r") as file_handler:
-        runpy_txt = file_handler.read()
-        assert "MlflowPipelineHook" not in runpy_txt
-        assert "MlflowNodeHook" not in runpy_txt
+    # # check runpy file
+    # assert "'run.py' successfully updated" in result.output
+    # with open(project_path / "src" / "fake_project" / "run.py", "r") as file_handler:
+    #     runpy_txt = file_handler.read()
+    #     assert "MlflowPipelineHook" in runpy_txt
+    #     assert "MlflowNodeHook" in runpy_txt
+
+
+# def test_cli_init_with_modified_runpy(monkeypatch, tmp_path, kedro_project):
+#     # "kedro_project" is a pytest.fixture declared in conftest
+#     project_path = tmp_path / "fake-project"
+#     monkeypatch.chdir(project_path)
+
+#     # modify run.py
+#     # with open(project_path / "src" / "fake_project" / "run.py", "a") as file_handler:
+#     #     file_handler.write("\n # hello")
+
+#     cli_runner = CliRunner()
+#     result = cli_runner.invoke(cli_init)
+
+#     # FIRST TEST:
+#     # the command should have executed properly
+#     assert result.exit_code == 0
+
+#     # mlflow.yml file must be updated whatsoever
+#     assert "'conf/base/mlflow.yml' successfully updated." in result.output
+#     assert (project_path / "conf" / "base" / "mlflow.yml").is_file()
+
+#     # the call should raise a warning and
+#     # runpy file must NOT be updated
+#     assert "You have modified your 'run.py' since project creation" in result.output
+#     with open(project_path / "src" / "fake_project" / "run.py", "r") as file_handler:
+#         runpy_txt = file_handler.read()
+#         assert "MlflowPipelineHook" not in runpy_txt
+#         assert "MlflowNodeHook" not in runpy_txt


 # TODO : This is a fake test. add a test to see if ui is properly up
diff --git a/tests/framework/context/test_mlflow_context.py b/tests/framework/context/test_mlflow_context.py
index 4cf4cee6..86df5067 100644
--- a/tests/framework/context/test_mlflow_context.py
+++ b/tests/framework/context/test_mlflow_context.py
@@ -1,4 +1,5 @@
 import yaml
+from kedro.framework.context import load_context

 from kedro_mlflow.framework.context import get_mlflow_config

@@ -9,6 +10,7 @@
 def test_get_mlflow_config(mocker, tmp_path, config_dir):
     # config_with_base_mlflow_conf is a pytest.fixture in conftest
+    mocker.patch("logging.config.dictConfig")
     mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)

     def _write_yaml(filepath, config):
@@ -35,4 +37,5 @@ def _write_yaml(filepath, config):
             "node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
         },
     }
-    assert get_mlflow_config(project_path=tmp_path, env="local").to_dict() == expected
+    context = load_context(tmp_path)
+    assert get_mlflow_config(context).to_dict() == expected
diff --git a/tests/framework/hooks/test_node_hook.py b/tests/framework/hooks/test_node_hook.py
index 89182c7a..79b1e6de 100644
--- a/tests/framework/hooks/test_node_hook.py
+++ b/tests/framework/hooks/test_node_hook.py
@@ -1,10 +1,10 @@
 import mlflow
 import pytest
+import yaml
 from kedro.io import DataCatalog, MemoryDataSet
-from kedro.pipeline import node
+from kedro.pipeline import Pipeline, node
 from mlflow.tracking import MlflowClient

-from kedro_mlflow.framework.context.config import KedroMlflowConfig
 from kedro_mlflow.framework.hooks import MlflowNodeHook
 from kedro_mlflow.framework.hooks.node_hook import flatten_dict

@@ -38,6 +38,25 @@ def test_flatten_dict_nested_2_levels():
     }


+@pytest.fixture
+def dummy_run_params(tmp_path):
+    dummy_run_params = {
+        "run_id": "abcdef",
+        "project_path": tmp_path.as_posix(),
+        "env": "local",
+        "kedro_version": "0.16.0",
+        "tags": [],
+        "from_nodes": [],
+        "to_nodes": [],
+        "node_names": [],
+        "from_inputs": [],
+        "load_versions": [],
+        "pipeline_name": "my_cool_pipeline",
+        "extra_params": [],
+    }
+    return dummy_run_params
+
+
 @pytest.mark.parametrize(
     "flatten_dict_params,expected",
     [
@@ -45,18 +64,45 @@ def test_flatten_dict_nested_2_levels():
         (False, {"param1": "1", "parameters": "{'param1': 1, 'param2': 2}"}),
     ],
 )
-def test_node_hook_logging(tmp_path, mocker, flatten_dict_params, expected):
+def test_node_hook_logging(
+    tmp_path,
+    mocker,
+    monkeypatch,
+    dummy_run_params,
+    config_dir,
+    flatten_dict_params,
+    expected,
+):
+    mocker.patch("logging.config.dictConfig")
     mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
-    config = KedroMlflowConfig(
-        project_path=tmp_path,
-        node_hook_opts={"flatten_dict_params": flatten_dict_params, "sep": "-"},
-    )
-    # the function is imported inside the other file antd this is the file to patch
-    # see https://stackoverflow.com/questions/30987973/python-mock-patch-doesnt-work-as-expected-for-public-method
-    mocker.patch(
-        "kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config
-    )
+    monkeypatch.chdir(tmp_path)
+    # config = KedroMlflowConfig(
+    #     project_path=tmp_path,
+    #     node_hook_opts={"flatten_dict_params": flatten_dict_params, "sep": "-"},
+    # )
+    # # the function is imported inside the other file and this is the file to patch
+    # # see https://stackoverflow.com/questions/30987973/python-mock-patch-doesnt-work-as-expected-for-public-method
+    # mocker.patch(
+    #     "kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config
+    # )
+
+    def _write_yaml(filepath, config):
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        yaml_str = yaml.dump(config)
+        filepath.write_text(yaml_str)
+
+    _write_yaml(
+        tmp_path / "conf" / "base" / "mlflow.yml",
+        dict(
+            hooks=dict(
+                node=dict(
+                    flatten_dict_params=flatten_dict_params, recursive=False, sep="-"
+                )
+            ),
+        ),
+    )
+
     mlflow_node_hook = MlflowNodeHook()

     def fake_fun(arg1, arg2, arg3):
@@ -67,6 +113,8 @@ def fake_fun(arg1, arg2, arg3):
         inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
         outputs="out",
     )
+    dummy_pipeline = Pipeline([node_test])
+
     catalog = DataCatalog(
         {
             "params:param1": 1,
@@ -80,6 +128,9 @@ def fake_fun(arg1, arg2, arg3):
     mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
     mlflow.set_tracking_uri(mlflow_tracking_uri)
     with mlflow.start_run():
+        mlflow_node_hook.before_pipeline_run(
+            run_params=dummy_run_params, pipeline=dummy_pipeline, catalog=catalog
+        )
         mlflow_node_hook.before_node_run(
             node=node_test,
             catalog=catalog,
diff --git a/tests/framework/hooks/test_pipeline_hook.py b/tests/framework/hooks/test_pipeline_hook.py
index b9e04892..700a689d 100644
--- a/tests/framework/hooks/test_pipeline_hook.py
+++ b/tests/framework/hooks/test_pipeline_hook.py
@@ -4,13 +4,13 @@
 import pytest
 import yaml
 from kedro.extras.datasets.pickle import PickleDataSet
-from kedro.framework.context import KedroContext
+from kedro.framework.context import get_static_project_data
 from kedro.io import DataCatalog, MemoryDataSet
 from kedro.pipeline import Pipeline, node
 from kedro.runner import SequentialRunner
+from kedro.utils import load_obj
 from mlflow.tracking import MlflowClient

-from kedro_mlflow.framework.context import get_mlflow_config
 from kedro_mlflow.framework.hooks.pipeline_hook import (
     MlflowPipelineHook,
     _format_conda_env,
@@ -264,8 +264,9 @@ def test_mlflow_pipeline_hook_with_different_pipeline_types(
         run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
     )
     # test : parameters should have been logged
-    mlflow_conf = get_mlflow_config(tmp_path)
-    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
+    # mlflow_conf = get_mlflow_config(tmp_path)
+    tracking_uri = (tmp_path / "mlruns").as_uri()
+    mlflow_client = MlflowClient(tracking_uri)
     run_data = mlflow_client.get_run(run_id).data
     # all run_params are recorded as tags
     for k, v in dummy_run_params.items():
@@ -324,7 +325,7 @@ def test_generate_default_kedro_commands(default_value):
     assert _generate_kedro_command(**record_data) == expected


-def test_on_pipeline_error(tmp_path, config_dir, mocker):
+def test_on_pipeline_error(tmp_path, monkeypatch, config_dir, mocker):

     # config_dir is a global fixture in conftest that emulates
     # the root of a Kedro project
@@ -334,6 +335,8 @@ def test_on_pipeline_error(tmp_path, monkeypatch, config_dir, mocker):
     mocker.patch("logging.config.dictConfig")
     mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)

+    # monkeypatch.chdir(tmp_path)
+
     # create the extra mlflow.yml config file for the plugin
     def _write_yaml(filepath, config):
         filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -349,10 +352,11 @@ def failing_node():
         mlflow.start_run(nested=True)
         raise ValueError("Let's make this pipeline fail")

-    class DummyContextWithHook(KedroContext):
-        project_name = "fake project"
-        package_name = "fake_project"
-        project_version = "0.16.0"
+    context_path = get_static_project_data(tmp_path)["context_path"]
+
+    DummyContext = load_obj(context_path)
+
+    class DummyContextWithHook(DummyContext):

         hooks = (MlflowPipelineHook(),)
diff --git a/tests/mlflow/test_kedro_pipeline_model.py b/tests/mlflow/test_kedro_pipeline_model.py
index 427cb644..e7546d38 100644
--- a/tests/mlflow/test_kedro_pipeline_model.py
+++ b/tests/mlflow/test_kedro_pipeline_model.py
@@ -85,6 +85,9 @@ def test_model_packaging(tmp_path, pipeline_ml_obj):
     )
     assert loaded_model.predict(1) == {"predictions": 2}

+    if mlflow.active_run():
+        mlflow.end_run()
+

 # should very likely add tests to see what happens when the artifacts
 # are incorrect
diff --git a/tests/template/project/test_template_pyfiles.py b/tests/template/project/test_template_pyfiles.py
deleted file mode 100644
index 9bcfeaaf..00000000
--- a/tests/template/project/test_template_pyfiles.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from pathlib import Path
-
-import black
-import pytest
-from click.testing import CliRunner
-from flake8.api.legacy import get_style_guide
-from isort import SortImports
-from kedro import __file__ as KEDRO_PATH
-
-from kedro_mlflow.framework.cli.cli import TEMPLATE_FOLDER_PATH
-from kedro_mlflow.framework.cli.cli_utils import (
-    render_jinja_template,
-    write_jinja_template,
-)
-
-
-@pytest.fixture
-def template_runpy(tmp_path):
-    # the goal is to discover all potential ".py" files
-    # but for now there is only "run.py"
-    # # this is rather a safeguard for further add
-    raw_template_path = TEMPLATE_FOLDER_PATH / "run.py"
-    rendered_template_path = tmp_path / raw_template_path.name
-    tags = {
-        "project_name": "This is a fake project",
-        "python_package": "fake_project",
-        "kedro_version": "0.16.0",
-    }
-
-    write_jinja_template(
-        src=raw_template_path, dst=rendered_template_path, is_cookiecutter=True, **tags
-    )
-    return rendered_template_path.as_posix()
-
-
-def test_number_of_templates_py():
-    # for now, there is only run.py as a template
-    # if new .py templates are added, a fixture must be created
-    # and it must be added to pytest.mark.parametrize for
-    # black, flake8 and isort tests
-    assert len(list(TEMPLATE_FOLDER_PATH.glob("**/*.py"))) == 1
-
-
-@pytest.mark.parametrize("python_file", [(pytest.lazy_fixture("template_runpy"))])
-def test_check_black(python_file):
-    # IMPORTANT : to ensure black will not change the file,
-    # we need to keep line 50 on 1 line (and not 2).
-    # Black complains in the non-rendered form because the line is too long,
-    # but when the file is rendered, the tags are replaced by much shorter names and
-    # black wants to force them on the same line.
-    # TODO : open an issue to kedro
-    cli_runner = CliRunner()
-    result_black = cli_runner.invoke(black.main, python_file)
-    assert "1 file left unchanged" in result_black.output
-
-
-@pytest.mark.parametrize("python_file", [(pytest.lazy_fixture("template_runpy"))])
-def test_check_flake8(python_file):
-    style_guide = get_style_guide()
-    report = style_guide.check_files([python_file])
-    assert report.total_errors == 0  # no errors must be found
-
-
-@pytest.mark.parametrize("python_file", [(pytest.lazy_fixture("template_runpy"))])
-def test_check_isort(python_file):
-    result_isort = SortImports(python_file)
-    result_isort.output
-
-
-def test_runpy_template_is_consistent_with_kedro():
-    tags = {
-        "project_name": "This is a fake project",
-        "python_package": "fake_project",
-        "kedro_version": "0.16.0",
-    }
-    kedro_runpy_path = (
-        Path(KEDRO_PATH).parent
-        / "templates"
-        / "project"
-        / "{{ cookiecutter.repo_name }}"
-        / "src"
-        / "{{ cookiecutter.python_package }}"
-        / "run.py"
-    )
-
-    kedro_mlflow_runpy = render_jinja_template(
-        TEMPLATE_FOLDER_PATH / "run.py", is_cookiecutter=True, **tags
-    )
-    kedro_runpy = render_jinja_template(kedro_runpy_path, is_cookiecutter=True, **tags)
-
-    # remove the 2 specific additions for kedro_mlflow
-    kedro_mlflow_runpy = kedro_mlflow_runpy.replace(
-        "from kedro_mlflow.framework.hooks import MlflowNodeHook, MlflowPipelineHook\n\n",
-        "",
-    )
-    kedro_mlflow_runpy = kedro_mlflow_runpy.replace(
-        "    hooks = (\n        MlflowNodeHook(),\n        MlflowPipelineHook(),\n    )\n",
-        "",
-    )
-
-    # remove wrong black linting in current (0.16.1) kedro version
-    # TODO: open an issue to kedro and remove this once the correction is made
-    # this was fixed in kedro==0.16.3!
-    # kedro_runpy = kedro_runpy.replace(
-    #     "\n        package_name=Path(__file__).resolve().parent.name",
-    #     "        package_name=Path(__file__).resolve().parent.name",
-    # )
-
-    assert kedro_mlflow_runpy == kedro_runpy

From 9aef70fd14175264b38876e2fa4374e8fb761b35 Mon Sep 17 00:00:00 2001
From: Takieddine Kadiri
Date: Sun, 11 Oct 2020 18:52:37 +0200
Subject: [PATCH 2/3] Add test pipeline_run_getting_configs and remove
 dummy_run

---
 tests/conftest.py                       |  12 ++-
 tests/dummy_run.py                      |  13 ---
 tests/framework/hooks/test_node_hook.py | 111 +++++++++++++++++-------
 3 files changed, 92 insertions(+), 44 deletions(-)
 delete mode 100644 tests/dummy_run.py

diff --git a/tests/conftest.py b/tests/conftest.py
index 503e3a44..f4e79adb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from shutil import copyfile
 from typing import Dict

 import pytest
@@ -71,4 +70,13 @@ def config_dir(tmp_path):

     os.mkdir(tmp_path / "src")
     os.mkdir(tmp_path / "src" / "dummy_package")
-    copyfile("tests/dummy_run.py", tmp_path / "src" / "dummy_package" / "run.py")
+    with open(tmp_path / "src" / "dummy_package" / "run.py", "w") as f:
+        f.writelines(
+            [
+                "from kedro.framework.context import KedroContext\n",
+                "class ProjectContext(KedroContext):\n",
+                "    project_name = 'dummy_package'\n",
+                "    project_version = '0.16.5'\n",
+                "    package_name = 'dummy_package'\n",
+            ]
+        )
diff --git a/tests/dummy_run.py b/tests/dummy_run.py
deleted file mode 100644
index 882beb6d..00000000
--- a/tests/dummy_run.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""Application entry point."""
-from kedro.framework.context import KedroContext
-
-
-class ProjectContext(KedroContext):
-    """Users can override the remaining methods from the parent class here,
-    or create new ones (e.g. as required by plugins)
-    """
-
-    project_name = "dummy_package"
-    # `project_version` is the version of kedro used to generate the project
-    project_version = "0.16.5"
-    package_name = "dummy_package"
diff --git a/tests/framework/hooks/test_node_hook.py b/tests/framework/hooks/test_node_hook.py
index 79b1e6de..8b72c857 100644
--- a/tests/framework/hooks/test_node_hook.py
+++ b/tests/framework/hooks/test_node_hook.py
@@ -1,3 +1,6 @@
+from pathlib import Path
+from typing import Dict
+
 import mlflow
 import pytest
 import yaml
@@ -9,6 +12,12 @@
 from kedro_mlflow.framework.hooks.node_hook import flatten_dict


+def _write_yaml(filepath: Path, config: Dict):
+    filepath.parent.mkdir(parents=True, exist_ok=True)
+    yaml_str = yaml.dump(config)
+    filepath.write_text(yaml_str)
+
+
 def test_flatten_dict_non_nested():
     d = dict(a=1, b=2)
     assert flatten_dict(d=d, recursive=True, sep=".") == d
@@ -41,10 +50,10 @@ def test_flatten_dict_nested_2_levels():
 @pytest.fixture
 def dummy_run_params(tmp_path):
     dummy_run_params = {
-        "run_id": "abcdef",
+        "run_id": "",
         "project_path": tmp_path.as_posix(),
         "env": "local",
-        "kedro_version": "0.16.0",
+        "kedro_version": "0.16.5",
         "tags": [],
         "from_nodes": [],
         "to_nodes": [],
@@ -57,6 +66,68 @@ def dummy_run_params(tmp_path):
     return dummy_run_params


+@pytest.fixture
+def dummy_node():
+    def fake_fun(arg1, arg2, arg3):
+        return None
+
+    node_test = node(
+        func=fake_fun,
+        inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
+        outputs="out",
+    )
+
+    return node_test
+
+
+@pytest.fixture
+def dummy_pipeline(dummy_node):
+
+    dummy_pipeline = Pipeline([dummy_node])
+
+    return dummy_pipeline
+
+
+@pytest.fixture
+def dummy_catalog():
+
+    catalog = DataCatalog(
+        {
+            "params:param1": 1,
+            "foo": MemoryDataSet(),
+            "bar": MemoryDataSet(),
+            "parameters": {"param1": 1, "param2": 2},
+        }
+    )
+
+    return catalog
+
+
+def test_pipeline_run_getting_configs(
+    tmp_path, config_dir, monkeypatch, dummy_run_params, dummy_pipeline, dummy_catalog
+):
+
+    monkeypatch.chdir(tmp_path)
+
+    _write_yaml(
+        tmp_path / "conf" / "base" / "mlflow.yml",
+        dict(
+            hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
+        ),
+    )
+
+    mlflow_node_hook = MlflowNodeHook()
+    mlflow_node_hook.before_pipeline_run(
+        run_params=dummy_run_params, pipeline=dummy_pipeline, catalog=dummy_catalog
+    )
+
+    assert (
+        mlflow_node_hook.flatten,
+        mlflow_node_hook.recursive,
+        mlflow_node_hook.sep,
+    ) == (True, False, "-")
+
+
 @pytest.mark.parametrize(
     "flatten_dict_params,expected",
     [
@@ -69,6 +140,9 @@ def test_node_hook_logging(
     mocker,
     monkeypatch,
     dummy_run_params,
+    dummy_catalog,
+    dummy_pipeline,
+    dummy_node,
     config_dir,
     flatten_dict_params,
     expected,
@@ -87,11 +161,6 @@ def test_node_hook_logging(
     #     "kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config
     # )

-    def _write_yaml(filepath, config):
-        filepath.parent.mkdir(parents=True, exist_ok=True)
-        yaml_str = yaml.dump(config)
-        filepath.write_text(yaml_str)
-
     _write_yaml(
         tmp_path / "conf" / "base" / "mlflow.yml",
         dict(
@@ -105,35 +174,19 @@ def _write_yaml(filepath, config):

     mlflow_node_hook = MlflowNodeHook()

-    def fake_fun(arg1, arg2, arg3):
-        return None
-
-    node_test = node(
-        func=fake_fun,
-        inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
-        outputs="out",
-    )
-    dummy_pipeline = Pipeline([node_test])
-
-    catalog = DataCatalog(
-        {
-            "params:param1": 1,
-            "foo": MemoryDataSet(),
-            "bar": MemoryDataSet(),
-            "parameters": {"param1": 1, "param2": 2},
-        }
-    )
-
-    node_inputs = {v: catalog._data_sets.get(v) for k, v in node_test._inputs.items()}
+    node_inputs = {
+        v: dummy_catalog._data_sets.get(v) for k, v in dummy_node._inputs.items()
+    }

     mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
     mlflow.set_tracking_uri(mlflow_tracking_uri)
     with mlflow.start_run():
         mlflow_node_hook.before_pipeline_run(
-            run_params=dummy_run_params, pipeline=dummy_pipeline, catalog=catalog
+            run_params=dummy_run_params, pipeline=dummy_pipeline, catalog=dummy_catalog
         )
         mlflow_node_hook.before_node_run(
-            node=node_test,
-            catalog=catalog,
+            node=dummy_node,
+            catalog=dummy_catalog,
             inputs=node_inputs,
             is_async=False,
             run_id="132",

From 446af5aaf1adf34785058611770a7589343fe3fb Mon Sep 17 00:00:00 2001
From: Takieddine Kadiri
Date: Sun, 11 Oct 2020 19:08:53 +0200
Subject: [PATCH 3/3] remove unused utils

---
 kedro_mlflow/utils.py | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/kedro_mlflow/utils.py b/kedro_mlflow/utils.py
index a21393e1..824c3a93 100644
--- a/kedro_mlflow/utils.py
+++ b/kedro_mlflow/utils.py
@@ -1,10 +1,5 @@
-import re
 from pathlib import Path
-from typing import Dict, Union
-
-import yaml
-from kedro import __version__ as KEDRO_VERSION
-from kedro.framework.context import load_context
+from typing import Union

 KEDRO_YML = ".kedro.yml"

@@ -21,37 +16,6 @@ def _is_kedro_project(project_path: Union[str, Path, None] = None) -> bool:
     return flag


-def _get_project_globals(project_path: Union[str, Path, None] = None) -> Dict[str, str]:
-
-    if project_path is None:
-        project_path = Path.cwd()
-    # for the project name, we have to load the context : it is the only place where it is recorded
-    project_context = load_context(project_path)
-    project_name = project_context.project_name
-
-    kedro_yml = _read_kedro_yml(project_path)
-    python_package = re.search(
-        pattern=r"^(\w+)(?=\.)", string=kedro_yml["context_path"]
-    ).group(1)
-    context_path = kedro_yml["context_path"].replace(".", "/")
-    return dict(
-        context_path=context_path,
-        project_name=project_name,
-        python_package=python_package,
-        kedro_version=KEDRO_VERSION,
-    )
-
-
-def _read_kedro_yml(project_path: Union[str, Path, None] = None) -> Dict[str, str]:
-    project_path = _validate_project_path(project_path)
-    kedro_yml_path = project_path / KEDRO_YML
-
-    with open(kedro_yml_path, mode="r", encoding="utf-8") as file_handler:
-        kedro_yml = yaml.safe_load(file_handler.read())
-
-    return kedro_yml
-
-
 def _validate_project_path(project_path: Union[str, Path, None] = None) -> Path:
     if project_path is None:
         project_path = Path.cwd()
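---

A minimal usage sketch of the reworked API (an illustration, not part of the patches: the `Path.cwd()` project root and `env="local"` below are assumptions standing in for any kedro 0.16.x project run from its root with a `conf/base/mlflow.yml` in place). After this series, `get_mlflow_config` takes a `KedroContext` instead of a raw project path, and both hooks register themselves through the `kedro.hooks` entry points declared in setup.py, so nothing has to be added to `run.py`:

    from pathlib import Path

    from kedro.framework.context import load_context

    from kedro_mlflow.framework.context import get_mlflow_config

    # load the project context: get_mlflow_config now reads mlflow.yml
    # through context.config_loader instead of building its own ConfigLoader
    context = load_context(project_path=Path.cwd(), env="local")
    mlflow_config = get_mlflow_config(context)

    # the parsed configuration exposes the tracking uri that
    # MlflowPipelineHook.before_pipeline_run passes to mlflow.set_tracking_uri
    print(mlflow_config.mlflow_tracking_uri)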