From 1c3eec7a48c03a28d068d5026bbc45aab0e563fd Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Tue, 6 Dec 2022 12:04:54 +0200 Subject: [PATCH 01/14] Add the hook and the helper classes --- .../plugin/notebook/notebook_based_step.py | 168 ++++++++++++++++++ .../vdk/plugin/notebook/notebook_plugin.py | 46 +++-- .../vdk/plugin/notebook/notebook_reader.py | 90 ++++++++++ .../src/vdk/plugin/notebook/notebook_step.py | 30 ++++ 4 files changed, 309 insertions(+), 25 deletions(-) create mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py create mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py create mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py new file mode 100644 index 0000000000..963e4bec9c --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py @@ -0,0 +1,168 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + +import importlib.util +import inspect +import logging +import pathlib +import sys +import traceback +from typing import Callable +from typing import List + +from vdk.api.job_input import IJobInput +from vdk.internal.core import errors +from vdk.internal.core.errors import SkipRemainingStepsException +from vdk.plugin.notebook.notebook_step import NotebookStep + +log = logging.getLogger(__name__) + + +# consists may duplicates of +# https://github.com/vmware/versatile-data-kit/blob/main/projects/vdk-core/src/vdk/internal/builtin_plugins/run/file_based_step.py + + +class JobNotebookLocator: + """ + Locate the data job files that would be executed by us. + """ + + @staticmethod + def get_notebook_files(directory: pathlib.Path) -> List[pathlib.Path]: + """Locates the files in a directory, that are supported for execution. + Files supported for execution are: .ipynb + Other files in the directory are ignored. + :return: List of files from the directory that supported for execution, sorted alphabetically by name. + :rtype: :class:`.list` + """ + script_files = [ + x for x in directory.iterdir() if (x.name.lower().endswith(".ipynb")) + ] + script_files.sort(key=lambda x: x.name) + log.debug(f"Script files of {directory} are {script_files}") + return script_files + + +class NotebookStepFuncFactory: + """ + Implementations of runner_func for running Notebook steps + """ + + @staticmethod + def run_sql_step(step: NotebookStep, job_input: IJobInput) -> bool: + job_input.execute_query(step.code) + return True + + @staticmethod + def run_python_step(step: NotebookStep, job_input: IJobInput) -> bool: + try: + sys.path.insert(0, str(step.job_dir)) + success = False + try: + log.debug("Loading %s ..." % step.name) + spec = importlib.util.spec_from_loader("nb", loader=None) + module = importlib.util.module_from_spec(spec) + exec(step.code, module.__dict__) + log.debug("Loading %s SUCCESS" % step.name) + + for _, func in inspect.getmembers(module, inspect.isfunction): + if func.__name__ == "run": + try: + log.info("Entering %s#run(...) ..." % step.name) + NotebookStepFuncFactory.invoke_run_function( + func, job_input, step + ) + success = True + return True + finally: + if success: + log.info("Exiting %s#run(...) SUCCESS" % step.name) + else: + log.error("Exiting %s#run(...) FAILURE" % step.name) + log.warn( + "File %s does not contain a valid run() method. Nothing to execute. Skipping %s," + + " and continuing with other files (if present).", + step.file_path.name, + step.file_path.name, + ) + return success + except SyntaxError as e: + log.info("Loading %s FAILURE" % step.name) + errors.log_and_rethrow( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=log, + what_happened=f"Failed loading job sources of {step.name} from {step.file_path.name}", + why_it_happened=f"{e.__class__.__name__} at line {e.lineno} of {step.name}" + f": {e.args[0]}", + consequences=f"Current Step {step.name} from {step.file_path}" + f"will fail, and as a result the whole Data Job will fail. ", + countermeasures=f"Please, check the {step.file_path.name} file again for syntax errors", + exception=e, + wrap_in_vdk_error=True, + ) + except Exception as e: + cl, exc, tb = sys.exc_info() + line_number = traceback.extract_tb(tb)[-1][1] + errors.log_and_rethrow( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=log, + what_happened=f"Failed loading job sources of {step.name} from {step.file_path.name}", + why_it_happened=f"{e.__class__.__name__} at line {line_number} of {step.name}" + f": {e.args[0]}", + consequences=f"Current Step {step.name} from {step.file_path}" + f"will fail, and as a result the whole Data Job will fail. ", + countermeasures=f"Please, check the {step.file_path.name} file again for errors", + exception=e, + wrap_in_vdk_error=True, + ) + finally: + sys.path.remove(str(step.job_dir)) + + @staticmethod + def invoke_run_function(func: Callable, job_input: IJobInput, step: NotebookStep): + full_arg_spec = inspect.getfullargspec(func) + parameter_names = full_arg_spec[0] + possible_arguments = { + "job_input": job_input, + } + # enable plugins to add different types of job input + actual_arguments = { + arg_name: arg_value + for arg_name, arg_value in possible_arguments.items() + if arg_name in parameter_names + } + if actual_arguments: + try: + func(**actual_arguments) + except SkipRemainingStepsException as e: + log.debug(e) + raise e + except BaseException as e: + from vdk.internal.builtin_plugins.run.job_input_error_classifier import ( + whom_to_blame, + ) + + errors.log_and_rethrow( + to_be_fixed_by=whom_to_blame(e, __file__, None), + log=log, + what_happened=f"Data Job step {step.name} from {step.file_path} completed with error.", + why_it_happened=errors.MSG_WHY_FROM_EXCEPTION(e), + consequences="I will not process the remaining steps (if any), " + "and this Data Job execution will be marked as failed.", + countermeasures="See exception and fix the root cause, so that the exception does " + "not appear anymore.", + exception=e, + wrap_in_vdk_error=True, + ) + else: + errors.log_and_throw( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=log, + what_happened=f"I'm trying to call method 'run' and failed.", + why_it_happened=f"Method is missing at least one job input parameter to be passed", + consequences=f"Current Step {step.name} from {step.file_path}" + f"will fail, and as a result the whole Data Job will fail. ", + countermeasures="Make sure that you have specified a job input parameter in the signature of the " + "run method. " + f"Possible parameters of run function are: {list(possible_arguments.keys())}.", + ) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py index 44642f6d49..56afebeb51 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py @@ -1,33 +1,29 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 -from typing import List -from vdk.api.plugin.hook_markers import hookimpl -from vdk.api.plugin.plugin_registry import IPluginRegistry -from vdk.internal.core.config import ConfigurationBuilder -from vdk.internal.core.context import CoreContext - -""" -Include the plugins implementation. For example: -""" +from __future__ import annotations +import logging -class DummyPlugin: - @hookimpl(tryfirst=True) - def vdk_configure(self, config_builder: ConfigurationBuilder): - config_builder.add( - key="dummy_config_key", - default_value="dummy", - description=""" - Dummy configuration - """, - ) +from vdk.api.plugin.hook_markers import hookimpl +from vdk.internal.builtin_plugins.run.job_context import JobContext +from vdk.plugin.notebook.notebook_based_step import JobNotebookLocator +from vdk.plugin.notebook.notebook_reader import Notebook +from vdk.plugin.notebook.notebook_reader import NotebookReader - @hookimpl - def vdk_initialize(self, context: CoreContext): - print("initializing dummy") +log = logging.getLogger(__name__) -@hookimpl -def vdk_start(plugin_registry: IPluginRegistry, command_line_args: List): - plugin_registry.load_plugin_with_hooks_impl(DummyPlugin(), "DummyPlugin") +@hookimpl() +def initialize_job(context: JobContext): + if context.job_directory is None: + log.info( + "Data Job directory is not specified. Default job initialization will be skipped." + ) + return + file_locator: JobNotebookLocator = JobNotebookLocator() + notebook_files = file_locator.get_notebook_files(context.job_directory) + if len(notebook_files) >= 1: + for file_path in notebook_files: + nb: Notebook = Notebook(file_path) + NotebookReader.read_notebook_and_save_steps(nb, context) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py new file mode 100644 index 0000000000..2f9ce1ef70 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py @@ -0,0 +1,90 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + +import json +import logging +from pathlib import Path + +from vdk.internal.builtin_plugins.run.file_based_step import TYPE_PYTHON +from vdk.internal.builtin_plugins.run.file_based_step import TYPE_SQL +from vdk.internal.builtin_plugins.run.job_context import JobContext +from vdk.internal.core import errors +from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory +from vdk.plugin.notebook.notebook_step import NotebookStep + +log = logging.getLogger(__name__) + + +class Notebook: + """ + Given a notebook file locates the cells with "vdk" tag and saves them. + Files supported for reading are: ipynb. + Other cells are ignored. + """ + + def __init__(self, file_path: Path): + try: + content = json.loads(file_path.read_text()) + self.cells = [ + cell + for cell in content["cells"] + if cell["cell_type"] == "code" + and "vdk" in cell["metadata"].get("tags", {}) + ] + self.file_path = file_path + except json.JSONDecodeError as e: + errors.log_and_rethrow( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=log, + what_happened=f"Failed to read the {file_path.name} file.", + why_it_happened=f"The provided {file_path.name} cannot be loaded into json format and " + f"cannot be read as a Jupyter notebook", + consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, + countermeasures=f"Check the {file_path.name} format again", + exception=e, + wrap_in_vdk_error=True, + ) + + +class NotebookReader: + @staticmethod + def read_notebook_and_save_steps(notebook: Notebook, context: JobContext): + python_cells = "" + step_index = 0 + for cell in notebook.cells: + if cell["source"][0].startswith("%sql"): + code = "".join(cell["source"]) + code.replace(";", "") + step = NotebookStep( + name="".join( + [ + notebook.file_path.name.replace(".ipynb", "_"), + str(step_index), + ] + ), + type=TYPE_SQL, + runner_func=NotebookStepFuncFactory.run_sql_step, + file_path=notebook.file_path, + job_dir=context.job_directory, + code=code.replace("%sql", ""), + ) + step_index += 1 + context.step_builder.add_step(step) + else: + python_cells += "\n" + "".join(cell["source"]) + if "def run(" in cell["source"][0]: + step = NotebookStep( + name="".join( + [ + notebook.file_path.name.replace(".ipynb", "_"), + str(step_index), + ] + ), + type=TYPE_PYTHON, + runner_func=NotebookStepFuncFactory.run_python_step, + file_path=notebook.file_path, + job_dir=context.job_directory, + code=python_cells, + ) + step_index += 1 + context.step_builder.add_step(step) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py new file mode 100644 index 0000000000..9b330f1a5d --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py @@ -0,0 +1,30 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Callable + +from vdk.api.job_input import IJobInput +from vdk.internal.builtin_plugins.run.step import Step + +log = logging.getLogger(__name__) + +# The function accept NotebookStep (below class) and IJobInput and +# return true if the step has been executed and false if it is not (valid) executable step. +# On error it's expected to raise an exception. +NotebookStepFunction = Callable[["NotebookStep", IJobInput], bool] + + +@dataclass +class NotebookStep(Step): + """ + A notebook step that will be executed when running a data job. + """ + + def __init__(self, name, type, runner_func, file_path, job_dir, code, parent=None): + super().__init__(name, type, runner_func, file_path, job_dir, parent) + self.runner_func = runner_func + self.code = code From 9da07d1088cfef49e3ca3d0cd63fa1113d494e81 Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Tue, 6 Dec 2022 13:57:13 +0200 Subject: [PATCH 02/14] Adding the tests and needed imports to the requirements.txt --- .../vdk-plugins/vdk-notebook/requirements.txt | 3 + .../.ipynb_checkpoints/steps-checkpoint.ipynb | 90 ++++++++++++++++ .../jobs/rest-api-job-code-error/config.ini | 61 +++++++++++ .../rest-api-job-code-error/requirements.txt | 6 ++ .../jobs/rest-api-job-code-error/steps.ipynb | 90 ++++++++++++++++ .../.ipynb_checkpoints/steps-checkpoint.ipynb | 89 ++++++++++++++++ .../rest-api-job-fail-syntax-error/config.ini | 61 +++++++++++ .../requirements.txt | 6 ++ .../steps.ipynb | 89 ++++++++++++++++ .../.ipynb_checkpoints/steps-checkpoint.ipynb | 88 +++++++++++++++ .../tests/jobs/rest-api-job/config.ini | 61 +++++++++++ .../tests/jobs/rest-api-job/requirements.txt | 5 + .../tests/jobs/rest-api-job/steps.ipynb | 100 ++++++++++++++++++ .../vdk-notebook/tests/test_plugin.py | 53 +++++++--- 14 files changed, 786 insertions(+), 16 deletions(-) create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/requirements.txt create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/steps.ipynb create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/requirements.txt create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/requirements.txt create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb diff --git a/projects/vdk-plugins/vdk-notebook/requirements.txt b/projects/vdk-plugins/vdk-notebook/requirements.txt index 900e1b689f..5b66d1c9a0 100644 --- a/projects/vdk-plugins/vdk-notebook/requirements.txt +++ b/projects/vdk-plugins/vdk-notebook/requirements.txt @@ -1,7 +1,10 @@ # this file is used to provide testing requirements # for requirements (dependencies) needed during and after installation of the plugin see (and update) setup.py install_requires section +click +requests pytest vdk-core vdk-test-utils +vdk-sqlite \ No newline at end of file diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb new file mode 100644 index 0000000000..1c8849ee89 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " hello = \"hello\"\n", + " hello1 = hello + 1\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini new file mode 100644 index 0000000000..4637c1d713 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini @@ -0,0 +1,61 @@ +; Supported format: https://docs.python.org/3/library/configparser.html#supported-ini-file-structure + +; This is the only file required to deploy a Data Job. +; Read more to understand what each option means: + +; Information about the owner of the Data Job +[owner] + +; Team is a way to group Data Jobs that belonged to the same team. +team = jupyter-test-jobs + +; Configuration related to running data jobs +[job] +; For format see https://en.wikipedia.org/wiki/Cron +; The cron expression is evaluated in UTC time. +; If it is time for a new job run and the previous job run hasn’t finished yet, +; the cron job waits until the previous execution has finished. +schedule_cron = 11 23 5 8 1 + +; Who will be contacted and on what occasion +[contacts] + +; Specifies the time interval (in minutes) that a job execution is allowed to be delayed +; from its scheduled time before a notification email is sent. The default is 240. +; notification_delay_period_minutes=240 + +; Specifies whether to enable or disable the email notifications for each data job run attempt. +; The default value is true. +; enable_attempt_notifications=true + +; Specifies whether to enable or disable email notifications per data job execution and execution delays. +; The default value is true. +;enable_execution_notifications=true + +; The [contacts] properties below use semicolon-separated list of email addresses that will be notified with email message on a given condition. +; You can also provide email address linked to your Slack account in order to receive Slack messages. +; To generate Slack linked email address follow the steps here: +; https://get.slack.help/hc/en-us/articles/206819278-Send-emails-to-Slack#connect-the-email-app-to-your-workspace + +; Semicolon-separated list of email addresses to be notified on job execution failure caused by user code or user configuration why. +; For example: if the job contains an SQL script with syntax error. +; notified_on_job_failure_user_error=example@vmware.com +notified_on_job_failure_user_error= + +; Semicolon-separated list of email addresses to be notified on job execution failure caused by a platform why. +; notified_on_job_failure_platform_error=example@example.com; example2@example.com +notified_on_job_failure_platform_error= + +; Semicolon-separated list of email addresses to be notified on job execution success. +notified_on_job_success= + +; Semicolon-separated list of email addresses to be notified of job deployment outcome. +; Notice that if this file is malformed (file structure is not as per https://docs.python.org/3/library/configparser.html#supported-ini-file-structure), +; then an email notification will NOT be sent to the recipients specified here. +notified_on_job_deploy= + +[vdk] +; Key value pairs of any configuration options that can be passed to vdk. +; For possible options in your vdk installation execute command vdk config-help +db_default_type=SQLITE +ingest_method_default=SQLITE diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/requirements.txt b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/requirements.txt new file mode 100644 index 0000000000..09c0252234 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/requirements.txt @@ -0,0 +1,6 @@ +# Python jobs can specify extra library dependencies in requirements.txt file. +# See https://pip.readthedocs.io/en/stable/user_guide/#requirements-files +# The file is optional and can be deleted if no extra library dependencies are necessary. + + +requests diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/steps.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/steps.ipynb new file mode 100644 index 0000000000..1c8849ee89 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/steps.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " hello = \"hello\"\n", + " hello1 = hello + 1\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb new file mode 100644 index 0000000000..4d3b0e4575 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " ss\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini new file mode 100644 index 0000000000..4637c1d713 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini @@ -0,0 +1,61 @@ +; Supported format: https://docs.python.org/3/library/configparser.html#supported-ini-file-structure + +; This is the only file required to deploy a Data Job. +; Read more to understand what each option means: + +; Information about the owner of the Data Job +[owner] + +; Team is a way to group Data Jobs that belonged to the same team. +team = jupyter-test-jobs + +; Configuration related to running data jobs +[job] +; For format see https://en.wikipedia.org/wiki/Cron +; The cron expression is evaluated in UTC time. +; If it is time for a new job run and the previous job run hasn’t finished yet, +; the cron job waits until the previous execution has finished. +schedule_cron = 11 23 5 8 1 + +; Who will be contacted and on what occasion +[contacts] + +; Specifies the time interval (in minutes) that a job execution is allowed to be delayed +; from its scheduled time before a notification email is sent. The default is 240. +; notification_delay_period_minutes=240 + +; Specifies whether to enable or disable the email notifications for each data job run attempt. +; The default value is true. +; enable_attempt_notifications=true + +; Specifies whether to enable or disable email notifications per data job execution and execution delays. +; The default value is true. +;enable_execution_notifications=true + +; The [contacts] properties below use semicolon-separated list of email addresses that will be notified with email message on a given condition. +; You can also provide email address linked to your Slack account in order to receive Slack messages. +; To generate Slack linked email address follow the steps here: +; https://get.slack.help/hc/en-us/articles/206819278-Send-emails-to-Slack#connect-the-email-app-to-your-workspace + +; Semicolon-separated list of email addresses to be notified on job execution failure caused by user code or user configuration why. +; For example: if the job contains an SQL script with syntax error. +; notified_on_job_failure_user_error=example@vmware.com +notified_on_job_failure_user_error= + +; Semicolon-separated list of email addresses to be notified on job execution failure caused by a platform why. +; notified_on_job_failure_platform_error=example@example.com; example2@example.com +notified_on_job_failure_platform_error= + +; Semicolon-separated list of email addresses to be notified on job execution success. +notified_on_job_success= + +; Semicolon-separated list of email addresses to be notified of job deployment outcome. +; Notice that if this file is malformed (file structure is not as per https://docs.python.org/3/library/configparser.html#supported-ini-file-structure), +; then an email notification will NOT be sent to the recipients specified here. +notified_on_job_deploy= + +[vdk] +; Key value pairs of any configuration options that can be passed to vdk. +; For possible options in your vdk installation execute command vdk config-help +db_default_type=SQLITE +ingest_method_default=SQLITE diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/requirements.txt b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/requirements.txt new file mode 100644 index 0000000000..09c0252234 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/requirements.txt @@ -0,0 +1,6 @@ +# Python jobs can specify extra library dependencies in requirements.txt file. +# See https://pip.readthedocs.io/en/stable/user_guide/#requirements-files +# The file is optional and can be deleted if no extra library dependencies are necessary. + + +requests diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb new file mode 100644 index 0000000000..4d3b0e4575 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " ss\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb new file mode 100644 index 0000000000..e152ec04fb --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini new file mode 100644 index 0000000000..4637c1d713 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini @@ -0,0 +1,61 @@ +; Supported format: https://docs.python.org/3/library/configparser.html#supported-ini-file-structure + +; This is the only file required to deploy a Data Job. +; Read more to understand what each option means: + +; Information about the owner of the Data Job +[owner] + +; Team is a way to group Data Jobs that belonged to the same team. +team = jupyter-test-jobs + +; Configuration related to running data jobs +[job] +; For format see https://en.wikipedia.org/wiki/Cron +; The cron expression is evaluated in UTC time. +; If it is time for a new job run and the previous job run hasn’t finished yet, +; the cron job waits until the previous execution has finished. +schedule_cron = 11 23 5 8 1 + +; Who will be contacted and on what occasion +[contacts] + +; Specifies the time interval (in minutes) that a job execution is allowed to be delayed +; from its scheduled time before a notification email is sent. The default is 240. +; notification_delay_period_minutes=240 + +; Specifies whether to enable or disable the email notifications for each data job run attempt. +; The default value is true. +; enable_attempt_notifications=true + +; Specifies whether to enable or disable email notifications per data job execution and execution delays. +; The default value is true. +;enable_execution_notifications=true + +; The [contacts] properties below use semicolon-separated list of email addresses that will be notified with email message on a given condition. +; You can also provide email address linked to your Slack account in order to receive Slack messages. +; To generate Slack linked email address follow the steps here: +; https://get.slack.help/hc/en-us/articles/206819278-Send-emails-to-Slack#connect-the-email-app-to-your-workspace + +; Semicolon-separated list of email addresses to be notified on job execution failure caused by user code or user configuration why. +; For example: if the job contains an SQL script with syntax error. +; notified_on_job_failure_user_error=example@vmware.com +notified_on_job_failure_user_error= + +; Semicolon-separated list of email addresses to be notified on job execution failure caused by a platform why. +; notified_on_job_failure_platform_error=example@example.com; example2@example.com +notified_on_job_failure_platform_error= + +; Semicolon-separated list of email addresses to be notified on job execution success. +notified_on_job_success= + +; Semicolon-separated list of email addresses to be notified of job deployment outcome. +; Notice that if this file is malformed (file structure is not as per https://docs.python.org/3/library/configparser.html#supported-ini-file-structure), +; then an email notification will NOT be sent to the recipients specified here. +notified_on_job_deploy= + +[vdk] +; Key value pairs of any configuration options that can be passed to vdk. +; For possible options in your vdk installation execute command vdk config-help +db_default_type=SQLITE +ingest_method_default=SQLITE diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/requirements.txt b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/requirements.txt new file mode 100644 index 0000000000..15d8cd6f62 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/requirements.txt @@ -0,0 +1,5 @@ +# Python jobs can specify extra library dependencies in requirements.txt file. +# See https://pip.readthedocs.io/en/stable/user_guide/#requirements-files +# The file is optional and can be deleted if no extra library dependencies are necessary. + +requests diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb new file mode 100644 index 0000000000..309d5fb0a9 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ], + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ], + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ], + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ], + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py index b52f7f4646..d31d5f96d7 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py @@ -1,30 +1,51 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 + import os +import unittest from unittest import mock from click.testing import Result from vdk.plugin.notebook import notebook_plugin +from vdk.plugin.sqlite import sqlite_plugin from vdk.plugin.test_utils.util_funcs import cli_assert_equal from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner from vdk.plugin.test_utils.util_funcs import jobs_path_from_caller_directory -""" -This is a sample test file showing a recommended way to test new plugins. -A good way to test a new plugin is how it would be used in the command that it extends. -""" +@mock.patch.dict( + os.environ, + { + "VDK_DB_DEFAULT_TYPE": "SQLITE", + "VDK_INGEST_METHOD_DEFAULT": "sqlite", + }, +) +class JupyterTests(unittest.TestCase): + def setUp(self) -> None: + self.__runner = CliEntryBasedTestRunner(notebook_plugin, sqlite_plugin) + + def test_notebook_plugin(self) -> None: + result: Result = self.__runner.invoke( + ["run", jobs_path_from_caller_directory("rest-api-job")] + ) + cli_assert_equal(0, result) + actual_rs: Result = self.__runner.invoke( + ["sqlite-query", "--query", f"SELECT * FROM rest_target_table"] + ) + assert actual_rs.stdout == ( + " userId id title completed\n" + "-------- ---- ------------------ -----------\n" + " 1 1 delectus aut autem 0\n" + ) -def test_dummy(): - with mock.patch.dict( - os.environ, - { - # mock the vdk configuration needed for our test - }, - ): - # CliEntryBasedTestRunner (provided by vdk-test-utils) gives a away to simulate vdk command - # and mock large parts of it - e.g passed our own plugins - runner = CliEntryBasedTestRunner(notebook_plugin) + def test_failing_job_with_syntax_error(self) -> None: + result: Result = self.__runner.invoke( + ["run", jobs_path_from_caller_directory("rest-api-job-fail-syntax-error")] + ) + cli_assert_equal(1, result) - # result: Result = runner.invoke(["run", jobs_path_from_caller_directory("job-using-a-plugin")]) - # cli_assert_equal(0, result) + def test_failing_job_with_code_error(self) -> None: + result: Result = self.__runner.invoke( + ["run", jobs_path_from_caller_directory("rest-api-job-fail-code-error")] + ) + cli_assert_equal(2, result) From c26b63be1474a864463fc70f247fa18e9c4bf288 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 6 Dec 2022 12:04:20 +0000 Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- projects/vdk-plugins/vdk-notebook/requirements.txt | 4 ++-- .../src/vdk/plugin/notebook/notebook_based_step.py | 1 - .../vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py | 1 - .../vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py | 1 - .../vdk-notebook/src/vdk/plugin/notebook/notebook_step.py | 1 - .../vdk-notebook/tests/jobs/rest-api-job/steps.ipynb | 2 +- projects/vdk-plugins/vdk-notebook/tests/test_plugin.py | 1 - 7 files changed, 3 insertions(+), 8 deletions(-) diff --git a/projects/vdk-plugins/vdk-notebook/requirements.txt b/projects/vdk-plugins/vdk-notebook/requirements.txt index 5b66d1c9a0..d10854129d 100644 --- a/projects/vdk-plugins/vdk-notebook/requirements.txt +++ b/projects/vdk-plugins/vdk-notebook/requirements.txt @@ -2,9 +2,9 @@ # for requirements (dependencies) needed during and after installation of the plugin see (and update) setup.py install_requires section click -requests pytest +requests vdk-core +vdk-sqlite vdk-test-utils -vdk-sqlite \ No newline at end of file diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py index 963e4bec9c..7b7a12fb05 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py @@ -1,6 +1,5 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 - import importlib.util import inspect import logging diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py index 56afebeb51..9268f6b4e0 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py @@ -1,6 +1,5 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 - from __future__ import annotations import logging diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py index 2f9ce1ef70..078e450ba3 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py @@ -1,6 +1,5 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 - import json import logging from pathlib import Path diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py index 9b330f1a5d..3f049d9e53 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py @@ -1,6 +1,5 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 - from __future__ import annotations import logging diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb index 309d5fb0a9..1b53cb22fe 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/steps.ipynb @@ -97,4 +97,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py index d31d5f96d7..9d333c18a8 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py @@ -1,6 +1,5 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 - import os import unittest from unittest import mock From d842f809e75b6e623fe2a55899f73cf19dbc6461 Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Tue, 6 Dec 2022 17:33:10 +0200 Subject: [PATCH 04/14] Removing .ipynb_checkpoints(backupbs from notebooks) and removing unneeded information from config.ini files --- .../.ipynb_checkpoints/steps-checkpoint.ipynb | 90 ------------------- .../jobs/rest-api-job-code-error/config.ini | 46 ---------- .../.ipynb_checkpoints/steps-checkpoint.ipynb | 89 ------------------ .../rest-api-job-fail-syntax-error/config.ini | 45 ---------- .../.ipynb_checkpoints/steps-checkpoint.ipynb | 88 ------------------ .../tests/jobs/rest-api-job/config.ini | 45 ---------- 6 files changed, 403 deletions(-) delete mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb delete mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb delete mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb deleted file mode 100644 index 1c8849ee89..0000000000 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/.ipynb_checkpoints/steps-checkpoint.ipynb +++ /dev/null @@ -1,90 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "%sql DROP TABLE IF EXISTS rest_target_table;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "826a105f-1874-4251-8abd-75fb898ba71c", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "def run(job_input):\n", - " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", - " hello = \"hello\"\n", - " hello1 = hello + 1\n", - " response.raise_for_status()\n", - " payload = response.json()\n", - " job_input.send_object_for_ingestion(\n", - " payload=payload,\n", - " destination_table=\"rest_target_table\"\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini index 4637c1d713..0e7f6598b4 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-code-error/config.ini @@ -8,52 +8,6 @@ ; Team is a way to group Data Jobs that belonged to the same team. team = jupyter-test-jobs - -; Configuration related to running data jobs -[job] -; For format see https://en.wikipedia.org/wiki/Cron -; The cron expression is evaluated in UTC time. -; If it is time for a new job run and the previous job run hasn’t finished yet, -; the cron job waits until the previous execution has finished. -schedule_cron = 11 23 5 8 1 - -; Who will be contacted and on what occasion -[contacts] - -; Specifies the time interval (in minutes) that a job execution is allowed to be delayed -; from its scheduled time before a notification email is sent. The default is 240. -; notification_delay_period_minutes=240 - -; Specifies whether to enable or disable the email notifications for each data job run attempt. -; The default value is true. -; enable_attempt_notifications=true - -; Specifies whether to enable or disable email notifications per data job execution and execution delays. -; The default value is true. -;enable_execution_notifications=true - -; The [contacts] properties below use semicolon-separated list of email addresses that will be notified with email message on a given condition. -; You can also provide email address linked to your Slack account in order to receive Slack messages. -; To generate Slack linked email address follow the steps here: -; https://get.slack.help/hc/en-us/articles/206819278-Send-emails-to-Slack#connect-the-email-app-to-your-workspace - -; Semicolon-separated list of email addresses to be notified on job execution failure caused by user code or user configuration why. -; For example: if the job contains an SQL script with syntax error. -; notified_on_job_failure_user_error=example@vmware.com -notified_on_job_failure_user_error= - -; Semicolon-separated list of email addresses to be notified on job execution failure caused by a platform why. -; notified_on_job_failure_platform_error=example@example.com; example2@example.com -notified_on_job_failure_platform_error= - -; Semicolon-separated list of email addresses to be notified on job execution success. -notified_on_job_success= - -; Semicolon-separated list of email addresses to be notified of job deployment outcome. -; Notice that if this file is malformed (file structure is not as per https://docs.python.org/3/library/configparser.html#supported-ini-file-structure), -; then an email notification will NOT be sent to the recipients specified here. -notified_on_job_deploy= - [vdk] ; Key value pairs of any configuration options that can be passed to vdk. ; For possible options in your vdk installation execute command vdk config-help diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb deleted file mode 100644 index 4d3b0e4575..0000000000 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/.ipynb_checkpoints/steps-checkpoint.ipynb +++ /dev/null @@ -1,89 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "%sql DROP TABLE IF EXISTS rest_target_table;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "826a105f-1874-4251-8abd-75fb898ba71c", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "def run(job_input):\n", - " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", - " ss\n", - " response.raise_for_status()\n", - " payload = response.json()\n", - " job_input.send_object_for_ingestion(\n", - " payload=payload,\n", - " destination_table=\"rest_target_table\"\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini index 4637c1d713..adc5f80a56 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/config.ini @@ -9,51 +9,6 @@ ; Team is a way to group Data Jobs that belonged to the same team. team = jupyter-test-jobs -; Configuration related to running data jobs -[job] -; For format see https://en.wikipedia.org/wiki/Cron -; The cron expression is evaluated in UTC time. -; If it is time for a new job run and the previous job run hasn’t finished yet, -; the cron job waits until the previous execution has finished. -schedule_cron = 11 23 5 8 1 - -; Who will be contacted and on what occasion -[contacts] - -; Specifies the time interval (in minutes) that a job execution is allowed to be delayed -; from its scheduled time before a notification email is sent. The default is 240. -; notification_delay_period_minutes=240 - -; Specifies whether to enable or disable the email notifications for each data job run attempt. -; The default value is true. -; enable_attempt_notifications=true - -; Specifies whether to enable or disable email notifications per data job execution and execution delays. -; The default value is true. -;enable_execution_notifications=true - -; The [contacts] properties below use semicolon-separated list of email addresses that will be notified with email message on a given condition. -; You can also provide email address linked to your Slack account in order to receive Slack messages. -; To generate Slack linked email address follow the steps here: -; https://get.slack.help/hc/en-us/articles/206819278-Send-emails-to-Slack#connect-the-email-app-to-your-workspace - -; Semicolon-separated list of email addresses to be notified on job execution failure caused by user code or user configuration why. -; For example: if the job contains an SQL script with syntax error. -; notified_on_job_failure_user_error=example@vmware.com -notified_on_job_failure_user_error= - -; Semicolon-separated list of email addresses to be notified on job execution failure caused by a platform why. -; notified_on_job_failure_platform_error=example@example.com; example2@example.com -notified_on_job_failure_platform_error= - -; Semicolon-separated list of email addresses to be notified on job execution success. -notified_on_job_success= - -; Semicolon-separated list of email addresses to be notified of job deployment outcome. -; Notice that if this file is malformed (file structure is not as per https://docs.python.org/3/library/configparser.html#supported-ini-file-structure), -; then an email notification will NOT be sent to the recipients specified here. -notified_on_job_deploy= - [vdk] ; Key value pairs of any configuration options that can be passed to vdk. ; For possible options in your vdk installation execute command vdk config-help diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb deleted file mode 100644 index e152ec04fb..0000000000 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/.ipynb_checkpoints/steps-checkpoint.ipynb +++ /dev/null @@ -1,88 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "%sql DROP TABLE IF EXISTS rest_target_table;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "826a105f-1874-4251-8abd-75fb898ba71c", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", - "metadata": { - "tags": [ - "vdk" - ] - }, - "outputs": [], - "source": [ - "def run(job_input):\n", - " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", - " response.raise_for_status()\n", - " payload = response.json()\n", - " job_input.send_object_for_ingestion(\n", - " payload=payload,\n", - " destination_table=\"rest_target_table\"\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini index 4637c1d713..adc5f80a56 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job/config.ini @@ -9,51 +9,6 @@ ; Team is a way to group Data Jobs that belonged to the same team. team = jupyter-test-jobs -; Configuration related to running data jobs -[job] -; For format see https://en.wikipedia.org/wiki/Cron -; The cron expression is evaluated in UTC time. -; If it is time for a new job run and the previous job run hasn’t finished yet, -; the cron job waits until the previous execution has finished. -schedule_cron = 11 23 5 8 1 - -; Who will be contacted and on what occasion -[contacts] - -; Specifies the time interval (in minutes) that a job execution is allowed to be delayed -; from its scheduled time before a notification email is sent. The default is 240. -; notification_delay_period_minutes=240 - -; Specifies whether to enable or disable the email notifications for each data job run attempt. -; The default value is true. -; enable_attempt_notifications=true - -; Specifies whether to enable or disable email notifications per data job execution and execution delays. -; The default value is true. -;enable_execution_notifications=true - -; The [contacts] properties below use semicolon-separated list of email addresses that will be notified with email message on a given condition. -; You can also provide email address linked to your Slack account in order to receive Slack messages. -; To generate Slack linked email address follow the steps here: -; https://get.slack.help/hc/en-us/articles/206819278-Send-emails-to-Slack#connect-the-email-app-to-your-workspace - -; Semicolon-separated list of email addresses to be notified on job execution failure caused by user code or user configuration why. -; For example: if the job contains an SQL script with syntax error. -; notified_on_job_failure_user_error=example@vmware.com -notified_on_job_failure_user_error= - -; Semicolon-separated list of email addresses to be notified on job execution failure caused by a platform why. -; notified_on_job_failure_platform_error=example@example.com; example2@example.com -notified_on_job_failure_platform_error= - -; Semicolon-separated list of email addresses to be notified on job execution success. -notified_on_job_success= - -; Semicolon-separated list of email addresses to be notified of job deployment outcome. -; Notice that if this file is malformed (file structure is not as per https://docs.python.org/3/library/configparser.html#supported-ini-file-structure), -; then an email notification will NOT be sent to the recipients specified here. -notified_on_job_deploy= - [vdk] ; Key value pairs of any configuration options that can be passed to vdk. ; For possible options in your vdk installation execute command vdk config-help From 63970e15385f4fc36318dfb70192581520108699 Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Wed, 7 Dec 2022 15:48:47 +0200 Subject: [PATCH 05/14] Rearange class files and changes on creating NotebookSteps --- .../src/vdk/plugin/notebook/notebook.py | 116 ++++++++++++++++++ .../plugin/notebook/notebook_based_step.py | 33 +++-- .../vdk/plugin/notebook/notebook_plugin.py | 15 +-- .../vdk/plugin/notebook/notebook_reader.py | 89 -------------- .../src/vdk/plugin/notebook/notebook_step.py | 29 ----- 5 files changed, 135 insertions(+), 147 deletions(-) create mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py delete mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py delete mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py new file mode 100644 index 0000000000..b0e1f90244 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py @@ -0,0 +1,116 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +import json +import logging +from dataclasses import dataclass +from pathlib import Path + +import pathlib +from typing import List + +from vdk.internal.builtin_plugins.run.file_based_step import TYPE_PYTHON +from vdk.internal.builtin_plugins.run.file_based_step import TYPE_SQL +from vdk.internal.builtin_plugins.run.job_context import JobContext +from vdk.internal.core import errors +from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory +from vdk.plugin.notebook.notebook_based_step import NotebookStep + +log = logging.getLogger(__name__) + + +class JobNotebookLocator: + """ + Locate the data job files that would be executed by us. + """ + + @staticmethod + def get_notebook_files(directory: pathlib.Path) -> List[pathlib.Path]: + """Locates the files in a directory, that are supported for execution. + Files supported for execution are: .ipynb + Other files in the directory are ignored. + :return: List of files from the directory that supported for execution, sorted alphabetically by name. + :rtype: :class:`.list` + """ + script_files = [ + x for x in directory.iterdir() if (x.name.lower().endswith(".ipynb")) + ] + script_files.sort(key=lambda x: x.name) + log.debug(f"Script files of {directory} are {script_files}") + return script_files + + +class Notebook: + """ + Given a notebook file locates the cells with "vdk" tag and saves them. + Files supported for reading are: ipynb. + Other cells are ignored. + """ + + def __init__(self, file_path: Path): + try: + self.sql_and_run_cells = [] + self.python_helper_cells = [] + self.file_path = file_path + + content = json.loads(file_path.read_text()) + cells = [ + cell + for cell in content["cells"] + if cell["cell_type"] == "code" and "vdk" in cell["metadata"].get("tags", {}) + ] + for cell in cells: + code = "".join(cell["source"]) + if code.startswith("%sql") or "def run(" in code: + if code.startswith("%sql"): + code = "".join(cell["source"]) + code.replace(";", "") + self.sql_and_run_cells.append([code.replace("%sql", ""), TYPE_SQL]) + else: + self.sql_and_run_cells.append(["".join(cell["source"]), TYPE_PYTHON]) + else: + self.python_helper_cells.append("".join(cell["source"])) + + except json.JSONDecodeError as e: + errors.log_and_rethrow( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=log, + what_happened=f"Failed to read the {file_path.name} file.", + why_it_happened=f"The provided {file_path.name} cannot be loaded into json format and " + f"cannot be read as a Jupyter notebook", + consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, + countermeasures=f"Check the {file_path.name} format again", + exception=e, + wrap_in_vdk_error=True, + ) + + def extract_notebook_steps(self, context: JobContext): + for index, cell in enumerate(self.sql_and_run_cells): + if cell[1] == TYPE_SQL: + step = NotebookStep( + name="".join( + [ + self.file_path.name.replace(".ipynb", "_"), + str(index), + ] + ), + type=TYPE_SQL, + runner_func=NotebookStepFuncFactory.run_sql_step, + file_path=self.file_path, + job_dir=context.job_directory, + code=cell[0], + ) + else: + step = NotebookStep( + name="".join( + [ + self.file_path.name.replace(".ipynb", "_"), + str(index), + ] + ), + type=TYPE_PYTHON, + runner_func=NotebookStepFuncFactory.run_python_step, + file_path=self.file_path, + job_dir=context.job_directory, + code="\n".join(self.python_helper_cells) + "\n" + cell[0] + ) + context.step_builder.add_step(step) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py index 7b7a12fb05..c9ce419ae6 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py @@ -3,16 +3,15 @@ import importlib.util import inspect import logging -import pathlib import sys import traceback +from dataclasses import dataclass from typing import Callable -from typing import List from vdk.api.job_input import IJobInput +from vdk.internal.builtin_plugins.run.step import Step from vdk.internal.core import errors from vdk.internal.core.errors import SkipRemainingStepsException -from vdk.plugin.notebook.notebook_step import NotebookStep log = logging.getLogger(__name__) @@ -20,26 +19,22 @@ # consists may duplicates of # https://github.com/vmware/versatile-data-kit/blob/main/projects/vdk-core/src/vdk/internal/builtin_plugins/run/file_based_step.py +# The function accept NotebookStep (below class) and IJobInput and +# return true if the step has been executed and false if it is not (valid) executable step. +# On error it's expected to raise an exception. +NotebookStepFunction = Callable[["NotebookStep", IJobInput], bool] -class JobNotebookLocator: + +@dataclass +class NotebookStep(Step): """ - Locate the data job files that would be executed by us. + A notebook step that will be executed when running a data job. """ - @staticmethod - def get_notebook_files(directory: pathlib.Path) -> List[pathlib.Path]: - """Locates the files in a directory, that are supported for execution. - Files supported for execution are: .ipynb - Other files in the directory are ignored. - :return: List of files from the directory that supported for execution, sorted alphabetically by name. - :rtype: :class:`.list` - """ - script_files = [ - x for x in directory.iterdir() if (x.name.lower().endswith(".ipynb")) - ] - script_files.sort(key=lambda x: x.name) - log.debug(f"Script files of {directory} are {script_files}") - return script_files + def __init__(self, name, type, runner_func, file_path, job_dir, code, parent=None): + super().__init__(name, type, runner_func, file_path, job_dir, parent) + self.runner_func = runner_func + self.code = code class NotebookStepFuncFactory: diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py index 9268f6b4e0..3d62e3f45d 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py @@ -6,23 +6,18 @@ from vdk.api.plugin.hook_markers import hookimpl from vdk.internal.builtin_plugins.run.job_context import JobContext -from vdk.plugin.notebook.notebook_based_step import JobNotebookLocator -from vdk.plugin.notebook.notebook_reader import Notebook -from vdk.plugin.notebook.notebook_reader import NotebookReader +from vdk.plugin.notebook.notebook import Notebook +from vdk.plugin.notebook.notebook import JobNotebookLocator log = logging.getLogger(__name__) -@hookimpl() +@hookimpl(trylast=True) def initialize_job(context: JobContext): - if context.job_directory is None: - log.info( - "Data Job directory is not specified. Default job initialization will be skipped." - ) - return file_locator: JobNotebookLocator = JobNotebookLocator() notebook_files = file_locator.get_notebook_files(context.job_directory) if len(notebook_files) >= 1: for file_path in notebook_files: nb: Notebook = Notebook(file_path) - NotebookReader.read_notebook_and_save_steps(nb, context) + nb.extract_notebook_steps(context) + diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py deleted file mode 100644 index 078e450ba3..0000000000 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_reader.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2021 VMware, Inc. -# SPDX-License-Identifier: Apache-2.0 -import json -import logging -from pathlib import Path - -from vdk.internal.builtin_plugins.run.file_based_step import TYPE_PYTHON -from vdk.internal.builtin_plugins.run.file_based_step import TYPE_SQL -from vdk.internal.builtin_plugins.run.job_context import JobContext -from vdk.internal.core import errors -from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory -from vdk.plugin.notebook.notebook_step import NotebookStep - -log = logging.getLogger(__name__) - - -class Notebook: - """ - Given a notebook file locates the cells with "vdk" tag and saves them. - Files supported for reading are: ipynb. - Other cells are ignored. - """ - - def __init__(self, file_path: Path): - try: - content = json.loads(file_path.read_text()) - self.cells = [ - cell - for cell in content["cells"] - if cell["cell_type"] == "code" - and "vdk" in cell["metadata"].get("tags", {}) - ] - self.file_path = file_path - except json.JSONDecodeError as e: - errors.log_and_rethrow( - to_be_fixed_by=errors.ResolvableBy.USER_ERROR, - log=log, - what_happened=f"Failed to read the {file_path.name} file.", - why_it_happened=f"The provided {file_path.name} cannot be loaded into json format and " - f"cannot be read as a Jupyter notebook", - consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, - countermeasures=f"Check the {file_path.name} format again", - exception=e, - wrap_in_vdk_error=True, - ) - - -class NotebookReader: - @staticmethod - def read_notebook_and_save_steps(notebook: Notebook, context: JobContext): - python_cells = "" - step_index = 0 - for cell in notebook.cells: - if cell["source"][0].startswith("%sql"): - code = "".join(cell["source"]) - code.replace(";", "") - step = NotebookStep( - name="".join( - [ - notebook.file_path.name.replace(".ipynb", "_"), - str(step_index), - ] - ), - type=TYPE_SQL, - runner_func=NotebookStepFuncFactory.run_sql_step, - file_path=notebook.file_path, - job_dir=context.job_directory, - code=code.replace("%sql", ""), - ) - step_index += 1 - context.step_builder.add_step(step) - else: - python_cells += "\n" + "".join(cell["source"]) - if "def run(" in cell["source"][0]: - step = NotebookStep( - name="".join( - [ - notebook.file_path.name.replace(".ipynb", "_"), - str(step_index), - ] - ), - type=TYPE_PYTHON, - runner_func=NotebookStepFuncFactory.run_python_step, - file_path=notebook.file_path, - job_dir=context.job_directory, - code=python_cells, - ) - step_index += 1 - context.step_builder.add_step(step) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py deleted file mode 100644 index 3f049d9e53..0000000000 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_step.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2021 VMware, Inc. -# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -import logging -from dataclasses import dataclass -from typing import Callable - -from vdk.api.job_input import IJobInput -from vdk.internal.builtin_plugins.run.step import Step - -log = logging.getLogger(__name__) - -# The function accept NotebookStep (below class) and IJobInput and -# return true if the step has been executed and false if it is not (valid) executable step. -# On error it's expected to raise an exception. -NotebookStepFunction = Callable[["NotebookStep", IJobInput], bool] - - -@dataclass -class NotebookStep(Step): - """ - A notebook step that will be executed when running a data job. - """ - - def __init__(self, name, type, runner_func, file_path, job_dir, code, parent=None): - super().__init__(name, type, runner_func, file_path, job_dir, parent) - self.runner_func = runner_func - self.code = code From 079c37a435f0ba417ced39c16d22d2172e02de2c Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Wed, 7 Dec 2022 15:49:25 +0200 Subject: [PATCH 06/14] Add test for job with multiple run methods --- .../rest-api-multiple-run-methods/config.ini | 16 ++ .../requirements.txt | 3 + .../rest-api-multiple-run-methods/steps.ipynb | 137 ++++++++++++++++++ .../vdk-notebook/tests/test_plugin.py | 15 ++ 4 files changed, 171 insertions(+) create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/config.ini create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/requirements.txt create mode 100644 projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/steps.ipynb diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/config.ini b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/config.ini new file mode 100644 index 0000000000..adc5f80a56 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/config.ini @@ -0,0 +1,16 @@ +; Supported format: https://docs.python.org/3/library/configparser.html#supported-ini-file-structure + +; This is the only file required to deploy a Data Job. +; Read more to understand what each option means: + +; Information about the owner of the Data Job +[owner] + +; Team is a way to group Data Jobs that belonged to the same team. +team = jupyter-test-jobs + +[vdk] +; Key value pairs of any configuration options that can be passed to vdk. +; For possible options in your vdk installation execute command vdk config-help +db_default_type=SQLITE +ingest_method_default=SQLITE diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/requirements.txt b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/requirements.txt new file mode 100644 index 0000000000..2b590c5cfe --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/requirements.txt @@ -0,0 +1,3 @@ +# Python jobs can specify extra library dependencies in requirements.txt file. +# See https://pip.readthedocs.io/en/stable/user_guide/#requirements-files +# The file is optional and can be deleted if no extra library dependencies are necessary. diff --git a/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/steps.ipynb b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/steps.ipynb new file mode 100644 index 0000000000..061f4cbe9a --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-multiple-run-methods/steps.ipynb @@ -0,0 +1,137 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c635d290-99d5-4354-b95c-a3210e4cf6e1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c948f9f2-1f7b-4d8c-aeca-9b300ded9775", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a2d5dec-2fd7-425a-afee-fd6bda2d83ed", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql DROP TABLE IF EXISTS rest_target_table_copy;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "826a105f-1874-4251-8abd-75fb898ba71c", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c8cd38b-dc54-4d89-96a0-d88ede4203bf", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "%sql CREATE TABLE rest_target_table_copy (userId, id, title, completed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056aefa2-fec5-4f9f-97b3-78a412ae67d1", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b09d6c5-4c8e-4e4c-b153-9b41735ba7ff", + "metadata": { + "tags": [ + "vdk" + ] + }, + "outputs": [], + "source": [ + "def run(job_input):\n", + " response = requests.get(\"https://jsonplaceholder.typicode.com/todos/1\")\n", + " response.raise_for_status()\n", + " payload = response.json()\n", + " job_input.send_object_for_ingestion(\n", + " payload=payload,\n", + " destination_table=\"rest_target_table_copy\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py index 9d333c18a8..6e064be54d 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py @@ -6,6 +6,7 @@ from click.testing import Result from vdk.plugin.notebook import notebook_plugin + from vdk.plugin.sqlite import sqlite_plugin from vdk.plugin.test_utils.util_funcs import cli_assert_equal from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner @@ -48,3 +49,17 @@ def test_failing_job_with_code_error(self) -> None: ["run", jobs_path_from_caller_directory("rest-api-job-fail-code-error")] ) cli_assert_equal(2, result) + + def test_notebook_plugin_with_multiple_run_methods(self) -> None: + result: Result = self.__runner.invoke( + ["run", jobs_path_from_caller_directory("rest-api-multiple-run-methods")] + ) + cli_assert_equal(0, result) + actual_rs: Result = self.__runner.invoke( + ["sqlite-query", "--query", f"SELECT * FROM rest_target_table_copy"] + ) + assert actual_rs.stdout == ( + " userId id title completed\n" + "-------- ---- ------------------ -----------\n" + " 1 1 delectus aut autem 0\n" + ) \ No newline at end of file From 9e955ac5ecdd876c428d36137c97dafb027b9daa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 7 Dec 2022 13:52:45 +0000 Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../src/vdk/plugin/notebook/notebook.py | 20 +++++++++++-------- .../vdk/plugin/notebook/notebook_plugin.py | 3 +-- .../vdk-notebook/tests/test_plugin.py | 3 +-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py index b0e1f90244..7f1b976280 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py @@ -2,18 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 import json import logging +import pathlib from dataclasses import dataclass from pathlib import Path - -import pathlib from typing import List from vdk.internal.builtin_plugins.run.file_based_step import TYPE_PYTHON from vdk.internal.builtin_plugins.run.file_based_step import TYPE_SQL from vdk.internal.builtin_plugins.run.job_context import JobContext from vdk.internal.core import errors -from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory from vdk.plugin.notebook.notebook_based_step import NotebookStep +from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory log = logging.getLogger(__name__) @@ -56,7 +55,8 @@ def __init__(self, file_path: Path): cells = [ cell for cell in content["cells"] - if cell["cell_type"] == "code" and "vdk" in cell["metadata"].get("tags", {}) + if cell["cell_type"] == "code" + and "vdk" in cell["metadata"].get("tags", {}) ] for cell in cells: code = "".join(cell["source"]) @@ -64,9 +64,13 @@ def __init__(self, file_path: Path): if code.startswith("%sql"): code = "".join(cell["source"]) code.replace(";", "") - self.sql_and_run_cells.append([code.replace("%sql", ""), TYPE_SQL]) + self.sql_and_run_cells.append( + [code.replace("%sql", ""), TYPE_SQL] + ) else: - self.sql_and_run_cells.append(["".join(cell["source"]), TYPE_PYTHON]) + self.sql_and_run_cells.append( + ["".join(cell["source"]), TYPE_PYTHON] + ) else: self.python_helper_cells.append("".join(cell["source"])) @@ -76,7 +80,7 @@ def __init__(self, file_path: Path): log=log, what_happened=f"Failed to read the {file_path.name} file.", why_it_happened=f"The provided {file_path.name} cannot be loaded into json format and " - f"cannot be read as a Jupyter notebook", + f"cannot be read as a Jupyter notebook", consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, countermeasures=f"Check the {file_path.name} format again", exception=e, @@ -111,6 +115,6 @@ def extract_notebook_steps(self, context: JobContext): runner_func=NotebookStepFuncFactory.run_python_step, file_path=self.file_path, job_dir=context.job_directory, - code="\n".join(self.python_helper_cells) + "\n" + cell[0] + code="\n".join(self.python_helper_cells) + "\n" + cell[0], ) context.step_builder.add_step(step) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py index 3d62e3f45d..46ead2b5c0 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py @@ -6,8 +6,8 @@ from vdk.api.plugin.hook_markers import hookimpl from vdk.internal.builtin_plugins.run.job_context import JobContext -from vdk.plugin.notebook.notebook import Notebook from vdk.plugin.notebook.notebook import JobNotebookLocator +from vdk.plugin.notebook.notebook import Notebook log = logging.getLogger(__name__) @@ -20,4 +20,3 @@ def initialize_job(context: JobContext): for file_path in notebook_files: nb: Notebook = Notebook(file_path) nb.extract_notebook_steps(context) - diff --git a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py index 6e064be54d..69404ca1f4 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py @@ -6,7 +6,6 @@ from click.testing import Result from vdk.plugin.notebook import notebook_plugin - from vdk.plugin.sqlite import sqlite_plugin from vdk.plugin.test_utils.util_funcs import cli_assert_equal from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner @@ -62,4 +61,4 @@ def test_notebook_plugin_with_multiple_run_methods(self) -> None: " userId id title completed\n" "-------- ---- ------------------ -----------\n" " 1 1 delectus aut autem 0\n" - ) \ No newline at end of file + ) From 5f58a028f9e94721c46ae002f762c236df8845f6 Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Wed, 7 Dec 2022 17:25:11 +0200 Subject: [PATCH 08/14] Add class for notebook plugin --- .../vdk/plugin/notebook/notebook_plugin.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py index 46ead2b5c0..003a821857 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py @@ -5,6 +5,7 @@ import logging from vdk.api.plugin.hook_markers import hookimpl +from vdk.api.plugin.plugin_registry import IPluginRegistry from vdk.internal.builtin_plugins.run.job_context import JobContext from vdk.plugin.notebook.notebook import JobNotebookLocator from vdk.plugin.notebook.notebook import Notebook @@ -12,11 +13,17 @@ log = logging.getLogger(__name__) -@hookimpl(trylast=True) -def initialize_job(context: JobContext): - file_locator: JobNotebookLocator = JobNotebookLocator() - notebook_files = file_locator.get_notebook_files(context.job_directory) - if len(notebook_files) >= 1: - for file_path in notebook_files: - nb: Notebook = Notebook(file_path) - nb.extract_notebook_steps(context) +class NotebookPlugin: + @hookimpl(trylast=True) + def initialize_job(self, context: JobContext): + file_locator: JobNotebookLocator = JobNotebookLocator() + notebook_files = file_locator.get_notebook_files(context.job_directory) + if len(notebook_files) >= 1: + for file_path in notebook_files: + nb: Notebook = Notebook(file_path) + nb.extract_notebook_steps(context) + + +@hookimpl +def vdk_start(plugin_registry: IPluginRegistry): + plugin_registry.load_plugin_with_hooks_impl(NotebookPlugin(), "notebook-plugin") From ed30c889b8cc76dbb5512f618b10684b07974121 Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Thu, 8 Dec 2022 14:00:08 +0200 Subject: [PATCH 09/14] Add helper class Cell to move some cell creating logic from Notebook class --- .../src/vdk/plugin/notebook/cell.py | 30 +++++++ .../src/vdk/plugin/notebook/notebook.py | 89 ++++++++----------- .../plugin/notebook/notebook_based_step.py | 4 +- .../vdk/plugin/notebook/notebook_plugin.py | 2 +- .../vdk-notebook/tests/test_plugin.py | 4 +- 5 files changed, 70 insertions(+), 59 deletions(-) create mode 100644 projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py new file mode 100644 index 0000000000..8ceb9260d7 --- /dev/null +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py @@ -0,0 +1,30 @@ +class Cell: + """ + Helper class that retrieves data from Jupyter cells + Only the data essential for running a VDK data job is saved + Other data is ignored + """ + def __init__(self, jupyter_cell): + self.tags = jupyter_cell["metadata"].get("tags", {}) + self.source = "".join(jupyter_cell["source"]) + + def is_vdk_cell(self): + return "vdk" in self.tags + + def is_sql_cell(self): + return self.source.startswith("%sql") + + def is_vdk_run_cell(self): + return "def run(" in self.source + + def add_code(self, cells): + code = [] + for cell in cells: + code.append(cell.source) + self.source = "\n".join(code) + "\n" + self.source + + def get_code(self): + if self.source.startswith("%sql"): + code = self.source.replace("%sql", "") + return code.replace(';', '') + return self.source diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py index 7f1b976280..b244bd4509 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py @@ -3,7 +3,6 @@ import json import logging import pathlib -from dataclasses import dataclass from pathlib import Path from typing import List @@ -13,6 +12,7 @@ from vdk.internal.core import errors from vdk.plugin.notebook.notebook_based_step import NotebookStep from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory +from vdk.plugin.notebook.cell import Cell log = logging.getLogger(__name__) @@ -50,71 +50,52 @@ def __init__(self, file_path: Path): self.sql_and_run_cells = [] self.python_helper_cells = [] self.file_path = file_path - + # see Jupyter json schema here: + # https://github.com/jupyter/nbformat/blob/main/nbformat/v4/nbformat.v4.schema.json content = json.loads(file_path.read_text()) - cells = [ - cell - for cell in content["cells"] - if cell["cell_type"] == "code" - and "vdk" in cell["metadata"].get("tags", {}) - ] - for cell in cells: - code = "".join(cell["source"]) - if code.startswith("%sql") or "def run(" in code: - if code.startswith("%sql"): - code = "".join(cell["source"]) - code.replace(";", "") - self.sql_and_run_cells.append( - [code.replace("%sql", ""), TYPE_SQL] - ) - else: - self.sql_and_run_cells.append( - ["".join(cell["source"]), TYPE_PYTHON] - ) - else: - self.python_helper_cells.append("".join(cell["source"])) - + for jupyter_cell in content["cells"]: + if jupyter_cell["cell_type"] == "code": + cell = Cell(jupyter_cell) + if cell.is_vdk_cell(): + if cell.is_sql_cell() or cell.is_vdk_run_cell(): + self.sql_and_run_cells.append(cell) + else: + self.python_helper_cells.append(cell) + log.debug(f"{len(self.sql_and_run_cells) + len(self.python_helper_cells)} " + f"cells with vdk tag were detected!") except json.JSONDecodeError as e: errors.log_and_rethrow( to_be_fixed_by=errors.ResolvableBy.USER_ERROR, log=log, what_happened=f"Failed to read the {file_path.name} file.", why_it_happened=f"The provided {file_path.name} cannot be loaded into json format and " - f"cannot be read as a Jupyter notebook", + f"cannot be read as a Jupyter notebook", consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, countermeasures=f"Check the {file_path.name} format again", exception=e, wrap_in_vdk_error=True, ) - def extract_notebook_steps(self, context: JobContext): + def register_notebook_steps(self, context: JobContext): + if not self.sql_and_run_cells: + log.debug(f"Neither VDK run methods nor SQL statements were detected!") for index, cell in enumerate(self.sql_and_run_cells): - if cell[1] == TYPE_SQL: - step = NotebookStep( - name="".join( - [ - self.file_path.name.replace(".ipynb", "_"), - str(index), - ] - ), - type=TYPE_SQL, - runner_func=NotebookStepFuncFactory.run_sql_step, - file_path=self.file_path, - job_dir=context.job_directory, - code=cell[0], - ) - else: - step = NotebookStep( - name="".join( - [ - self.file_path.name.replace(".ipynb", "_"), - str(index), - ] - ), - type=TYPE_PYTHON, - runner_func=NotebookStepFuncFactory.run_python_step, - file_path=self.file_path, - job_dir=context.job_directory, - code="\n".join(self.python_helper_cells) + "\n" + cell[0], - ) + cell_type = TYPE_PYTHON if cell.is_vdk_run_cell() else TYPE_SQL + runner_func = NotebookStepFuncFactory.run_python_step if cell.is_vdk_run_cell() \ + else NotebookStepFuncFactory.run_sql_step + if cell.is_vdk_run_cell(): + cell.add_code(self.python_helper_cells) + step = NotebookStep( + name="".join( + [ + self.file_path.name.replace(".ipynb", "_"), + str(index), + ] + ), + type=cell_type, + runner_func=runner_func, + file_path=self.file_path, + job_dir=context.job_directory, + code=cell.get_code(), + ) context.step_builder.add_step(step) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py index c9ce419ae6..565edfac75 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_based_step.py @@ -152,8 +152,8 @@ def invoke_run_function(func: Callable, job_input: IJobInput, step: NotebookStep errors.log_and_throw( to_be_fixed_by=errors.ResolvableBy.USER_ERROR, log=log, - what_happened=f"I'm trying to call method 'run' and failed.", - why_it_happened=f"Method is missing at least one job input parameter to be passed", + what_happened="I'm trying to call method 'run' and failed.", + why_it_happened="Method is missing at least one job input parameter to be passed", consequences=f"Current Step {step.name} from {step.file_path}" f"will fail, and as a result the whole Data Job will fail. ", countermeasures="Make sure that you have specified a job input parameter in the signature of the " diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py index 003a821857..767fb26a0a 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook_plugin.py @@ -21,7 +21,7 @@ def initialize_job(self, context: JobContext): if len(notebook_files) >= 1: for file_path in notebook_files: nb: Notebook = Notebook(file_path) - nb.extract_notebook_steps(context) + nb.register_notebook_steps(context) @hookimpl diff --git a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py index 69404ca1f4..68049eaae4 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py @@ -29,7 +29,7 @@ def test_notebook_plugin(self) -> None: ) cli_assert_equal(0, result) actual_rs: Result = self.__runner.invoke( - ["sqlite-query", "--query", f"SELECT * FROM rest_target_table"] + ["sqlite-query", "--query", "SELECT * FROM rest_target_table"] ) assert actual_rs.stdout == ( " userId id title completed\n" @@ -55,7 +55,7 @@ def test_notebook_plugin_with_multiple_run_methods(self) -> None: ) cli_assert_equal(0, result) actual_rs: Result = self.__runner.invoke( - ["sqlite-query", "--query", f"SELECT * FROM rest_target_table_copy"] + ["sqlite-query", "--query", "SELECT * FROM rest_target_table_copy"] ) assert actual_rs.stdout == ( " userId id title completed\n" From 37cee4b9b346e69b3811e2088a270e0449ded585 Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Thu, 8 Dec 2022 14:01:59 +0200 Subject: [PATCH 10/14] Rename test methods --- projects/vdk-plugins/vdk-notebook/tests/test_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py index 68049eaae4..9e583f4cda 100644 --- a/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py +++ b/projects/vdk-plugins/vdk-notebook/tests/test_plugin.py @@ -23,7 +23,7 @@ class JupyterTests(unittest.TestCase): def setUp(self) -> None: self.__runner = CliEntryBasedTestRunner(notebook_plugin, sqlite_plugin) - def test_notebook_plugin(self) -> None: + def test_successful_job(self) -> None: result: Result = self.__runner.invoke( ["run", jobs_path_from_caller_directory("rest-api-job")] ) @@ -49,7 +49,7 @@ def test_failing_job_with_code_error(self) -> None: ) cli_assert_equal(2, result) - def test_notebook_plugin_with_multiple_run_methods(self) -> None: + def test_successful_job_with_multiple_run_methods(self) -> None: result: Result = self.__runner.invoke( ["run", jobs_path_from_caller_directory("rest-api-multiple-run-methods")] ) From 1bddaddf732ecbc516c27b16f4adcbfd6827680e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 Dec 2022 12:03:03 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vdk-notebook/src/vdk/plugin/notebook/cell.py | 14 +++++++++----- .../src/vdk/plugin/notebook/notebook.py | 15 ++++++++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py index 8ceb9260d7..a0e3931977 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py @@ -1,9 +1,13 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + class Cell: """ - Helper class that retrieves data from Jupyter cells - Only the data essential for running a VDK data job is saved - Other data is ignored - """ + Helper class that retrieves data from Jupyter cells + Only the data essential for running a VDK data job is saved + Other data is ignored + """ + def __init__(self, jupyter_cell): self.tags = jupyter_cell["metadata"].get("tags", {}) self.source = "".join(jupyter_cell["source"]) @@ -26,5 +30,5 @@ def add_code(self, cells): def get_code(self): if self.source.startswith("%sql"): code = self.source.replace("%sql", "") - return code.replace(';', '') + return code.replace(";", "") return self.source diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py index b244bd4509..47b97d53ea 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/notebook.py @@ -10,9 +10,9 @@ from vdk.internal.builtin_plugins.run.file_based_step import TYPE_SQL from vdk.internal.builtin_plugins.run.job_context import JobContext from vdk.internal.core import errors +from vdk.plugin.notebook.cell import Cell from vdk.plugin.notebook.notebook_based_step import NotebookStep from vdk.plugin.notebook.notebook_based_step import NotebookStepFuncFactory -from vdk.plugin.notebook.cell import Cell log = logging.getLogger(__name__) @@ -61,15 +61,17 @@ def __init__(self, file_path: Path): self.sql_and_run_cells.append(cell) else: self.python_helper_cells.append(cell) - log.debug(f"{len(self.sql_and_run_cells) + len(self.python_helper_cells)} " - f"cells with vdk tag were detected!") + log.debug( + f"{len(self.sql_and_run_cells) + len(self.python_helper_cells)} " + f"cells with vdk tag were detected!" + ) except json.JSONDecodeError as e: errors.log_and_rethrow( to_be_fixed_by=errors.ResolvableBy.USER_ERROR, log=log, what_happened=f"Failed to read the {file_path.name} file.", why_it_happened=f"The provided {file_path.name} cannot be loaded into json format and " - f"cannot be read as a Jupyter notebook", + f"cannot be read as a Jupyter notebook", consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, countermeasures=f"Check the {file_path.name} format again", exception=e, @@ -81,8 +83,11 @@ def register_notebook_steps(self, context: JobContext): log.debug(f"Neither VDK run methods nor SQL statements were detected!") for index, cell in enumerate(self.sql_and_run_cells): cell_type = TYPE_PYTHON if cell.is_vdk_run_cell() else TYPE_SQL - runner_func = NotebookStepFuncFactory.run_python_step if cell.is_vdk_run_cell() \ + runner_func = ( + NotebookStepFuncFactory.run_python_step + if cell.is_vdk_run_cell() else NotebookStepFuncFactory.run_sql_step + ) if cell.is_vdk_run_cell(): cell.add_code(self.python_helper_cells) step = NotebookStep( From c092b34f82fca9beb2afde21c1e639cda7c9ae9c Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Fri, 9 Dec 2022 15:08:13 +0200 Subject: [PATCH 12/14] Exclude the test job with syntax error from .pre-commit --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ee56e780d5..e5612f9706 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,8 @@ repos: hooks: - id: black language_version: python3 - exclude: ^projects/vdk-core/tests/functional/run/jobs/syntax-error-job/1_step.py + exclude: (^projects/vdk-core/tests/functional/run/jobs/syntax-error-job/1_step.py | + ^projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb) # ensure pydoc is up-to standard - repo: https://github.com/pycqa/pydocstyle rev: 6.1.1 From e53f614a96cfba2e4d879ac5ce19ae01b22c5a9b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Dec 2022 13:10:35 +0000 Subject: [PATCH 13/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py index a0e3931977..97ba085b3b 100644 --- a/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py +++ b/projects/vdk-plugins/vdk-notebook/src/vdk/plugin/notebook/cell.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 + class Cell: """ Helper class that retrieves data from Jupyter cells From 423c588a52c03b03873569397257bcd4e4d4a32c Mon Sep 17 00:00:00 2001 From: Duygu Hasan Date: Fri, 9 Dec 2022 15:19:35 +0200 Subject: [PATCH 14/14] Changes on .pre-commit exclude regex --- .pre-commit-config.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e5612f9706..6dee781cc6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,8 +28,7 @@ repos: hooks: - id: black language_version: python3 - exclude: (^projects/vdk-core/tests/functional/run/jobs/syntax-error-job/1_step.py | - ^projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb) + exclude: (^projects/vdk-core/tests/functional/run/jobs/syntax-error-job/1_step.py|^projects/vdk-plugins/vdk-notebook/tests/jobs/rest-api-job-fail-syntax-error/steps.ipynb) # ensure pydoc is up-to standard - repo: https://github.com/pycqa/pydocstyle rev: 6.1.1