diff --git a/projects/vdk-plugins/vdk-smarter/.plugin-ci.yml b/projects/vdk-plugins/vdk-smarter/.plugin-ci.yml new file mode 100644 index 0000000000..59e2649ae9 --- /dev/null +++ b/projects/vdk-plugins/vdk-smarter/.plugin-ci.yml @@ -0,0 +1,25 @@ +# Copyright 2021-2023 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2021-2023 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + +image: "python:3.7" + +.build-vdk-smarter: + variables: + PLUGIN_NAME: vdk-smarter + extends: .build-plugin + +build-py37-vdk-smarter: + extends: .build-vdk-smarter + image: "python:3.7" + +build-py311-vdk-smarter: + extends: .build-vdk-smarter + image: "python:3.11" + +release-vdk-smarter: + variables: + PLUGIN_NAME: vdk-smarter + extends: .release-plugin diff --git a/projects/vdk-plugins/vdk-smarter/README.md b/projects/vdk-plugins/vdk-smarter/README.md new file mode 100644 index 0000000000..2fdf1fef60 --- /dev/null +++ b/projects/vdk-plugins/vdk-smarter/README.md @@ -0,0 +1,102 @@ +# VDK Smarter + +Making VDK smarter by employing ML/AI. + + + +## Usage + +``` +pip install vdk-smarter +``` + +### Configuration + +(`vdk config-help` is a useful command to browse all config options of your installation of vdk) + + +### Example + +TODO + +# VDK Smarter + +Making VDK smarter by employing ML/AI. + + + +## Usage + +``` +pip install vdk-smarter +``` + +### Configuration + +(run `vdk config-help` and search for configuration options starting with "openai") + +### Example + +By default, reviews are disabled since they are expensive. + +To enable them, set `openai_review_enabled` to `true` and provide `openai_api_key` in the configuration. +See `vdk config-help` (search for configuration options with the `openai` prefix) for more information. + +Once enabled, each query statement executed during `vdk run` will also be reviewed and scored. 
+ +``` +vdk run example +``` +```bash +Query: +CREATE TABLE IF NOT EXISTS super_collider.example_table( + vc_id STRING, + esx_id STRING, + vm_count INT +) STORED AS PARQUET; + + +Review: + { + "score": 5, + "review": "No further changes needed. The query is efficient, readable, and follows best practices. + There are no potential errors, optimization, or security vulnerabilities."} + +... +Query: +CREATE TABLE IF NOT EXISTS super_collider.example_fact_snapshot( + vc_id STRING, + esx_count BIGINT, + vm_count BIGINT +) STORED AS PARQUET; + + +Review: +{"score": 4, "review": "The query is well-structured and follows best practices. + However, there is an opportunity to improve its readability by + adding comments to explain the purpose of the query and the meaning of the parameters. + Additionally, there is potential to optimize the query by providing more precise column types."} +``` + +At the end, a report named `queries_reviews_report.md` is generated with all the queries and their reviews. + +### Build and testing + +``` +pip install -r requirements.txt +pip install -e . +pytest +``` + +In the VDK repo, the [../build-plugin.sh](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/build-plugin.sh) script can also be used. + + +#### Note about the CICD: + +.plugin-ci.yml is needed only for plugins that are part of the [Versatile Data Kit Plugin repo](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins). + +The CI/CD is separated into two stages: a build stage and a release stage. +The build stage is made up of a few jobs, all of which inherit from the same +job configuration and only differ in the Python version they use (3.7 and 3.11). +They run according to rules, which are ordered in a way such that changes to a +plugin's directory trigger the plugin CI, but changes to a different plugin do not. 
diff --git a/projects/vdk-plugins/vdk-smarter/requirements.txt b/projects/vdk-plugins/vdk-smarter/requirements.txt
new file mode 100644
index 0000000000..4a1b3ecaa5
--- /dev/null
+++ b/projects/vdk-plugins/vdk-smarter/requirements.txt
@@ -0,0 +1,8 @@
+# this file is used to provide testing requirements
+# for requirements (dependencies) needed during and after installation of the plugin see (and update) setup.py install_requires section
+
+openai<1.0
+
+pytest
+vdk-core
+vdk-test-utils
diff --git a/projects/vdk-plugins/vdk-smarter/setup.py b/projects/vdk-plugins/vdk-smarter/setup.py
new file mode 100644
index 0000000000..a828ba511e
--- /dev/null
+++ b/projects/vdk-plugins/vdk-smarter/setup.py
@@ -0,0 +1,39 @@
+# Copyright 2021-2023 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+import pathlib
+
+import setuptools
+
+"""
+Builds a package with the help of setuptools in order for this package to be imported in other projects
+"""
+
+__version__ = "0.1.0"
+
+setuptools.setup(
+    name="vdk-smarter",
+    version=__version__,
+    url="https://github.com/vmware/versatile-data-kit",
+    description="Making VDK smarter by employing ML/AI.",
+    long_description=pathlib.Path("README.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    # openai is capped below 1.0: the plugin calls the legacy openai.Completion
+    # API, which openai 1.0 removed.
+    install_requires=["vdk-core", "openai<1.0"],
+    package_dir={"": "src"},
+    packages=setuptools.find_namespace_packages(where="src"),
+    # This is the only vdk plugin specific part
+    # Define entry point called "vdk.plugin.run" with name of plugin and module to act as entry point.
+    entry_points={
+        "vdk.plugin.run": ["vdk-smarter = vdk.plugin.smarter.openai_plugin_entry"]
+    },
+    classifiers=[
+        "Development Status :: 2 - Pre-Alpha",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+    ],
+)
diff --git a/projects/vdk-plugins/vdk-smarter/src/vdk/plugin/smarter/openai_plugin_entry.py b/projects/vdk-plugins/vdk-smarter/src/vdk/plugin/smarter/openai_plugin_entry.py
new file mode 100644
index 0000000000..2d8c1d4d76
--- /dev/null
+++ b/projects/vdk-plugins/vdk-smarter/src/vdk/plugin/smarter/openai_plugin_entry.py
@@ -0,0 +1,147 @@
+# Copyright 2021-2023 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from collections import OrderedDict
+from typing import List
+
+import openai
+from vdk.api.plugin.hook_markers import hookimpl
+from vdk.api.plugin.plugin_registry import IPluginRegistry
+from vdk.internal.builtin_plugins.connection.decoration_cursor import DecorationCursor
+from vdk.internal.builtin_plugins.connection.execution_cursor import ExecutionCursor
+from vdk.internal.core.config import ConfigurationBuilder
+from vdk.internal.core.context import CoreContext
+
+log = logging.getLogger(__name__)
+
+# Default completion model, shared by the plugin's initial state and the
+# "openai_model" configuration option so the two cannot drift apart.
+DEFAULT_OPENAI_MODEL = "text-davinci-003"
+
+# Report file, created in the current working directory by vdk_exit.
+REVIEWS_REPORT_FILE = "queries_reviews_report.md"
+
+
+class OpenAiPlugin:
+    """
+    VDK plugin that asks an OpenAI completion model to review SQL queries.
+
+    When "openai_review_enabled" is set, each SQL operation handed to the
+    db_connection_decorate_operation hook is sent for review, the review is
+    logged, and a markdown report of all reviews is written in vdk_exit.
+    """
+
+    def __init__(self):
+        self._review_enabled = False
+        # SQL text -> review text, kept in the order queries were first seen.
+        self._queries = OrderedDict()
+        self._openai_model = DEFAULT_OPENAI_MODEL
+
+    @hookimpl(tryfirst=True)
+    def vdk_configure(self, config_builder: ConfigurationBuilder):
+        """Register the configuration options read by this plugin."""
+        # TODO: support non open ai models and make it configurable
+        config_builder.add(
+            key="openai_api_key",
+            default_value="",
+            description="""
+            OpenAI API key. You can generate one on your OpenAI account page.
+            (possibly https://platform.openai.com/account/api-keys)
+            See best practices for api keys in https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety
+            """,
+        )
+        config_builder.add(
+            key="openai_model",
+            default_value=DEFAULT_OPENAI_MODEL,
+            description="""
+            OpenAI model to be used.
+            See more in https://platform.openai.com/docs/models/overview
+            """,
+        )
+        config_builder.add(
+            key="openai_review_enabled",
+            default_value=False,
+            description="If enabled, it will review each SQL query executed by the job and create summary file at the end.",
+        )
+
+    @hookimpl
+    def vdk_initialize(self, context: CoreContext) -> None:
+        """Read the API key, review switch and model name from the configuration."""
+        openai.api_key = context.configuration.get_value("openai_api_key")
+        self._review_enabled = context.configuration.get_value("openai_review_enabled")
+        self._openai_model = context.configuration.get_value("openai_model")
+
+    def _review_sql_query(self, sql_query: str):
+        """
+        Ask the configured OpenAI model to review a SQL query.
+
+        The review is also stored, keyed by the query text, for the exit report.
+
+        :param sql_query: the SQL text to review
+        :return: the stripped text of the first completion choice
+        """
+        # TODO: refine the prompt and make it configurable
+        prompt = (
+            """Using your extensive knowledge of Impala SQL, analyze the following SQL query and provide a specific feedback.
+            The feedback should include its efficiency, readability, possible optimization, potential errors,
+            adherence to best practices, and security vulnerabilities, if any. Provide a score (1 a lot of work needed, 5 - no further changes needed)
+            Return the answer in format {"score": ?, "review": "?"
+            } Here is the SQL query:
+            """
+            + sql_query
+        )
+
+        # Generate the review
+        # TODO: make configurable most things
+        response = openai.Completion.create(
+            engine=self._openai_model,
+            prompt=prompt,
+            max_tokens=1000,
+            n=1,
+            stop=None,
+            temperature=0.7,
+        )
+
+        # Extract the generated review from the response
+        review = response.choices[0].text.strip()
+        self._queries[sql_query] = review
+
+        return review
+
+    @hookimpl
+    def db_connection_decorate_operation(
+        self, decoration_cursor: DecorationCursor
+    ) -> None:
+        """Review and log the cursor's SQL operation when reviews are enabled."""
+        if not self._review_enabled:
+            return
+        try:
+            sql_query = decoration_cursor.get_managed_operation().get_operation()
+            review = self._review_sql_query(sql_query)
+            log.info(f"Query:\n{sql_query}\n\nReview:\n{review}\n")
+        except Exception as e:
+            # Reviews are best-effort: a failed review must not fail the job.
+            log.error(f"Failed to review SQL query: {e}")
+
+    @hookimpl
+    def vdk_exit(self, context: CoreContext, exit_code: int) -> None:
+        """Write the markdown report of reviewed queries, if reviews are enabled."""
+        if not self._review_enabled:
+            return
+        try:
+            # Queries and reviews may contain non-ASCII text, so do not rely
+            # on the platform default encoding.
+            with open(REVIEWS_REPORT_FILE, "w", encoding="utf-8") as f:
+                f.write("# SQL Query Reviews\n")
+                for query, review in self._queries.items():
+                    f.write(f"## SQL Query\n\n```sql\n{query}\n```\n\n")
+                    f.write(f"### Review\n\n{review}\n\n")
+        except OSError as e:
+            # Same best-effort policy as the reviews themselves.
+            log.error(f"Failed to write SQL query reviews report: {e}")
+
+
+@hookimpl
+def vdk_start(plugin_registry: IPluginRegistry, command_line_args: List):
+    """Register the OpenAI review plugin in the given plugin registry."""
+    plugin_registry.load_plugin_with_hooks_impl(OpenAiPlugin(), "OpenAiPlugin")