Skip to content

Commit

Permalink
Merge main into person/miroslavi/introduce-data-job-deployment-entity
Browse files Browse the repository at this point in the history
  • Loading branch information
github-actions[bot] authored Aug 29, 2023
2 parents 95029e4 + e1bb698 commit 1a4f3a3
Show file tree
Hide file tree
Showing 10 changed files with 519 additions and 0 deletions.
22 changes: 22 additions & 0 deletions projects/vdk-plugins/vdk-duckdb/.plugin-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright 2021-2023 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0

# Image declared at the top level of the file.
# NOTE(review): presumably the default for jobs that do not set their own image — confirm against the CI system in use.
image: "python:3.7"

# Shared settings for the build jobs below (both of them extend this entry).
# NOTE(review): .build-plugin is not defined in this file — verify where it is provided.
.build-vdk-duckdb:
variables:
PLUGIN_NAME: vdk-duckdb
extends: .build-plugin

# Build job using the python:3.7 image.
build-py37-vdk-duckdb:
extends: .build-vdk-duckdb
image: "python:3.7"

# Build job using the python:3.11 image.
build-py311-vdk-duckdb:
extends: .build-vdk-duckdb
image: "python:3.11"

# Release job for the vdk-duckdb plugin.
# NOTE(review): .release-plugin is not defined in this file — verify where it is provided.
release-vdk-duckdb:
variables:
PLUGIN_NAME: vdk-duckdb
extends: .release-plugin
45 changes: 45 additions & 0 deletions projects/vdk-plugins/vdk-duckdb/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# duckdb

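A Versatile Data Kit (VDK) plugin that adds DuckDB support: it registers a `DUCKDB` connection for data jobs and adds a `vdk duckdb-query` command for running queries against a local DuckDB database file.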

TODO: what the project is about, what is its purpose


## Usage

```
pip install vdk-duckdb
```

### Configuration

(`vdk config-help` is a useful command for browsing all the config options of your vdk installation.)

| Name | Description | (example) Value |
|---|---|---|
| DUCKDB_FILE | Path to the DuckDB database file | "mydb.duckdb" |

### Example

TODO

### Build and testing

```
pip install -r requirements.txt
pip install -e .
pytest
```

In the VDK repo, the [../build-plugin.sh](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/build-plugin.sh) script can also be used.


#### Note about the CICD:

`.plugin-ci.yml` is needed only for plugins that are part of the [Versatile Data Kit Plugin repo](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins).

The CI/CD is separated into two stages, a build stage and a release stage.
The build stage is made up of a few jobs, all of which inherit from the same
job configuration and differ only in the Python version they use (3.7 and 3.11).
They run according to rules, which are ordered such that changes to a
plugin's directory trigger the plugin CI, but changes to a different plugin do not.
12 changes: 12 additions & 0 deletions projects/vdk-plugins/vdk-duckdb/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# this file is used to provide testing requirements
# for requirements (dependencies) needed during and after installation of the plugin see (and update) setup.py install_requires section

click
duckdb
pytest

pytest
pytest-cov

vdk-core
vdk-test-utils
40 changes: 40 additions & 0 deletions projects/vdk-plugins/vdk-duckdb/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2021-2023 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import pathlib

import setuptools

"""
Builds a package with the help of setuptools in order for this package to be imported in other projects
"""

__version__ = "0.1.0"

setuptools.setup(
    name="vdk-duckdb",
    version=__version__,
    url="https://github.com/vmware/versatile-data-kit",
    description="DuckDB Plugin for VDK.",
    # Read explicitly as UTF-8 so the build does not depend on the locale's
    # default encoding.
    long_description=pathlib.Path("README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    # duckdb is imported directly by the plugin modules, so it has to be a
    # runtime dependency and not only a test requirement.
    install_requires=["vdk-core", "duckdb", "tabulate"],
    package_dir={"": "src"},
    packages=setuptools.find_namespace_packages(where="src"),
    # This is the only vdk plugin specific part
    # Define entry point called "vdk.plugin.run" with name of plugin and module to act as entry point.
    entry_points={"vdk.plugin.run": ["vdk-duckdb = vdk.plugin.duckdb.duckdb_plugin"]},
    classifiers=[
        "Development Status :: 2 - Pre-Alpha",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    project_urls={
        "Documentation": "https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/vdk-duckdb",
        "Source Code": "https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/vdk-duckdb",
        "Bug Tracker": "https://github.com/vmware/versatile-data-kit/issues/new/choose",
    },
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2021-2023 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import pathlib
import tempfile

from vdk.internal.core.config import Configuration
from vdk.internal.core.config import ConfigurationBuilder

# Configuration key for the file of the DuckDB database.
DUCKDB_FILE = "DUCKDB_FILE"
# Configuration key controlling whether a missing table is auto-created during ingestion.
DUCKDB_INGEST_AUTO_CREATE_TABLE_ENABLED = "DUCKDB_INGEST_AUTO_CREATE_TABLE_ENABLED"


class DuckDBConfiguration:
    """Accessors for the DuckDB-related values of a VDK ``Configuration``."""

    def __init__(self, configuration: Configuration):
        self.__config = configuration

    def get_auto_create_table_enabled(self) -> bool:
        """Return the value stored under ``DUCKDB_INGEST_AUTO_CREATE_TABLE_ENABLED``."""
        enabled = self.__config.get_value(DUCKDB_INGEST_AUTO_CREATE_TABLE_ENABLED)
        return enabled

    def get_duckdb_file(self):
        """Return the database file location as a ``pathlib.Path``.

        When the value stored under ``DUCKDB_FILE`` is falsy, the relative
        path ``default_path.duckdb`` is used instead.
        """
        configured_path = self.__config.get_value(DUCKDB_FILE)
        if not configured_path:
            configured_path = "default_path.duckdb"
        return pathlib.Path(configured_path)


def add_definitions(config_builder: ConfigurationBuilder) -> None:
    """Register the DuckDB configuration options on ``config_builder``.

    Two keys are added:

    * ``DUCKDB_FILE`` - defaults to ``vdk-duckdb.db`` inside the system
      temporary directory.
    * ``DUCKDB_INGEST_AUTO_CREATE_TABLE_ENABLED`` - defaults to ``True``.

    :param config_builder: the builder the options are added to.
    """
    config_builder.add(
        key=DUCKDB_FILE,
        default_value=str(
            pathlib.Path(tempfile.gettempdir()).joinpath("vdk-duckdb.db")
        ),
        description="The file of the DuckDB database.",
    )
    config_builder.add(
        key=DUCKDB_INGEST_AUTO_CREATE_TABLE_ENABLED,
        default_value=True,
        # The trailing space matters: the two literals are concatenated, and
        # without it the sentences run together ("ingestion.This is ...").
        description="If set to true, auto create table if it does not exist during ingestion. "
        "This is only applicable when ingesting data into DuckDB (ingest method is DuckDB).",
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2021-2023 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import logging
import pathlib
import tempfile
from typing import List

import duckdb
from vdk.internal.util.decorators import closing_noexcept_on_close

# Module-level logger; DuckDBConnection.new_connection reports each new connection through it.
log = logging.getLogger(__name__)


class DuckDBConnection:
    """
    Create file based DuckDB database.

    Every call to ``new_connection`` opens a separate connection to the
    configured database file.
    """

    def __init__(
        self,
        duckdb_file: pathlib.Path = pathlib.Path(tempfile.gettempdir()).joinpath(
            "vdk-duckdb.db"
        ),
    ):
        """
        :param duckdb_file: path of the database file; defaults to
            ``vdk-duckdb.db`` inside the system temporary directory.
        """
        self.__db_file = duckdb_file

    def new_connection(self):
        """Open and return a new DuckDB connection to the database file.

        The returned connection is not tracked by this object, so the caller
        is responsible for closing it.
        """
        log.info(
            f"Creating new connection against local file database located at: {self.__db_file}"
        )
        return duckdb.connect(f"{self.__db_file}")

    def execute_query(self, query: str) -> List[List]:
        """Execute ``query`` on a fresh connection and return all fetched rows.

        :param query: the SQL statement to execute.
        :return: whatever ``cursor.fetchall()`` yields for the statement.
        """
        conn = self.new_connection()
        try:
            with closing_noexcept_on_close(conn.cursor()) as cursor:
                cursor.execute(query)
                return cursor.fetchall()
        finally:
            # Closing the cursor does not close the connection it was created
            # from; release it explicitly so repeated calls do not leak
            # connections to the database file.
            conn.close()
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright 2021-2023 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import logging
import pathlib

import click
import duckdb
from tabulate import tabulate
from vdk.api.plugin.hook_markers import hookimpl
from vdk.internal.builtin_plugins.run.job_context import JobContext
from vdk.internal.core.config import ConfigurationBuilder
from vdk.internal.util.decorators import closing_noexcept_on_close

# Module-level logger for the DuckDB plugin module.
log = logging.getLogger(__name__)
# This module is the plugin entry point: it holds the hook implementations
# (configuration, job initialization, command line) and the "duckdb-query"
# CLI command.


@hookimpl
def vdk_configure(config_builder: ConfigurationBuilder) -> None:
    """Define the configuration settings needed for duckdb.

    Registers the ``DUCKDB_FILE`` option with the default ``mydb.duckdb``.

    :param config_builder: the builder the option is added to.
    """
    config_builder.add(
        "DUCKDB_FILE",
        default_value="mydb.duckdb",
        # Give the option a description so it is documented like the other
        # DuckDB options instead of being registered bare.
        description="The file of the DuckDB database.",
    )


@hookimpl
def initialize_job(context: JobContext) -> None:
    """Register a connection factory named "DUCKDB" on the job's connections.

    The database location is read from the "DUCKDB_FILE" configuration value
    once, when this hook runs; the factory opens a DuckDB connection to it
    each time it is invoked.
    """
    database_path = context.core_context.configuration.get_value("DUCKDB_FILE")

    def open_duckdb_connection():
        return duckdb.connect(database=database_path)

    context.connections.add_open_connection_factory_method(
        "DUCKDB", open_duckdb_connection
    )


@click.command(
    name="duckdb-query", help="Execute a DuckDB query against a local DUCKDB database."
)
@click.option("-q", "--query", type=click.STRING, required=True)
@click.pass_context
def duckdb_query(ctx: click.Context, query):
    """Run ``query`` against the configured DuckDB file and print the result as a table.

    :param ctx: click context; ``ctx.obj.configuration`` supplies "DUCKDB_FILE".
    :param query: the SQL statement passed with ``-q`` / ``--query``.
    """
    conf = ctx.obj.configuration
    duckdb_file = conf.get_value("DUCKDB_FILE")
    conn = duckdb.connect(database=duckdb_file)

    try:
        with closing_noexcept_on_close(conn.cursor()) as cursor:
            cursor.execute(query)
            column_names = (
                [column_info[0] for column_info in cursor.description]
                if cursor.description
                else ()  # same as the default value for the headers parameters of the tabulate function
            )
            res = cursor.fetchall()
    finally:
        # Closing the cursor does not close the connection it was created
        # from; release the connection explicitly, also when the query fails.
        conn.close()

    click.echo(tabulate(res, headers=column_names))


@hookimpl
def vdk_command_line(root_command: click.Group):
    """Extend vdk with the "duckdb-query" command by adding it to the root command group."""
    query_command = duckdb_query
    root_command.add_command(query_command)
Loading

0 comments on commit 1a4f3a3

Please sign in to comment.