From 5f667aebeee52abd8dc7fb0521f53722c4b9e586 Mon Sep 17 00:00:00 2001 From: Antoni Ivanov Date: Wed, 1 Nov 2023 13:22:22 +0200 Subject: [PATCH] vdk-data-source-git: data source for git POC Extracts content from Git repositories along with associated file metadata. See README for more details --- .../vdk-data-source-git/.plugin-ci.yml | 22 +++++ .../vdk-plugins/vdk-data-source-git/README.md | 60 +++++++++++++ .../vdk-data-source-git/requirements.txt | 7 ++ .../vdk-plugins/vdk-data-source-git/setup.py | 44 ++++++++++ .../vdk/plugin/data_source_git/git_source.py | 88 +++++++++++++++++++ .../plugin/data_source_git/plugin_entry.py | 23 +++++ .../src/vdk/plugin/data_source_git/utils.py | 16 ++++ .../vdk-plugins/vdk-data-source-git/tests/dsc | 1 + .../tests/jobs/ingest-git-job/step.py | 16 ++++ .../tests/test_run_ingest_git.py | 24 +++++ 10 files changed, 301 insertions(+) create mode 100644 projects/vdk-plugins/vdk-data-source-git/.plugin-ci.yml create mode 100644 projects/vdk-plugins/vdk-data-source-git/README.md create mode 100644 projects/vdk-plugins/vdk-data-source-git/requirements.txt create mode 100644 projects/vdk-plugins/vdk-data-source-git/setup.py create mode 100644 projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/git_source.py create mode 100644 projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/plugin_entry.py create mode 100644 projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/utils.py create mode 160000 projects/vdk-plugins/vdk-data-source-git/tests/dsc create mode 100644 projects/vdk-plugins/vdk-data-source-git/tests/jobs/ingest-git-job/step.py create mode 100644 projects/vdk-plugins/vdk-data-source-git/tests/test_run_ingest_git.py diff --git a/projects/vdk-plugins/vdk-data-source-git/.plugin-ci.yml b/projects/vdk-plugins/vdk-data-source-git/.plugin-ci.yml new file mode 100644 index 0000000000..176c888158 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/.plugin-ci.yml @@ -0,0 
+1,22 @@ +# Copyright 2021-2023 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 + +image: "python:3.7" + +.build-vdk-data-source-git: + variables: + PLUGIN_NAME: vdk-data-source-git + extends: .build-plugin + +build-py37-vdk-data-source-git: + extends: .build-vdk-data-source-git + image: "python:3.7" + +build-py311-vdk-data-source-git: + extends: .build-vdk-data-source-git + image: "python:3.11" + +release-vdk-data-source-git: + variables: + PLUGIN_NAME: vdk-data-source-git + extends: .release-plugin diff --git a/projects/vdk-plugins/vdk-data-source-git/README.md b/projects/vdk-plugins/vdk-data-source-git/README.md new file mode 100644 index 0000000000..2d7d2cd7d6 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/README.md @@ -0,0 +1,60 @@ +# data-source-git + +Extracts content from Git repositories along with associated file metadata. + +## Usage + +``` +pip install vdk-data-source-git +``` + +### Extracted Data Schema + +The extracted data is returned in a `DataSourcePayload` object with two main components: `content` and `metadata`. + +#### `content` + +The `content` field contains the actual content of the file as a string. 
+ +#### `metadata` + +The `metadata` field contains a dictionary with the following schema: + +| Key | Description | Data Type | Example | +|------------------------|---------------------------------------------------|-----------|---------------| +| `size` | The size of the file in bytes | Integer | 12345 | +| `path` | The path of the file in the repository | String | "src/main.py" | +| `num_lines` | The number of lines in the file | Integer | 678 | +| `file_extension` | The file extension | String | ".py" | +| `programming_language` | The detected programming language of the file | String | "Python" | +| `is_likely_test_file` | Flag indicating if the file is likely a test file | Boolean | false | + +### Configuration + +(`vdk config-help` is a useful command to browse all config options of your installation of vdk) + +| Name | Description | (example) Value | +|---------|------------------------------------------|--------------------------------| +| git_url | URL of the Git repository to be cloned. | "https://github.com/user/repo" | + + +### Build and testing + +``` +pip install -r requirements.txt +pip install -e . +pytest +``` + +In the VDK repo, the [../build-plugin.sh](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/build-plugin.sh) script can also be used. + + +#### Note about the CI/CD: + +.plugin-ci.yml is needed only for plugins that are part of the [Versatile Data Kit Plugin repo](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins). + +The CI/CD is separated into two stages, a build stage and a release stage. +The build stage is made up of a few jobs, all of which inherit from the same +job configuration and only differ in the Python version they use (3.7 and 3.11). +They run according to rules, which are ordered in a way such that changes to a +plugin's directory trigger the plugin CI, but changes to a different plugin do not.
diff --git a/projects/vdk-plugins/vdk-data-source-git/requirements.txt b/projects/vdk-plugins/vdk-data-source-git/requirements.txt new file mode 100644 index 0000000000..900e1b689f --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/requirements.txt @@ -0,0 +1,7 @@ +# this file is used to provide testing requirements +# for requirements (dependencies) needed during and after installation of the plugin see (and update) setup.py install_requires section + + +pytest +vdk-core +vdk-test-utils diff --git a/projects/vdk-plugins/vdk-data-source-git/setup.py b/projects/vdk-plugins/vdk-data-source-git/setup.py new file mode 100644 index 0000000000..45be378d5d --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/setup.py @@ -0,0 +1,44 @@ +# Copyright 2021-2023 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +import pathlib + +import setuptools + +""" +Builds a package with the help of setuptools in order for this package to be imported in other projects +""" + +__version__ = "0.1.0" + +setuptools.setup( + name="vdk-data-source-git", + version=__version__, + url="https://github.com/vmware/versatile-data-kit", + description="Read Git repository data source", + long_description=pathlib.Path("README.md").read_text(), + long_description_content_type="text/markdown", + install_requires=["vdk-core", "vdk-data-sources", "pygments", "dulwich"], + package_dir={"": "src"}, + packages=setuptools.find_namespace_packages(where="src"), + # This is the only vdk plugin specific part + # Define entry point called "vdk.plugin.run" with name of plugin and module to act as entry point.
+ entry_points={ + "vdk.plugin.run": [ + "vdk-data-source-git = vdk.plugin.data_source_git.plugin_entry" + ] + }, + classifiers=[ + "Development Status :: 2 - Pre-Alpha", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], + project_urls={ + "Documentation": "https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/vdk-data-source-git", + "Source Code": "https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/vdk-data-source-git", + "Bug Tracker": "https://github.com/vmware/versatile-data-kit/issues/new/choose", + }, +) diff --git a/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/git_source.py b/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/git_source.py new file mode 100644 index 0000000000..2bf5bdffc1 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/git_source.py @@ -0,0 +1,88 @@ +# Copyright 2021-2023 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0 +import os +import tempfile +from typing import Iterator +from typing import List +from typing import Optional + +from dulwich import porcelain +from vdk.plugin.data_source_git.utils import detect_language +from vdk.plugin.data_source_git.utils import is_test_file +from vdk.plugin.data_sources.config import config_class +from vdk.plugin.data_sources.config import config_field +from vdk.plugin.data_sources.data_source import DataSourcePayload +from vdk.plugin.data_sources.data_source import IDataSource +from vdk.plugin.data_sources.data_source import ( + IDataSourceConfiguration, +) +from vdk.plugin.data_sources.data_source import IDataSourceStream +from vdk.plugin.data_sources.factory import data_source +from vdk.plugin.data_sources.state import IDataSourceState + +DESCRIPTION = """Git data source. +Extract content from Git repositories and associated file metadata. +""" + + +@config_class(name="git", description=DESCRIPTION) +class GitDataSourceConfiguration(IDataSourceConfiguration): + git_url: str = config_field(description="Git URL that would be cloned. ") + git_ssh_key: Optional[str] = config_field(  # NOTE(review): not used yet - GitDataSource passes only git_url to the stream + description="SSH key to use when cloning the repo. " + "Leave empty if no authentication is needed", + default="", + ) + + +@data_source(name="git", config_class=GitDataSourceConfiguration) +class GitDataSource(IDataSource): + """ + Data source that extracts file content and file metadata from a Git repository.
+ """ + + def __init__(self): + self._config = None  # set by configure() + self._streams = []  # filled by connect(), emptied by disconnect() + + def configure(self, config: GitDataSourceConfiguration): + self._config = config + + def connect(self, state: IDataSourceState):  # state is not used: every read() clones the repository again + if not self._streams: + self._streams = [GitDataSourceStream(self._config.git_url)]  # a single stream for the configured URL + + def disconnect(self): + self._streams = [] + + def streams(self) -> List[IDataSourceStream]: + return self._streams + + +class GitDataSourceStream(IDataSourceStream): + """Clones the repository at the given URL and yields one payload per file in its index.""" + + def name(self) -> str: + return self._url  # the stream is named after the repository URL + + def __init__(self, url: str): + self._url = url + + def read(self) -> Iterator[DataSourcePayload]: + with tempfile.TemporaryDirectory() as tmp_dir: + # Shallow clone (depth=1); the repo is closed before the temporary directory is removed. + with porcelain.clone(source=self._url, target=tmp_dir, depth=1) as repo: + for path, entry in repo.open_index().items(): + file_path = path.decode("utf-8") + blob = repo.get_object(entry.sha) + # TODO: VDK send_object/tabular_data for ingestion doesn't support bytes so we convert it for now. + data = blob.data.decode("utf-8", errors="replace")  # binary or non-UTF-8 files must not abort the whole stream + metadata = { + "size": len(blob.data),  # size in bytes, not in decoded characters + "path": file_path, + "num_lines": data.count("\n") + 1, + "file_extension": os.path.splitext(file_path)[1], + "programming_language": detect_language(file_path), + "is_likely_test_file": is_test_file(file_path), + } + yield DataSourcePayload({"content": data}, metadata=metadata) diff --git a/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/plugin_entry.py b/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/plugin_entry.py new file mode 100644 index 0000000000..ca4ca43212 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/plugin_entry.py @@ -0,0 +1,23 @@ +# Copyright 2021-2023 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0 +from typing import List + +from vdk.api.plugin.hook_markers import hookimpl +from vdk.api.plugin.plugin_registry import IPluginRegistry +from vdk.plugin.data_source_git.git_source import GitDataSource +from vdk.plugin.data_sources.factory import IDataSourceFactory + +""" +Plugin entry point: registers GitDataSource with the data source factory when vdk starts. +""" + + +class DataSourceGitPlugin:  # hook implementations that vdk_start loads into the plugin registry + @hookimpl + def vdk_data_sources_register(self, data_source_factory: IDataSourceFactory): + data_source_factory.register_data_source_class(GitDataSource) + + +@hookimpl +def vdk_start(plugin_registry: IPluginRegistry, command_line_args: List):  # command_line_args is not used here + plugin_registry.load_plugin_with_hooks_impl(DataSourceGitPlugin(), "DataSourceGit") diff --git a/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/utils.py b/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/utils.py new file mode 100644 index 0000000000..3d27672ffa --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/utils.py @@ -0,0 +1,16 @@ +# Copyright 2021-2023 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0 +from pygments.lexers import get_lexer_for_filename +from pygments.util import ClassNotFound + + +def is_test_file(file_path: str) -> bool:  # heuristic only: any path containing "test" counts, e.g. "latest.txt" too + return "test" in file_path.lower()  # case-insensitive so "FooTest.java" and "Tests/" are recognised as well + + +def detect_language(file_path: str) -> str:  # returns the pygments lexer name chosen from the file name, or "Unknown" + try: + lexer = get_lexer_for_filename(file_path) + return lexer.name + except ClassNotFound:  # pygments found no lexer matching this file name + return "Unknown" diff --git a/projects/vdk-plugins/vdk-data-source-git/tests/dsc b/projects/vdk-plugins/vdk-data-source-git/tests/dsc new file mode 160000 index 0000000000..2c9988e7f3 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/tests/dsc @@ -0,0 +1 @@ +Subproject commit 2c9988e7f324100d973537c1a7b5517f090399e0 diff --git a/projects/vdk-plugins/vdk-data-source-git/tests/jobs/ingest-git-job/step.py b/projects/vdk-plugins/vdk-data-source-git/tests/jobs/ingest-git-job/step.py new file mode 100644 index 0000000000..9fe07abc59 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/tests/jobs/ingest-git-job/step.py @@ -0,0 +1,16 @@ +# Copyright 2021-2023 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput +from vdk.plugin.data_sources.mapping.data_flow import DataFlowInput +from vdk.plugin.data_sources.mapping.definitions import DataFlowMappingDefinition +from vdk.plugin.data_sources.mapping.definitions import DestinationDefinition +from vdk.plugin.data_sources.mapping.definitions import SourceDefinition + + +def run(job_input: IJobInput):  # job step: map the "git" data source to the "memory" ingestion method + url = "https://github.com/versatile-data-kit-demo/dsc"  # remote repository to clone, so this job needs network access + source = SourceDefinition(id=url, name="git", config={"git_url": url}) + destination = DestinationDefinition(id="test", method="memory")  # presumably handled by IngestIntoMemoryPlugin in the test - confirm + + with DataFlowInput(job_input) as flow_input: + flow_input.start(DataFlowMappingDefinition(source, destination)) diff --git a/projects/vdk-plugins/vdk-data-source-git/tests/test_run_ingest_git.py b/projects/vdk-plugins/vdk-data-source-git/tests/test_run_ingest_git.py new file mode 100644 index 0000000000..46617e3aa3 --- /dev/null +++ b/projects/vdk-plugins/vdk-data-source-git/tests/test_run_ingest_git.py @@ -0,0 +1,24 @@ +# Copyright 2021-2023 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from click.testing import Result +from vdk.plugin.data_source_git import plugin_entry +from vdk.plugin.data_sources import plugin_entry as data_sources_plugin_entry +from vdk.plugin.test_utils.util_funcs import cli_assert_equal +from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner +from vdk.plugin.test_utils.util_funcs import jobs_path_from_caller_directory +from vdk.plugin.test_utils.util_plugins import IngestIntoMemoryPlugin + + +def test_run_ingest_git():  # end-to-end: run the ingest-git-job and check that payloads were ingested + ingest_plugin = IngestIntoMemoryPlugin() + runner = CliEntryBasedTestRunner( + ingest_plugin, data_sources_plugin_entry, plugin_entry + ) + + result: Result = runner.invoke( + ["run", jobs_path_from_caller_directory("ingest-git-job")] + ) + + cli_assert_equal(0, result)  # presumably asserts exit code 0 for the run - confirm against test utils + + assert len(ingest_plugin.payloads) > 0  # at least one payload must have been collected by the plugin