-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
vdk-data-source-git: data source for git POC
Extracts content from Git repositories along with associated file metadata. See README for more details
- Loading branch information
1 parent
5369fc6
commit cd9e53c
Showing
10 changed files
with
301 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
image: "python:3.7" | ||
|
||
.build-vdk-data-source-git: | ||
variables: | ||
PLUGIN_NAME: vdk-data-source-git | ||
extends: .build-plugin | ||
|
||
build-py37-vdk-data-source-git: | ||
extends: .build-vdk-data-source-git | ||
image: "python:3.7" | ||
|
||
build-py311-vdk-data-source-git: | ||
extends: .build-vdk-data-source-git | ||
image: "python:3.11" | ||
|
||
release-vdk-data-source-git: | ||
variables: | ||
PLUGIN_NAME: vdk-data-source-git | ||
extends: .release-plugin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# data-source-git | ||
|
||
Extracts content from Git repositories along with associated file metadata. | ||
|
||
## Usage | ||
|
||
``` | ||
pip install vdk-data-source-git | ||
``` | ||
|
||
### Extracted Data Schema | ||
|
||
The extracted data is returned in a `DataSourcePayload` object with two main components: `content` and `metadata`. | ||
|
||
#### `content` | ||
|
||
The `content` field contains the actual content of the file as a string. | ||
|
||
#### `metadata` | ||
|
||
The `metadata` field contains a dictionary with the following schema: | ||
|
||
| Key | Description | Data Type | Example | | ||
|------------------------|---------------------------------------------------|-----------|---------------| | ||
| `size` | The size of the file in bytes | Integer | 12345 | | ||
| `path` | The path of the file in the repository | String | "src/main.py" | | ||
| `num_lines` | The number of lines in the file | Integer | 678 | | ||
| `file_extension` | The file extension | String | ".py" | | ||
| `programming_language` | The detected programming language of the file | String | "Python" | | ||
| `is_likely_test_file` | Flag indicating if the file is likely a test file | Boolean | false | | ||
|
||
### Configuration | ||
|
||
(`vdk config-help` is a useful command for browsing all config options of your vdk installation) | ||
|
||
| Name | Description | (example) Value | | ||
|---------|------------------------------------------|--------------------------------| | ||
| git_url | URL of the Git repository to be cloned. | "https://github.com/user/repo" | | ||
|
||
|
||
### Build and testing | ||
|
||
``` | ||
pip install -r requirements.txt | ||
pip install -e . | ||
pytest | ||
``` | ||
|
||
In the VDK repo, the [../build-plugin.sh](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/build-plugin.sh) script can also be used. | ||
|
||
|
||
#### Note about the CICD: | ||
|
||
.plugin-ci.yaml is needed only for plugins part of [Versatile Data Kit Plugin repo](https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins). | ||
|
||
The CI/CD is separated into two stages, a build stage and a release stage. | ||
The build stage is made up of a few jobs, all of which inherit from the same | ||
job configuration and only differ in the Python version they use (3.7 and 3.11). | ||
They run according to rules, which are ordered in a way such that changes to a | ||
plugin's directory trigger the plugin CI, but changes to a different plugin do not. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# this file is used to provide testing requirements | ||
# for requirements (dependencies) needed during and after installation of the plugin see (and update) setup.py install_requires section | ||
|
||
|
||
pytest | ||
vdk-core | ||
vdk-test-utils |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
import pathlib | ||
|
||
import setuptools | ||
|
||
""" | ||
Builds a package with the help of setuptools in order for this package to be imported in other projects | ||
""" | ||
|
||
__version__ = "0.1.0" | ||
|
||
setuptools.setup(
    name="vdk-data-source-git",
    version=__version__,
    url="https://github.com/vmware/versatile-data-kit",
    description="Read Git repository data source",
    # Read the README explicitly as UTF-8 so the build does not depend on the
    # platform's default text encoding.
    long_description=pathlib.Path("README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    # vdk-data-sources provides the vdk.plugin.data_sources package that the
    # plugin modules import at load time, so it must be installed with the plugin.
    install_requires=["vdk-core", "vdk-data-sources", "pygments", "dulwich"],
    package_dir={"": "src"},
    packages=setuptools.find_namespace_packages(where="src"),
    # This is the only vdk plugin specific part
    # Define entry point called "vdk.plugin.run" with name of plugin and module to act as entry point.
    entry_points={
        "vdk.plugin.run": [
            "vdk-data-source-git = vdk.plugin.data_source_git.plugin_entry"
        ]
    },
    classifiers=[
        "Development Status :: 2 - Pre-Alpha",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    project_urls={
        "Documentation": "https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/vdk-data-source-git",
        "Source Code": "https://github.com/vmware/versatile-data-kit/tree/main/projects/vdk-plugins/vdk-data-source-git",
        "Bug Tracker": "https://github.com/vmware/versatile-data-kit/issues/new/choose",
    },
)
88 changes: 88 additions & 0 deletions
88
projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/git_source.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
import logging
import os
import tempfile
from typing import Iterator
from typing import List
from typing import Optional
|
||
from dulwich import porcelain | ||
from vdk.plugin.data_source_git.utils import detect_language | ||
from vdk.plugin.data_source_git.utils import is_test_file | ||
from vdk.plugin.data_sources.config import config_class | ||
from vdk.plugin.data_sources.config import config_field | ||
from vdk.plugin.data_sources.data_source import DataSourcePayload | ||
from vdk.plugin.data_sources.data_source import IDataSource | ||
from vdk.plugin.data_sources.data_source import ( | ||
IDataSourceConfiguration, | ||
) | ||
from vdk.plugin.data_sources.data_source import IDataSourceStream | ||
from vdk.plugin.data_sources.factory import data_source | ||
from vdk.plugin.data_sources.state import IDataSourceState | ||
|
||
DESCRIPTION = """Git data source. | ||
Extract content from Git repositories and associated file metadata. | ||
""" | ||
|
||
|
||
@config_class(name="git", description=DESCRIPTION)
class GitDataSourceConfiguration(IDataSourceConfiguration):
    # Configuration options accepted by the "git" data source.

    # URL of the repository to clone.
    git_url: str = config_field(description="Git URL that would be cloned. ")
    # NOTE(review): no code visible in this module reads git_ssh_key yet, so
    # setting it presumably has no effect on cloning - confirm before relying on it.
    git_ssh_key: Optional[str] = config_field(
        # The two literals are concatenated implicitly; the trailing space keeps
        # the sentences from running together ("repo.Leave").
        description="SSH key to use when cloning the repo. "
        "Leave empty if no authentication is needed",
        default="",
    )
|
||
|
||
@data_source(name="git", config_class=GitDataSourceConfiguration)
class GitDataSource(IDataSource):
    """
    Data source that exposes the files of a Git repository.

    It offers a single stream, created on connect(), which reads the
    repository located at the configured ``git_url``.
    """

    def __init__(self):
        self._config = None
        self._streams = []

    def configure(self, config: GitDataSourceConfiguration) -> None:
        """Store the configuration; it is used later by connect()."""
        self._config = config

    def connect(self, state: IDataSourceState) -> None:
        """
        Create the repository stream if it does not exist yet.

        configure() must have been called first, because the stream is built
        from the configured git_url. The state argument is not used.
        """
        if not self._streams:
            self._streams = [GitDataSourceStream(self._config.git_url)]

    def disconnect(self) -> None:
        """Drop the streams so that a later connect() creates a new one."""
        self._streams = []

    def streams(self) -> List[IDataSourceStream]:
        """Return the current streams (empty until connect() is called)."""
        return self._streams
|
||
|
||
class GitDataSourceStream(IDataSourceStream):
    """
    Stream that yields one payload per text file in the index of a Git repository.

    The repository is shallow-cloned (depth=1) into a temporary directory every
    time read() is iterated; the directory is removed when iteration ends.
    """

    def __init__(self, url: str):
        self._url = url

    def name(self) -> str:
        """Return the stream name, which is the repository URL."""
        return self._url

    def read(self) -> Iterator[DataSourcePayload]:
        """
        Clone the repository and yield the content and metadata of each file.

        Files whose bytes are not valid UTF-8 (e.g. images or other binaries)
        are skipped, because the content is sent as a string.

        :return: iterator of payloads with a "content" entry in the data and
            size, path, num_lines, file_extension, programming_language and
            is_likely_test_file in the metadata.
        """
        log = logging.getLogger(__name__)
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = porcelain.clone(source=self._url, target=tmp_dir, depth=1)
            try:
                for path, entry in repo.open_index().items():
                    file_path = path.decode("utf-8")
                    raw = repo.get_object(entry.sha).data
                    # TODO: VDK send_object/tabular_data for ingestion doesn't support bytes so we convert it for now.
                    try:
                        data = raw.decode("utf-8")
                    except UnicodeDecodeError:
                        # One binary file must not abort the whole stream.
                        log.info(
                            "Skipping %s: content is not valid UTF-8 text.",
                            file_path,
                        )
                        continue

                    # A trailing newline terminates the last line rather than
                    # starting a new one, and an empty file has no lines.
                    num_lines = data.count("\n")
                    if data and not data.endswith("\n"):
                        num_lines += 1

                    metadata = {
                        # Size in bytes of the stored blob, not characters.
                        "size": len(raw),
                        "path": file_path,
                        "num_lines": num_lines,
                        "file_extension": os.path.splitext(file_path)[1],
                        "programming_language": detect_language(file_path),
                        "is_likely_test_file": is_test_file(file_path),
                    }
                    yield DataSourcePayload({"content": data}, metadata=metadata)
            finally:
                # Release the repository's open files before the temporary
                # directory is deleted.
                repo.close()
23 changes: 23 additions & 0 deletions
23
projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/plugin_entry.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from typing import List | ||
|
||
from vdk.api.plugin.hook_markers import hookimpl | ||
from vdk.api.plugin.plugin_registry import IPluginRegistry | ||
from vdk.plugin.data_source_git.git_source import GitDataSource | ||
from vdk.plugin.data_sources.factory import IDataSourceFactory | ||
|
||
""" | ||
Entry point of the plugin: registers the Git data source with VDK. | ||
""" | ||
|
||
|
||
class DataSourceGitPlugin:
    """Hook implementations that make the Git data source available."""

    @hookimpl
    def vdk_data_sources_register(
        self, data_source_factory: IDataSourceFactory
    ) -> None:
        """Register the GitDataSource class with the given factory."""
        data_source_factory.register_data_source_class(GitDataSource)
|
||
|
||
@hookimpl
def vdk_start(plugin_registry: IPluginRegistry, command_line_args: List) -> None:
    """Load DataSourceGitPlugin into the plugin registry under "DataSourceGit"."""
    git_plugin = DataSourceGitPlugin()
    plugin_registry.load_plugin_with_hooks_impl(git_plugin, "DataSourceGit")
16 changes: 16 additions & 0 deletions
16
projects/vdk-plugins/vdk-data-source-git/src/vdk/plugin/data_source_git/utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from pygments.lexers import get_lexer_for_filename | ||
from pygments.util import ClassNotFound | ||
|
||
|
||
def is_test_file(file_path: str) -> bool:
    """
    Heuristically decide whether a path looks like it belongs to a test.

    The check is a case-insensitive substring match on "test", so it covers
    names such as "tests/test_a.py" as well as "FooTest.java". Being a plain
    substring match, it also flags unrelated names that merely contain those
    letters (for example "latest.txt").

    :param file_path: path of the file to check.
    :return: True if "test" occurs anywhere in the path, ignoring case.
    """
    return "test" in file_path.lower()
|
||
|
||
def detect_language(file_path: str) -> str:
    """
    Return the name of the pygments lexer matching the file name.

    :param file_path: path or file name used for the lexer lookup.
    :return: the lexer's name, or "Unknown" when pygments finds no lexer.
    """
    try:
        language = get_lexer_for_filename(file_path).name
    except ClassNotFound:
        language = "Unknown"
    return language
Submodule dsc
added at
2c9988
16 changes: 16 additions & 0 deletions
16
projects/vdk-plugins/vdk-data-source-git/tests/jobs/ingest-git-job/step.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from vdk.api.job_input import IJobInput | ||
from vdk.plugin.data_sources.mapping.data_flow import DataFlowInput | ||
from vdk.plugin.data_sources.mapping.definitions import DataFlowMappingDefinition | ||
from vdk.plugin.data_sources.mapping.definitions import DestinationDefinition | ||
from vdk.plugin.data_sources.mapping.definitions import SourceDefinition | ||
|
||
|
||
def run(job_input: IJobInput):
    """Start a data flow from the "git" source for the demo repo to the "memory" method."""
    repo_url = "https://github.com/versatile-data-kit-demo/dsc"
    git_source = SourceDefinition(
        id=repo_url, name="git", config={"git_url": repo_url}
    )
    memory_destination = DestinationDefinition(id="test", method="memory")

    with DataFlowInput(job_input) as flow_input:
        mapping = DataFlowMappingDefinition(git_source, memory_destination)
        flow_input.start(mapping)
24 changes: 24 additions & 0 deletions
24
projects/vdk-plugins/vdk-data-source-git/tests/test_run_ingest_git.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Copyright 2021-2023 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from click.testing import Result | ||
from vdk.plugin.data_source_git import plugin_entry | ||
from vdk.plugin.data_sources import plugin_entry as data_sources_plugin_entry | ||
from vdk.plugin.test_utils.util_funcs import cli_assert_equal | ||
from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner | ||
from vdk.plugin.test_utils.util_funcs import jobs_path_from_caller_directory | ||
from vdk.plugin.test_utils.util_plugins import IngestIntoMemoryPlugin | ||
|
||
|
||
def test_run_ingest_git():
    """Run the ingest-git-job and check it exits with 0 and produces payloads."""
    memory_sink = IngestIntoMemoryPlugin()
    cli_runner = CliEntryBasedTestRunner(
        memory_sink, data_sources_plugin_entry, plugin_entry
    )

    job_path = jobs_path_from_caller_directory("ingest-git-job")
    outcome: Result = cli_runner.invoke(["run", job_path])

    cli_assert_equal(0, outcome)

    # At least one payload must have reached the in-memory ingester.
    assert len(memory_sink.payloads) > 0