-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
182771c
commit cceb5da
Showing
11 changed files
with
126 additions
and
58 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,24 @@ | ||
# Copyright 2021-2024 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
import os.path | ||
import pathlib | ||
Check notice on line 4 in examples/pgvector-embedder/00_properties.py
|
||
|
||
from vdk.api.job_input import IJobInput | ||
|
||
|
||
def run(job_input: IJobInput):
    """Seed the job properties with the example's table names and file paths.

    Reads the current properties, adds (or overwrites) the destination table
    names, the input documents path and the output embeddings path, and
    stores the merged properties back on the job.

    :param job_input: job input object; this step uses its
        ``get_all_properties``, ``get_job_directory``,
        ``get_temporary_write_directory`` and ``set_all_properties`` methods.
    """
    properties = job_input.get_all_properties()

    # The input documents are looked up in the job directory, while the
    # generated embeddings go to the temporary write directory.
    data_file = os.path.join(job_input.get_job_directory(), "documents_example.json")
    output_embeddings = os.path.join(
        job_input.get_temporary_write_directory(), "embeddings_example.pkl"
    )
    properties.update(
        dict(
            # Each keyword may be given only once (a repeated keyword is a
            # SyntaxError).
            # NOTE(review): the diff listed both "*_ai" and unsuffixed table
            # names; the later (unsuffixed) pair is kept here - confirm.
            destination_embeddings_table="vdk_doc_embeddings",
            destination_metadata_table="vdk_doc_metadata",
            data_file=data_file,
            output_embeddings=output_embeddings,
        )
    )
    job_input.set_all_properties(properties)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,9 @@ | ||
# Copyright 2021-2024 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
import os | ||
|
||
# Default file name of the JSON documents file.
DOCUMENTS_JSON_FILE_LOCATION = "documents_example.json"
# Default file name of the pickled embeddings file.
EMBEDDINGS_PKL_FILE_LOCATION = "embeddings_example.pkl"
|
||
def get_value(job_input, key: str, default_value=None):
    """Resolve a configuration value for ``key``.

    Lookup precedence, highest first:

    1. the job arguments (``job_input.get_arguments()``),
    2. the job properties (``job_input.get_property``),
    3. the environment variable named ``key.upper()``,
    4. ``default_value``.

    :param job_input: object exposing ``get_arguments()`` and
        ``get_property(key, default)``.
    :param key: name of the value to look up.
    :param default_value: returned when no source defines ``key``.
    :return: the first value found following the precedence above.
    """
    return job_input.get_arguments().get(
        key, job_input.get_property(key, os.environ.get(key.upper(), default_value))
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
This job contains an ELT (Extract, Load, Transform) pipeline designed for processing data from Confluence and embedding it using pgvector. | ||
|
||
The jobs are orchestrated using the vdk-dag plugin to run in a defined sequence. | ||
|
||
# Job structure | ||
|
||
Here are the two main jobs: | ||
|
||
- confluence-reader: Extracts raw data from Confluence and loads it into a specified location (table, file, etc.). | ||
- pgvector-embedder: Transforms the extracted data by embedding it using pgvector and stores the metadata and embeddings in specified tables (vdk_confluence_metadata and vdk_confluence_embeddings). | ||
|
||
TODO (missing vdk feature): as the idea is for this to be used as a template, we need some way for VDK to automatically handle the jobs specified in the DAG. | ||
Currently, the job specified (e.g. confluence-reader) must be deployed, and deployed VDK jobs can only run one execution at a time. | ||
What can we do to solve that? | ||
|
||
A) Create a separate deployment automatically | ||
B) Run the job with the arguments provided as a separate job instance | ||
- what about job properties - maybe it should inherit the parent job properties? Or ignore them and only accept arguments? | ||
C) ... | ||
|
||
TODO (missing vdk feature): how do I pick between different jobs to compose them? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
; Supported format: https://docs.python.org/3/library/configparser.html#supported-ini-file-structure | ||
|
||
; This is the only file required to deploy a Data Job. | ||
; Read more to understand what each option means: | ||
|
||
; Information about the owner of the Data Job | ||
[owner] | ||
|
||
; Team is a way to group Data Jobs that belong to the same team. | ||
team = my-team | ||
|
||
[vdk] | ||
dags_max_concurrent_running_jobs = 2 | ||
dags_delayed_jobs_min_delay_seconds = 1 | ||
dags_delayed_jobs_randomized_added_delay_seconds = 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright 2021-2024 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from vdk.plugin.dag.dag_runner import DagInput | ||
|
||
# ELT | ||
|
||
# Path passed as the ``data_file`` argument to both jobs
# (presumably the hand-off file between them - confirm against the jobs).
_CONFLUENCE_DATA_FILE = "/tmp/confluence.json"

# DAG definition: pgvector-embedder runs only after confluence-reader,
# and a failure in either job fails the whole DAG.
jobs = [
    dict(
        job_name="confluence-reader",
        team_name="my-team",
        fail_dag_on_error=True,
        arguments=dict(data_file=_CONFLUENCE_DATA_FILE),
        depends_on=[],
    ),
    dict(
        job_name="pgvector-embedder",
        team_name="my-team",
        fail_dag_on_error=True,
        arguments=dict(
            data_file=_CONFLUENCE_DATA_FILE,
            destination_metadata_table="vdk_confluence_metadata",
            destination_embeddings_table="vdk_confluence_embeddings",
        ),
        depends_on=["confluence-reader"],
    ),
]
|
||
|
||
def run(job_input) -> None:
    """Run the DAG described by the module-level ``jobs`` list.

    :param job_input: job input passed to the step; not used here.
    """
    DagInput().run_dag(jobs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
vdk-dag |