examples: Make RAG examples a bit more generic and demoable #3085

Merged: 4 commits, merged on Feb 8, 2024
1 change: 0 additions & 1 deletion examples/confluence-reader/README.md
@@ -8,7 +8,6 @@ The `ConfluenceDataSource` class is the heart of this data job. It provides a se

- `fetch_updated_pages_in_confluence_space()`: Fetches updated pages in the Confluence space based on the last modification date.
- `fetch_all_pages_in_confluence_space()`: Retrieves all pages in the Confluence space.
- `fetch_updated_documents_by_parent_id(parent_page_id)`: Recursively fetches updated documents based on a parent page ID, ensuring that nested pages are also captured.
- `flag_deleted_pages()`: Flags deleted pages based on the current Confluence data.
- `update_saved_documents()`: Updates the saved documents in the JSON file with the latest data.
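
A minimal usage sketch of how these pieces might fit together, assuming the helpers are importable from `fetch_confluence_space.py`; the URL, token, space key, and file name below are placeholders:

```python
# Illustrative only: stitch the ConfluenceDataSource methods together by hand.
from fetch_confluence_space import (
    ConfluenceDataSource,
    flag_deleted_pages,
    update_saved_documents,
)

source = ConfluenceDataSource(
    confluence_url="https://example.atlassian.net/wiki",  # placeholder
    token="<personal-access-token>",                       # placeholder
    space_key="DEMO",                                      # placeholder
)

# Incremental fetch: only pages modified since the given timestamp.
updated = source.fetch_updated_pages_in_confluence_space(last_date="2024-01-01 00:00")
update_saved_documents("confluence_data.json", updated)

# A full listing is used to detect pages that disappeared from the space.
flag_deleted_pages("confluence_data.json", source.fetch_all_pages_in_confluence_space())
```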

102 changes: 53 additions & 49 deletions examples/confluence-reader/fetch_confluence_space.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import os
from datetime import datetime

from confluence_document import ConfluenceDocument
@@ -10,9 +11,6 @@

log = logging.getLogger(__name__)

CONFLUENCE_DATA_FILE = "confluence_data.json"
LAST_MODIFICATION_FILE = "last_modification.txt"


def read_json_file(file_path):
try:
@@ -72,32 +70,17 @@ def flag_deleted_pages(file_path, current_confluence_documents):

current_page_ids = {doc.metadata["id"] for doc in current_confluence_documents}

num_deleted = 0
for doc in existing_docs:
if doc.metadata["id"] not in current_page_ids:
doc.metadata["deleted"] = True
num_deleted += 1
log.info(f"Found {num_deleted} deleted pages.")

serialized_docs = [doc.serialize() for doc in existing_docs]
write_json_file(file_path, serialized_docs)


def read_last_modification_date():
try:
with open(LAST_MODIFICATION_FILE) as file:
return file.read().strip()
except FileNotFoundError:
log.error(f"{LAST_MODIFICATION_FILE} not found. Using default date.")
return datetime.min.strftime("%Y-%m-%d %H:%M")


def update_last_modification_date():
try:
with open(LAST_MODIFICATION_FILE, "w") as file:
formatted_date = datetime.now().strftime("%Y-%m-%d %H:%M")
file.write(formatted_date)
except OSError as e:
log.error(f"Error writing to file: {e}")


class ConfluenceDataSource:
"""
A class for retrieving and managing data from a Confluence space.
@@ -114,7 +97,6 @@ class ConfluenceDataSource:
Methods:
fetch_updated_pages_in_confluence_space(): Fetches updated pages in the Confluence space based on the last modification date.
fetch_all_pages_in_confluence_space(): Retrieves all pages in the Confluence space.
fetch_updated_documents_by_parent_id(parent_page_id): Recursively fetches updated documents based on a parent page ID.
flag_deleted_pages(): Flags deleted pages based on the current Confluence data.
update_saved_documents(): Updates the saved documents in the JSON file with the latest data.

@@ -128,60 +110,82 @@ def __init__(self, confluence_url, token, space_key):

def fetch_confluence_documents(self, cql_query):
try:
raw_documents = self.loader.load(cql=cql_query, limit=10, max_pages=10)
# TODO: think about configurable limits, or some streaming solution.
# How do we fit all documents in memory?
raw_documents = self.loader.load(cql=cql_query, limit=50, max_pages=200)
return [
ConfluenceDocument(doc.metadata, doc.page_content)
for doc in raw_documents
]
except Exception as e:
log.error(f"Error fetching documents from Confluence: {e}")
return []
raise e

def fetch_updated_pages_in_confluence_space(self):
last_date = read_last_modification_date()
update_last_modification_date()
def fetch_updated_pages_in_confluence_space(
self, last_date="1900-02-06 17:54", parent_page_id=None
):
# TODO: this really should be called not when page is read but after it's successfully processed.
cql_query = (
f"lastModified > '{last_date}' and type = page and space = {self.space_key}"
)

if parent_page_id:
# https://developer.atlassian.com/server/confluence/cql-field-reference/#ancestor
cql_query += f" and ancestor = {parent_page_id}"

return self.fetch_confluence_documents(cql_query)
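# For illustration only (not part of the diff): with space_key "DEMO", parent_page_id 12345,
# and the default last_date, the CQL string sent to Confluence would be roughly:
#   lastModified > '1900-02-06 17:54' and type = page and space = DEMO and ancestor = 12345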

def fetch_all_pages_in_confluence_space(self):
def fetch_all_pages_in_confluence_space(self, parent_page_id=None):
# TODO: this is very inefficient as we are actually downloading everything.
# The REST API offers an expand query parameter for that, but the langchain loader limits all expansions to always return the body.
# See https://docs.atlassian.com/atlassian-confluence/REST/5.5/
# We could hack around this by subclassing the ContentFormat enum, and try to convince the library devs to add a metadata-only response in the loader.
cql_query = f"type = page and space = {self.space_key}"
if parent_page_id:
cql_query += f" and ancestor = {parent_page_id}"
return self.fetch_confluence_documents(cql_query)

def fetch_updated_documents_by_parent_id(self, parent_page_id):
last_modified_date = read_last_modification_date()
update_last_modification_date()

def fetch_updated_recursive(page_id, last_modified_date):
updated_documents = []
cql_query = f"type = page and parent = {page_id} and lastModified > '{last_modified_date}'"
child_documents = self.fetch_confluence_documents(cql_query)

for doc in child_documents:
updated_documents.append(doc)
updated_documents.extend(
fetch_updated_recursive(doc["id"], last_modified_date)
)
def get_value(job_input, key: str, default_value=None):
return job_input.get_arguments().get(
key, job_input.get_property(key, os.environ.get(key.upper(), default_value))
)

return updated_documents

return fetch_updated_recursive(parent_page_id, last_modified_date)
def set_property(job_input: IJobInput, key, value):
props = job_input.get_all_properties()
props[key] = value
job_input.set_all_properties(props)


def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

confluence_url = job_input.get_property("confluence_url", "YOUR_CONFLUENCE_URL")
token = job_input.get_property("confluence_token", "YOUR_CONFLUENCE_TOKEN")
space_key = job_input.get_property("confluence_space_key", "YOUR_SPACE_KEY")
confluence_url = get_value(job_input, "confluence_url")
token = get_value(job_input, "confluence_token")
space_key = get_value(job_input, "confluence_space_key")
parent_page_id = get_value(job_input, "confluence_parent_page_id")
last_date = get_value(job_input, "last_date", "1900-01-01 12:00")
data_file = get_value(
job_input,
"data_file",
os.path.join(job_input.get_temporary_write_directory(), "confluence_data.json"),
)

confluence_reader = ConfluenceDataSource(confluence_url, token, space_key)

updated_docs = confluence_reader.fetch_updated_pages_in_confluence_space()
update_saved_documents(CONFLUENCE_DATA_FILE, updated_docs)
updated_docs = confluence_reader.fetch_updated_pages_in_confluence_space(
last_date, parent_page_id
)
log.info(f"Found {len(updated_docs)} updated pages")
update_saved_documents(data_file, updated_docs)

# This is buggy: it doesn't account for the server timezone vs. the local timezone,
# and it assumes that the server clock and local clock are synchronized (which they likely are not).
# The timestamp should be the one of the latest processed page.
set_property(job_input, "last_date", datetime.now().strftime("%Y-%m-%d %H:%M"))

flag_deleted_pages(
CONFLUENCE_DATA_FILE, confluence_reader.fetch_all_pages_in_confluence_space()
data_file,
confluence_reader.fetch_all_pages_in_confluence_space(parent_page_id),
)
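
Since `get_value` falls back to the environment variable with the upper-cased key and then to the default, a local demo run can be configured without setting any job properties. A minimal sketch, assuming the job is started from the same shell session; all values are placeholders:

```python
# Minimal local setup sketch: get_value() looks up job arguments, then job properties,
# then os.environ[key.upper()], so exporting these is enough for a demo run.
import os

os.environ["CONFLUENCE_URL"] = "https://example.atlassian.net/wiki"  # placeholder
os.environ["CONFLUENCE_TOKEN"] = "<personal-access-token>"           # placeholder
os.environ["CONFLUENCE_SPACE_KEY"] = "DEMO"                          # placeholder
os.environ["CONFLUENCE_PARENT_PAGE_ID"] = "12345"                    # optional, placeholder

# The job itself would then be started with the VDK CLI, e.g. `vdk run confluence-reader`.
```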
1 change: 0 additions & 1 deletion examples/confluence-reader/last_modification.txt

This file was deleted.

19 changes: 0 additions & 19 deletions examples/embed-ingest-job-example/30_create_schema.sql

This file was deleted.

5 changes: 0 additions & 5 deletions examples/embed-ingest-job-example/config.py

This file was deleted.

24 changes: 24 additions & 0 deletions examples/pgvector-embedder/00_properties.py
@@ -0,0 +1,24 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import os.path
import pathlib

from vdk.api.job_input import IJobInput


def run(job_input: IJobInput):
properties = job_input.get_all_properties()

data_file = os.path.join(job_input.get_job_directory(), "documents_example.json")
output_embeddings = os.path.join(
job_input.get_temporary_write_directory(), "embeddings_example.pkl"
)
properties.update(
dict(
destination_embeddings_table="vdk_doc_embeddings",
destination_metadata_table="vdk_doc_metadata",
data_file=data_file,
output_embeddings=output_embeddings,
)
)
job_input.set_all_properties(properties)
@@ -2,12 +2,10 @@
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import pathlib
import re

import nltk
from config import DOCUMENTS_JSON_FILE_LOCATION
from config import EMBEDDINGS_PKL_FILE_LOCATION
from config import get_value
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
@@ -84,9 +82,8 @@ def setup_nltk(temp_dir):
def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

data_job_dir = pathlib.Path(job_input.get_job_directory())
input_json = data_job_dir / DOCUMENTS_JSON_FILE_LOCATION
output_embeddings = data_job_dir / EMBEDDINGS_PKL_FILE_LOCATION
input_json = get_value(job_input, "data_file")
output_embeddings = get_value(job_input, "output_embeddings")

temp_dir = job_input.get_temporary_write_directory()
setup_nltk(temp_dir)
21 changes: 21 additions & 0 deletions examples/pgvector-embedder/30_create_schema.sql
@@ -0,0 +1,21 @@
--TODO (missing vdk feature): we need to drop the tables as the postgres plugin doesn't support upserts (updates)

DROP TABLE IF EXISTS public.{destination_embeddings_table} CASCADE;
DROP TABLE IF EXISTS public.{destination_metadata_table} CASCADE;

-- TODO (missing vdk feature): we need to create the tables as the postgres plugin doesn't support automatic schema inference
CREATE TABLE IF NOT EXISTS public.{destination_embeddings_table}
(
    id SERIAL PRIMARY KEY,
    embedding public.vector
);

CREATE TABLE IF NOT EXISTS public.{destination_metadata_table}
(
    id INTEGER PRIMARY KEY,
    title TEXT,
    source TEXT,
    data TEXT,
    deleted BOOLEAN,
    CONSTRAINT fk_metadata_embeddings FOREIGN KEY (id) REFERENCES public.{destination_embeddings_table}(id)
);
@@ -2,12 +2,10 @@
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import pathlib
import pickle

import numpy as np
from config import DOCUMENTS_JSON_FILE_LOCATION
from config import EMBEDDINGS_PKL_FILE_LOCATION
from config import get_value
from vdk.api.job_input import IJobInput

log = logging.getLogger(__name__)
@@ -16,9 +14,8 @@
def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

data_job_dir = pathlib.Path(job_input.get_job_directory())
input_embeddings_path = data_job_dir / EMBEDDINGS_PKL_FILE_LOCATION
input_documents_path = data_job_dir / DOCUMENTS_JSON_FILE_LOCATION
input_embeddings_path = get_value(job_input, "output_embeddings")
input_documents_path = get_value(job_input, "data_file")

with open(input_embeddings_path, "rb") as file:
embeddings = pickle.load(file)
@@ -27,6 +24,8 @@ def run(job_input: IJobInput):

print(len(documents), len(embeddings))

# TODO: our postgres plugin doesn't support updates (upserts) so updating with same ID fails.

for i, embedding in enumerate(embeddings):
embedding_list = (
embedding.tolist() if isinstance(embedding, np.ndarray) else embedding
@@ -37,7 +36,7 @@
}
job_input.send_object_for_ingestion(
payload=embedding_payload,
destination_table="vdk_confluence_doc_embeddings_example",
destination_table=get_value(job_input, "destination_embeddings_table"),
)

for document in documents:
@@ -50,5 +49,5 @@
}
job_input.send_object_for_ingestion(
payload=metadata_payload,
destination_table="vdk_confluence_doc_metadata_example",
destination_table=get_value(job_input, "destination_metadata_table"),
)
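
For reference, the two payload shapes sent through `send_object_for_ingestion` line up with the columns created in `30_create_schema.sql`; a hedged illustration with made-up values (the real ones come from the embeddings pickle and the documents JSON):

```python
# Illustrative payloads only; field names mirror the schema in 30_create_schema.sql.
embedding_payload = {
    "id": 42,                         # document/chunk id, matches the metadata row
    "embedding": [0.12, -0.03, 0.57], # example vector; real ones come from the model
}

metadata_payload = {
    "id": 42,
    "title": "Page (or chunk) title",
    "source": "https://example.atlassian.net/wiki/pages/42",  # placeholder URL
    "data": "Content text",
    "deleted": False,  # rows with True are removed by 50_cleanup_deleted_rows.sql
}
```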
11 changes: 11 additions & 0 deletions examples/pgvector-embedder/50_cleanup_deleted_rows.sql
@@ -0,0 +1,11 @@
-- TODO (missing vdk feature): this may not be necessary if our Ingestion framework supports deletion

-- Step 1: Delete from metadata table where deleted is true
DELETE FROM public.{destination_metadata_table}
WHERE deleted = TRUE;

-- Step 2: Delete from embeddings table where id not present in metadata table
DELETE FROM public.{destination_embeddings_table}
WHERE id NOT IN (
    SELECT id FROM public.{destination_metadata_table}
);
examples/pgvector-embedder/README.md
@@ -1,8 +1,23 @@
# Embed And Ingest Data Job Example

The following Versatile Data Kit example allows you to embed your Confluence JSON data
The following Versatile Data Kit example allows you to embed document data and metadata (in the format described below)
and ingest it into a Postgres instance with pgvector.

# Expected input format

```python
[
    {
        "metadata": {
            "title": "Page (or chunk) title",
            "id": "Content page ID",
            "source": "Source URL",
            "deleted": <whether the content has been deleted in the source>
        },
        "data": "Content Text"
    },
    ...
]
```

# Create embeddings for the data
The fetched data from the previous step is read, cleaned and embedded using the
[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) HuggingFace SentenceTransformer Embedding model.
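
A quick sketch of the embedding call, in case you want to reproduce it outside the data job; the model name comes from the link above, the file name matches the default set in `00_properties.py`, and the job itself additionally cleans the text with nltk before embedding:

```python
# Standalone sketch: embed the "data" field of each document with the same model the job uses.
import json

from sentence_transformers import SentenceTransformer

with open("documents_example.json") as f:  # any file in the expected input format above
    documents = json.load(f)

model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode([doc["data"] for doc in documents])
print(embeddings.shape)  # (number of documents, 768) for this model
```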
9 changes: 9 additions & 0 deletions examples/pgvector-embedder/config.py
@@ -0,0 +1,9 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import os


def get_value(job_input, key: str, default_value=None):
return job_input.get_arguments().get(
key, job_input.get_property(key, os.environ.get(key.upper(), default_value))
)
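
The lookup order this helper implements is what makes the example demoable without code changes. A usage sketch under the assumption that the step imports the helper the same way the other steps do:

```python
from config import get_value
from vdk.api.job_input import IJobInput


def run(job_input: IJobInput):
    # Resolution order of get_value(): job arguments first, then job properties,
    # then the environment variable with the upper-cased key (DESTINATION_EMBEDDINGS_TABLE),
    # then the supplied default.
    table = get_value(job_input, "destination_embeddings_table", "vdk_doc_embeddings")
    # e.g. `vdk run pgvector-embedder --arguments '{"destination_embeddings_table": "my_table"}'`
    # would take precedence over the property set in 00_properties.py.
```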