examples: Make RAG examples a bit more generic and demoable #3085

Merged: 4 commits, merged on Feb 8, 2024
1 change: 0 additions & 1 deletion examples/confluence-reader/README.md
@@ -8,7 +8,6 @@ The `ConfluenceDataSource` class is the heart of this data job. It provides a se

- `fetch_updated_pages_in_confluence_space()`: Fetches updated pages in the Confluence space based on the last modification date.
- `fetch_all_pages_in_confluence_space()`: Retrieves all pages in the Confluence space.
- `fetch_updated_documents_by_parent_id(parent_page_id)`: Recursively fetches updated documents based on a parent page ID, ensuring that nested pages are also captured.
- `flag_deleted_pages()`: Flags deleted pages based on the current Confluence data.
- `update_saved_documents()`: Updates the saved documents in the JSON file with the latest data.
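
A minimal usage sketch of how these pieces might fit together, assuming the helpers are importable from `fetch_confluence_space.py`; the URL, token, space key, and file name below are placeholders:

```python
# Illustrative only: stitch the ConfluenceDataSource methods together by hand.
from fetch_confluence_space import (
    ConfluenceDataSource,
    flag_deleted_pages,
    update_saved_documents,
)

source = ConfluenceDataSource(
    confluence_url="https://example.atlassian.net/wiki",  # placeholder
    token="<personal-access-token>",                       # placeholder
    space_key="DEMO",                                      # placeholder
)

# Incremental fetch: only pages modified since the given timestamp.
updated = source.fetch_updated_pages_in_confluence_space(last_date="2024-01-01 00:00")
update_saved_documents("confluence_data.json", updated)

# A full listing is used to detect pages that disappeared from the space.
flag_deleted_pages("confluence_data.json", source.fetch_all_pages_in_confluence_space())
```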

102 changes: 53 additions & 49 deletions examples/confluence-reader/fetch_confluence_space.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import os
from datetime import datetime

from confluence_document import ConfluenceDocument
@@ -10,9 +11,6 @@

log = logging.getLogger(__name__)

CONFLUENCE_DATA_FILE = "confluence_data.json"
LAST_MODIFICATION_FILE = "last_modification.txt"


def read_json_file(file_path):
try:
@@ -72,32 +70,17 @@ def flag_deleted_pages(file_path, current_confluence_documents):

current_page_ids = {doc.metadata["id"] for doc in current_confluence_documents}

num_deleted = 0
for doc in existing_docs:
if doc.metadata["id"] not in current_page_ids:
doc.metadata["deleted"] = True
num_deleted += 1
log.info(f"Found {num_deleted} deleted pages.")

serialized_docs = [doc.serialize() for doc in existing_docs]
write_json_file(file_path, serialized_docs)


def read_last_modification_date():
try:
with open(LAST_MODIFICATION_FILE) as file:
return file.read().strip()
except FileNotFoundError:
log.error(f"{LAST_MODIFICATION_FILE} not found. Using default date.")
return datetime.min.strftime("%Y-%m-%d %H:%M")


def update_last_modification_date():
try:
with open(LAST_MODIFICATION_FILE, "w") as file:
formatted_date = datetime.now().strftime("%Y-%m-%d %H:%M")
file.write(formatted_date)
except OSError as e:
log.error(f"Error writing to file: {e}")


class ConfluenceDataSource:
"""
A class for retrieving and managing data from a Confluence space.
@@ -114,7 +97,6 @@ class ConfluenceDataSource:
Methods:
fetch_updated_pages_in_confluence_space(): Fetches updated pages in the Confluence space based on the last modification date.
fetch_all_pages_in_confluence_space(): Retrieves all pages in the Confluence space.
fetch_updated_documents_by_parent_id(parent_page_id): Recursively fetches updated documents based on a parent page ID.
flag_deleted_pages(): Flags deleted pages based on the current Confluence data.
update_saved_documents(): Updates the saved documents in the JSON file with the latest data.

@@ -128,60 +110,82 @@ def __init__(self, confluence_url, token, space_key):

def fetch_confluence_documents(self, cql_query):
try:
raw_documents = self.loader.load(cql=cql_query, limit=10, max_pages=10)
# TODO: think about configurable limits, or some streaming solution.
# How do we fit all documents in memory?
raw_documents = self.loader.load(cql=cql_query, limit=50, max_pages=200)
return [
ConfluenceDocument(doc.metadata, doc.page_content)
for doc in raw_documents
]
except Exception as e:
log.error(f"Error fetching documents from Confluence: {e}")
return []
raise e

def fetch_updated_pages_in_confluence_space(self):
last_date = read_last_modification_date()
update_last_modification_date()
def fetch_updated_pages_in_confluence_space(
self, last_date="1900-02-06 17:54", parent_page_id=None
):
# TODO: this really should be called not when page is read but after it's successfully processed.
cql_query = (
f"lastModified > '{last_date}' and type = page and space = {self.space_key}"
)

if parent_page_id:
# https://developer.atlassian.com/server/confluence/cql-field-reference/#ancestor
cql_query += f" and ancestor = {parent_page_id}"

return self.fetch_confluence_documents(cql_query)
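# For illustration only (not part of the diff): with space_key "DEMO", parent_page_id 12345,
# and the default last_date, the CQL string sent to Confluence would be roughly:
#   lastModified > '1900-02-06 17:54' and type = page and space = DEMO and ancestor = 12345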

def fetch_all_pages_in_confluence_space(self):
def fetch_all_pages_in_confluence_space(self, parent_page_id=None):
# TODO: this is very inefficient as we are actually downloading everything.
# The REST API offers an expand query parameter for that, but the langchain loader limits all expansions to always return the body.
# See https://docs.atlassian.com/atlassian-confluence/REST/5.5/
# We could hack around this by subclassing the ContentFormat enum, and try to convince the library devs to add a metadata-only response in the loader.
cql_query = f"type = page and space = {self.space_key}"
if parent_page_id:
cql_query += f" and ancestor = {parent_page_id}"
return self.fetch_confluence_documents(cql_query)

def fetch_updated_documents_by_parent_id(self, parent_page_id):
last_modified_date = read_last_modification_date()
update_last_modification_date()

def fetch_updated_recursive(page_id, last_modified_date):
updated_documents = []
cql_query = f"type = page and parent = {page_id} and lastModified > '{last_modified_date}'"
child_documents = self.fetch_confluence_documents(cql_query)

for doc in child_documents:
updated_documents.append(doc)
updated_documents.extend(
fetch_updated_recursive(doc["id"], last_modified_date)
)
def get_value(job_input, key: str, default_value=None):
return job_input.get_arguments().get(
key, job_input.get_property(key, os.environ.get(key.upper(), default_value))
)

return updated_documents

return fetch_updated_recursive(parent_page_id, last_modified_date)
def set_property(job_input: IJobInput, key, value):
props = job_input.get_all_properties()
props[key] = value
job_input.set_all_properties(props)


def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

confluence_url = job_input.get_property("confluence_url", "YOUR_CONFLUENCE_URL")
token = job_input.get_property("confluence_token", "YOUR_CONFLUENCE_TOKEN")
space_key = job_input.get_property("confluence_space_key", "YOUR_SPACE_KEY")
confluence_url = get_value(job_input, "confluence_url")
token = get_value(job_input, "confluence_token")
space_key = get_value(job_input, "confluence_space_key")
parent_page_id = get_value(job_input, "confluence_parent_page_id")
last_date = get_value(job_input, "last_date", "1900-01-01 12:00")
data_file = get_value(
job_input,
"data_file",
os.path.join(job_input.get_temporary_write_directory(), "confluence_data.json"),
)

confluence_reader = ConfluenceDataSource(confluence_url, token, space_key)

updated_docs = confluence_reader.fetch_updated_pages_in_confluence_space()
update_saved_documents(CONFLUENCE_DATA_FILE, updated_docs)
updated_docs = confluence_reader.fetch_updated_pages_in_confluence_space(
last_date, parent_page_id
)
log.info(f"Found {len(updated_docs)} updated pages")
update_saved_documents(data_file, updated_docs)

# This is buggy: it doesn't account for the server timezone vs. the local timezone,
# and it assumes that the server clock and local clock are synchronized (which they likely are not).
# The timestamp should be the one of the latest processed page.
set_property(job_input, "last_date", datetime.now().strftime("%Y-%m-%d %H:%M"))

flag_deleted_pages(
CONFLUENCE_DATA_FILE, confluence_reader.fetch_all_pages_in_confluence_space()
data_file,
confluence_reader.fetch_all_pages_in_confluence_space(parent_page_id),
)
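
Since `get_value` falls back to the environment variable with the upper-cased key and then to the default, a local demo run can be configured without setting any job properties. A minimal sketch, assuming the job is started from the same shell session; all values are placeholders:

```python
# Minimal local setup sketch: get_value() looks up job arguments, then job properties,
# then os.environ[key.upper()], so exporting these is enough for a demo run.
import os

os.environ["CONFLUENCE_URL"] = "https://example.atlassian.net/wiki"  # placeholder
os.environ["CONFLUENCE_TOKEN"] = "<personal-access-token>"           # placeholder
os.environ["CONFLUENCE_SPACE_KEY"] = "DEMO"                          # placeholder
os.environ["CONFLUENCE_PARENT_PAGE_ID"] = "12345"                    # optional, placeholder

# The job itself would then be started with the VDK CLI, e.g. `vdk run confluence-reader`.
```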
1 change: 0 additions & 1 deletion examples/confluence-reader/last_modification.txt

This file was deleted.

19 changes: 0 additions & 19 deletions examples/embed-ingest-job-example/30_create_schema.sql

This file was deleted.

5 changes: 0 additions & 5 deletions examples/embed-ingest-job-example/config.py

This file was deleted.

24 changes: 24 additions & 0 deletions examples/pgvector-embedder/00_properties.py
@@ -0,0 +1,24 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import os.path
import pathlib

from vdk.api.job_input import IJobInput


def run(job_input: IJobInput):
properties = job_input.get_all_properties()

data_file = os.path.join(job_input.get_job_directory(), "documents_example.json")
output_embeddings = os.path.join(
job_input.get_temporary_write_directory(), "embeddings_example.pkl"
)
properties.update(
dict(
destination_embeddings_table="vdk_doc_embeddings",
destination_metadata_table="vdk_doc_metadata",
data_file=data_file,
output_embeddings=output_embeddings,
)
)
job_input.set_all_properties(properties)
@@ -2,12 +2,10 @@
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import pathlib
import re

import nltk
from config import DOCUMENTS_JSON_FILE_LOCATION
from config import EMBEDDINGS_PKL_FILE_LOCATION
from config import get_value
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
@@ -84,9 +82,8 @@ def setup_nltk(temp_dir):
def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

data_job_dir = pathlib.Path(job_input.get_job_directory())
input_json = data_job_dir / DOCUMENTS_JSON_FILE_LOCATION
output_embeddings = data_job_dir / EMBEDDINGS_PKL_FILE_LOCATION
input_json = get_value(job_input, "data_file")
output_embeddings = get_value(job_input, "output_embeddings")

temp_dir = job_input.get_temporary_write_directory()
setup_nltk(temp_dir)
21 changes: 21 additions & 0 deletions examples/pgvector-embedder/30_create_schema.sql
@@ -0,0 +1,21 @@
--TODO (missing vdk feature): we need to drop the tables as the postgres plugin doesn't support upserts (updates)

DROP TABLE IF EXISTS public.{destination_embeddings_table} CASCADE;
DROP TABLE IF EXISTS public.{destination_metadata_table} CASCADE;

-- TODO (missing vdk feature): we need to create the tables as the postgres plugin doesn't support automatic schema inference
CREATE TABLE IF NOT EXISTS public.{destination_embeddings_table}
(
    id SERIAL PRIMARY KEY,
    embedding public.vector
);

CREATE TABLE IF NOT EXISTS public.{destination_metadata_table}
(
    id INTEGER PRIMARY KEY,
    title TEXT,
    source TEXT,
    data TEXT,
    deleted BOOLEAN,
    CONSTRAINT fk_metadata_embeddings FOREIGN KEY (id) REFERENCES public.{destination_embeddings_table}(id)
);
@@ -2,12 +2,10 @@
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import pathlib
import pickle

import numpy as np
from config import DOCUMENTS_JSON_FILE_LOCATION
from config import EMBEDDINGS_PKL_FILE_LOCATION
from config import get_value
from vdk.api.job_input import IJobInput

log = logging.getLogger(__name__)
@@ -16,9 +14,8 @@
def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

data_job_dir = pathlib.Path(job_input.get_job_directory())
input_embeddings_path = data_job_dir / EMBEDDINGS_PKL_FILE_LOCATION
input_documents_path = data_job_dir / DOCUMENTS_JSON_FILE_LOCATION
input_embeddings_path = get_value(job_input, "output_embeddings")
input_documents_path = get_value(job_input, "data_file")

with open(input_embeddings_path, "rb") as file:
embeddings = pickle.load(file)
@@ -27,6 +24,8 @@ def run(job_input: IJobInput):

print(len(documents), len(embeddings))

# TODO: our postgres plugin doesn't support updates (upserts) so updating with same ID fails.

for i, embedding in enumerate(embeddings):
embedding_list = (
embedding.tolist() if isinstance(embedding, np.ndarray) else embedding
@@ -37,7 +36,7 @@
}
job_input.send_object_for_ingestion(
payload=embedding_payload,
destination_table="vdk_confluence_doc_embeddings_example",
destination_table=get_value(job_input, "destination_embeddings_table"),
)

for document in documents:
@@ -50,5 +49,5 @@
}
job_input.send_object_for_ingestion(
payload=metadata_payload,
destination_table="vdk_confluence_doc_metadata_example",
destination_table=get_value(job_input, "destination_metadata_table"),
)
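
For reference, the two payload shapes sent through `send_object_for_ingestion` line up with the columns created in `30_create_schema.sql`; a hedged illustration with made-up values (the real ones come from the embeddings pickle and the documents JSON):

```python
# Illustrative payloads only; field names mirror the schema in 30_create_schema.sql.
embedding_payload = {
    "id": 42,                         # document/chunk id, matches the metadata row
    "embedding": [0.12, -0.03, 0.57], # example vector; real ones come from the model
}

metadata_payload = {
    "id": 42,
    "title": "Page (or chunk) title",
    "source": "https://example.atlassian.net/wiki/pages/42",  # placeholder URL
    "data": "Content text",
    "deleted": False,  # rows with True are removed by 50_cleanup_deleted_rows.sql
}
```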
11 changes: 11 additions & 0 deletions examples/pgvector-embedder/50_cleanup_deleted_rows.sql
@@ -0,0 +1,11 @@
-- TODO (missing vdk feature): this may not be necessary if our Ingestion framework supports deletion

-- Step 1: Delete from metadata table where deleted is true
DELETE FROM public.{destination_metadata_table}
WHERE deleted = TRUE;

-- Step 2: Delete from embeddings table where id not present in metadata table
DELETE FROM public.{destination_embeddings_table}
WHERE id NOT IN (
    SELECT id FROM public.{destination_metadata_table}
);
examples/pgvector-embedder/README.md
@@ -1,8 +1,23 @@
# Embed And Ingest Data Job Example

The following Versatile Data Kit example allows you to embed your Confluence JSON data
The following Versatile Data Kit example allows you to embed document data and metadata (in the format described below)
and ingest it into a Postgres instance with pgvector.

# Expected input format

```python
[
    {
        "metadata": {
            "title": "Page (or chunk) title",
            "id": "Content page ID",
            "source": "Source URL",
            "deleted": <whether the content has been deleted in the source>
        },
        "data": "Content Text"
    },
    ...
]
```

# Create embeddings for the data
The fetched data from the previous step is read, cleaned and embedded using the
[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) HuggingFace SentenceTransformer Embedding model.
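
A quick sketch of the embedding call, in case you want to reproduce it outside the data job; the model name comes from the link above, the file name matches the default set in `00_properties.py`, and the job itself additionally cleans the text with nltk before embedding:

```python
# Standalone sketch: embed the "data" field of each document with the same model the job uses.
import json

from sentence_transformers import SentenceTransformer

with open("documents_example.json") as f:  # any file in the expected input format above
    documents = json.load(f)

model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode([doc["data"] for doc in documents])
print(embeddings.shape)  # (number of documents, 768) for this model
```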
9 changes: 9 additions & 0 deletions examples/pgvector-embedder/config.py
@@ -0,0 +1,9 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import os


def get_value(job_input, key: str, default_value=None):
return job_input.get_arguments().get(
key, job_input.get_property(key, os.environ.get(key.upper(), default_value))
)
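
The lookup order this helper implements is what makes the example demoable without code changes. A usage sketch under the assumption that the step imports the helper the same way the other steps do:

```python
from config import get_value
from vdk.api.job_input import IJobInput


def run(job_input: IJobInput):
    # Resolution order of get_value(): job arguments first, then job properties,
    # then the environment variable with the upper-cased key (DESTINATION_EMBEDDINGS_TABLE),
    # then the supplied default.
    table = get_value(job_input, "destination_embeddings_table", "vdk_doc_embeddings")
    # e.g. `vdk run pgvector-embedder --arguments '{"destination_embeddings_table": "my_table"}'`
    # would take precedence over the property set in 00_properties.py.
```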