vdk-examples: example job with confluence reader (#3070)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
3 people authored Feb 5, 2024
1 parent 4c61615 commit 3f6aa77
Showing 6 changed files with 276 additions and 0 deletions.
44 changes: 44 additions & 0 deletions examples/confluence-reader/README.md
@@ -0,0 +1,44 @@
# Confluence Data Retrieval Example

In this data job example, we demonstrate how to retrieve and manage data from a Confluence space using the Confluence data retrieval class, `ConfluenceDataSource`. The example fetches pages, tracks updates, and flags deleted pages in a Confluence space, and stores the resulting data in a JSON file for further processing.

## Confluence Data Retrieval Class

The `ConfluenceDataSource` class is the heart of this data job. It provides methods for fetching data from Confluence:

- `fetch_updated_pages_in_confluence_space()`: Fetches pages in the Confluence space that were modified since the last recorded modification date.
- `fetch_all_pages_in_confluence_space()`: Retrieves all pages in the Confluence space.
- `fetch_updated_documents_by_parent_id(parent_page_id)`: Recursively fetches updated documents under a parent page ID, ensuring that nested pages are also captured.

Two module-level helpers in `fetch_confluence_space.py` maintain the local JSON store:

- `flag_deleted_pages(file_path, current_docs)`: Flags saved pages that no longer exist in the current Confluence data.
- `update_saved_documents(file_path, new_docs)`: Merges the latest documents into the saved JSON file.

The update-fetching methods read the `last_modification.txt` file to determine the last modification date and track changes in the Confluence space, so each run retrieves only pages that changed since the previous one.
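
Outside the scheduled data job, the same flow can be exercised directly. Below is a minimal sketch that mirrors the job's `run()` step; the URL, token, and space key are placeholders you would replace with real values:

```python
from fetch_confluence_space import (
    ConfluenceDataSource,
    flag_deleted_pages,
    update_saved_documents,
)

source = ConfluenceDataSource(
    confluence_url="https://your-confluence.example.com",
    token="YOUR_CONFLUENCE_TOKEN",
    space_key="YOUR_SPACE_KEY",
)

# Fetch only pages modified since the date recorded in last_modification.txt
# and merge them into the local JSON store.
updated_docs = source.fetch_updated_pages_in_confluence_space()
update_saved_documents("confluence_data.json", updated_docs)

# Compare the saved documents against the full page list to flag deletions.
flag_deleted_pages("confluence_data.json", source.fetch_all_pages_in_confluence_space())
```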

## JSON Data Format

The resulting JSON data (`confluence_data.json`) is generated using the `ConfluenceDocument` class (see `confluence_document.py`) and follows this format:

```json
[
    {
        "metadata": {
            "title": "Page Title",
            "id": "Page ID",
            "source": "Source URL",
            "deleted": false
        },
        "data": "Page Content Text"
    },
    {
        "metadata": {
            "title": "Another Page Title",
            "id": "Another Page ID",
            "source": "Another Source URL",
            "deleted": true
        },
        "data": "Another Page Content Text"
    }
]
```
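
To consume the stored data downstream, the file can be loaded back into `ConfluenceDocument` instances. A short sketch (the path assumes the job's working directory):

```python
import json

from confluence_document import ConfluenceDocument

with open("confluence_data.json") as f:
    records = json.load(f)

documents = [
    ConfluenceDocument(
        rec["metadata"], rec["data"], rec["metadata"].get("deleted", False)
    )
    for rec in records
]

# Keep only pages that still exist in the Confluence space.
active_pages = [doc for doc in documents if not doc.metadata["deleted"]]
```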
1 change: 1 addition & 0 deletions examples/confluence-reader/confluence_data.json
@@ -0,0 +1 @@
[]
36 changes: 36 additions & 0 deletions examples/confluence-reader/confluence_document.py
@@ -0,0 +1,36 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0

class ConfluenceDocument:
def __init__(self, metadata, data, deleted=False):
"""
Initializes a ConfluenceDocument instance.
:param metadata: A dictionary containing metadata about the Confluence document.
Expected to contain 'title', 'id', and 'source'.
'deleted' key will be added to indicate if the document is considered deleted.
        :param data: A string representing the content of the Confluence page.
        :param deleted: Whether the document is considered deleted; stored in the metadata under 'deleted'.
"""
self.validate_metadata(metadata)
metadata["deleted"] = deleted
self.metadata = metadata
self.data = data

def serialize(self):
"""
Serializes the ConfluenceDocument instance into a dictionary.
"""
return {"metadata": self.metadata, "data": self.data}

@staticmethod
def validate_metadata(metadata):
"""
        Validates that the metadata dictionary contains the required keys.
:param metadata: A dictionary containing metadata about the Confluence document.
:raises ValueError: If metadata does not contain the required keys ('title', 'id', 'source').
"""
required_keys = {"title", "id", "source"}
if not required_keys.issubset(metadata):
missing_keys = required_keys - metadata.keys()
raise ValueError(f"Metadata is missing required keys: {missing_keys}")
187 changes: 187 additions & 0 deletions examples/confluence-reader/fetch_confluence_space.py
@@ -0,0 +1,187 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import json
import logging
from datetime import datetime

from confluence_document import ConfluenceDocument
from langchain_community.document_loaders import ConfluenceLoader
from vdk.api.job_input import IJobInput

log = logging.getLogger(__name__)

CONFLUENCE_DATA_FILE = "confluence_data.json"
LAST_MODIFICATION_FILE = "last_modification.txt"


def read_json_file(file_path):
try:
with open(file_path) as file:
return json.load(file)
except (FileNotFoundError, json.JSONDecodeError) as e:
log.error(f"Error reading JSON file: {e}")
return None


def write_json_file(file_path, data):
try:
with open(file_path, "w") as file:
json.dump(data, file, indent=4)
except OSError as e:
log.error(f"Error writing JSON file: {e}")


def update_saved_documents(file_path, new_docs):
existing_docs = read_json_file(file_path) or []

if (
isinstance(existing_docs, list)
and existing_docs
and isinstance(existing_docs[0], dict)
):
existing_docs = [
ConfluenceDocument(
doc["metadata"], doc["data"], doc["metadata"].get("deleted", False)
)
for doc in existing_docs
]

existing_docs_dict = {doc.metadata["id"]: doc for doc in existing_docs}

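    # New or updated documents overwrite existing entries with the same page id.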
for doc in new_docs:
existing_docs_dict[doc.metadata["id"]] = doc

updated_docs_list = list(existing_docs_dict.values())

serialized_docs = [doc.serialize() for doc in updated_docs_list]
write_json_file(file_path, serialized_docs)


def flag_deleted_pages(file_path, current_confluence_documents):
existing_docs = read_json_file(file_path)
if existing_docs is None:
log.error("Existing documents not found. Exiting.")
return

existing_docs = [
ConfluenceDocument(
doc["metadata"], doc["data"], doc["metadata"].get("deleted", False)
)
for doc in existing_docs
]

current_page_ids = {doc.metadata["id"] for doc in current_confluence_documents}

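    # Mark previously saved pages that are missing from the current snapshot as deleted.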
for doc in existing_docs:
if doc.metadata["id"] not in current_page_ids:
doc.metadata["deleted"] = True

serialized_docs = [doc.serialize() for doc in existing_docs]
write_json_file(file_path, serialized_docs)


def read_last_modification_date():
try:
with open(LAST_MODIFICATION_FILE) as file:
return file.read().strip()
except FileNotFoundError:
log.error(f"{LAST_MODIFICATION_FILE} not found. Using default date.")
return datetime.min.strftime("%Y-%m-%d %H:%M")


def update_last_modification_date():
try:
with open(LAST_MODIFICATION_FILE, "w") as file:
formatted_date = datetime.now().strftime("%Y-%m-%d %H:%M")
file.write(formatted_date)
except OSError as e:
log.error(f"Error writing to file: {e}")


class ConfluenceDataSource:
"""
A class for retrieving and managing data from a Confluence space.
    This class provides methods to interact with Confluence data: fetching pages updated since the
    last recorded modification date, retrieving all pages in a space, and recursively fetching
    updated documents under a parent page ID. Flagging deleted pages and updating the saved JSON
    file are handled by the module-level helpers flag_deleted_pages() and update_saved_documents().
Attributes:
confluence_url (str): The URL of the Confluence instance.
token (str): The authentication token for accessing Confluence.
space_key (str): The key of the Confluence space to retrieve data from.
loader (ConfluenceLoader): An instance of the ConfluenceLoader for data retrieval.
Methods:
fetch_updated_pages_in_confluence_space(): Fetches updated pages in the Confluence space based on the last modification date.
fetch_all_pages_in_confluence_space(): Retrieves all pages in the Confluence space.
fetch_updated_documents_by_parent_id(parent_page_id): Recursively fetches updated documents based on a parent page ID.
"""

def __init__(self, confluence_url, token, space_key):
self.confluence_url = confluence_url
self.token = token
self.space_key = space_key
self.loader = ConfluenceLoader(url=self.confluence_url, token=self.token)

def fetch_confluence_documents(self, cql_query):
try:
raw_documents = self.loader.load(cql=cql_query, limit=10, max_pages=10)
return [
ConfluenceDocument(doc.metadata, doc.page_content)
for doc in raw_documents
]
except Exception as e:
log.error(f"Error fetching documents from Confluence: {e}")
return []

def fetch_updated_pages_in_confluence_space(self):
last_date = read_last_modification_date()
update_last_modification_date()
cql_query = (
f"lastModified > '{last_date}' and type = page and space = {self.space_key}"
)

return self.fetch_confluence_documents(cql_query)

def fetch_all_pages_in_confluence_space(self):
cql_query = f"type = page and space = {self.space_key}"
return self.fetch_confluence_documents(cql_query)

def fetch_updated_documents_by_parent_id(self, parent_page_id):
last_modified_date = read_last_modification_date()
update_last_modification_date()

def fetch_updated_recursive(page_id, last_modified_date):
updated_documents = []
cql_query = f"type = page and parent = {page_id} and lastModified > '{last_modified_date}'"
child_documents = self.fetch_confluence_documents(cql_query)

for doc in child_documents:
updated_documents.append(doc)
updated_documents.extend(
                    fetch_updated_recursive(doc.metadata["id"], last_modified_date)
)

return updated_documents

return fetch_updated_recursive(parent_page_id, last_modified_date)


def run(job_input: IJobInput):
log.info(f"Starting job step {__name__}")

confluence_url = job_input.get_property("confluence_url", "YOUR_CONFLUENCE_URL")
token = job_input.get_property("confluence_token", "YOUR_CONFLUENCE_TOKEN")
space_key = job_input.get_property("confluence_space_key", "YOUR_SPACE_KEY")

confluence_reader = ConfluenceDataSource(confluence_url, token, space_key)

updated_docs = confluence_reader.fetch_updated_pages_in_confluence_space()
update_saved_documents(CONFLUENCE_DATA_FILE, updated_docs)

flag_deleted_pages(
CONFLUENCE_DATA_FILE, confluence_reader.fetch_all_pages_in_confluence_space()
)
1 change: 1 addition & 0 deletions examples/confluence-reader/last_modification.txt
@@ -0,0 +1 @@
2000-01-01 00:00
7 changes: 7 additions & 0 deletions examples/confluence-reader/requirements.txt
@@ -0,0 +1,7 @@
# Python jobs can specify extra library dependencies in requirements.txt file.
# See https://pip.readthedocs.io/en/stable/user_guide/#requirements-files
# The file is optional and can be deleted if no extra library dependencies are necessary.

atlassian-python-api
langchain_community
lxml
