Document agent phase 1 (ag2ai#978)
* Rebase and create a docling_query_engine

* Fix format

* Add DocumentTriageAgent

* Add parser agent

* Update and rebase

* Add document agent and tested basic features

* Update document agent

* Update utils to support auto create directories and path validations

* Update docling_query_engine to support Path input type

* Update docling_doc_ingest_agent

* Wrap up document agent phase 1

* Document agent init commit

* Delete tmp notebook

* pre-commit run on all files

* Move document agent files to agents/experimental/document_agent

* Update file paths

* Update notebook

* Clean up logging

* Update docling_query_engine to support Path input type

* Move document agent files to agents/experimental/document_agent

* Document Agent: Add docstrings

* Fix incompatibility error for python 3.9

* Add DocumentAgent to experimental init, tweak prompt

* Update structure output config and add checker for query engine index

* Update document_agent unit tests

* test fixed

* polishing

* wip

* Error management, prompt tweaks, summary agent system message

* pre-commit run on new files

* Allow collection name to be used, keeping ingested document names

* Notebooks updated

* Logging update

---------

Co-authored-by: Eric-Shang <[email protected]>
Co-authored-by: Davor Runje <[email protected]>
Co-authored-by: Mark Sze <[email protected]>
4 people authored Feb 19, 2025
1 parent 11ae546 commit 74902f3
Showing 21 changed files with 1,061 additions and 112 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -190,3 +190,5 @@ local_cache
notebook/result.png

notebook/coding

chroma
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -67,7 +67,9 @@ repos:
notebook/agentchat_graph_rag_neo4j.ipynb |
notebook/agentchat_swarm_graphrag_telemetry_trip_planner.ipynb |
website/node_modules/.* |
website/notebooks/.*
website/notebooks/.* |
test/agents/experimental/document_agent/pdf_parsed/Toast_financial_report.md |
test/agents/experimental/document_agent/pdf_parsed/nvidia_10k_2024.md
)$
# See https://jaredkhan.com/blog/mypy-pre-commit
- repo: local
129 changes: 129 additions & 0 deletions RFCs/000/RFC-000-global-run-function.ipynb.txt
@@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import Any, Protocol, runtime_checkable\n",
"\n",
"from autogen import Agent\n",
"\n",
"\n",
"class RunResponse:\n",
" pass\n",
"\n",
"\n",
"class Cost:\n",
" def __init__(self, **kwargs: Any):\n",
" self._cost: dict[str, Any] = kwargs.copy()\n",
"\n",
" @staticmethod\n",
" def _add_elements(key: str, x: dict[str, Any], y: dict[str, Any]) -> Any:\n",
" if key in x and key in y:\n",
" return x[key] + y[key]\n",
" elif key in x:\n",
" return x[key]\n",
" elif key in y:\n",
" return y[key]\n",
" else:\n",
" raise KeyError(f\"Key {key} not found in either dictionary\")\n",
"\n",
" def __add__(self, other: \"Cost\") -> \"Cost\":\n",
" keys = set(self._cost.keys()) | set(other._cost.keys())\n",
" return Cost(**{key: self._add_elements(key, self._cost, other._cost) for key in keys})\n",
"\n",
"\n",
"@runtime_checkable\n",
"class EventProtocol(Protocol):\n",
" @property\n",
" def cost(self) -> Cost:\n",
" return Cost()\n",
"\n",
"\n",
"class RunResponse:\n",
" @property\n",
" def events(self) -> list[EventProtocol]:\n",
" pass\n",
"\n",
"\n",
"def run(\n",
" *agents: Agent,\n",
") -> RunResponse:\n",
" return RunResponse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"agents: list[Agent] = []\n",
"\n",
"response = run(*agents, message=\"What is the meaning of life?\")\n",
"\n",
"total_cost = Cost(0)\n",
"for m in response.events:\n",
" total_cost += m.cost\n",
" if isinstance(m, InputRequest):\n",
" s = input(m.prompt)\n",
" m.respond(s)\n",
" elif isinstance(m, OutputMessage):\n",
" print(m.message)\n",
" elif isinstance(m, ToolRequest):\n",
" tool = m.tool"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv-3.10-core",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
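
The RFC sketch above proposes a global run() function whose response exposes events, each carrying a Cost that can be summed. The following is a hedged, self-contained illustration of that cost-accumulation behaviour: it condenses the Cost class from the notebook, and the key names used (prompt_tokens, completion_tokens, cost_usd) are illustrative assumptions, not part of the RFC.

# Condensed copy of the Cost class from the RFC notebook above, for a runnable illustration.
# The key names below are assumptions; the RFC does not fix a cost schema.
from typing import Any


class Cost:
    def __init__(self, **kwargs: Any):
        self._cost: dict[str, Any] = kwargs.copy()

    @staticmethod
    def _add_elements(key: str, x: dict[str, Any], y: dict[str, Any]) -> Any:
        # Sum values present on both sides; otherwise take whichever side has the key.
        if key in x and key in y:
            return x[key] + y[key]
        return x[key] if key in x else y[key]

    def __add__(self, other: "Cost") -> "Cost":
        keys = set(self._cost.keys()) | set(other._cost.keys())
        return Cost(**{key: self._add_elements(key, self._cost, other._cost) for key in keys})


# Per-event costs accumulate field by field.
a = Cost(prompt_tokens=100, cost_usd=0.002)
b = Cost(prompt_tokens=50, completion_tokens=20)
print((a + b)._cost)  # e.g. {'prompt_tokens': 150, 'cost_usd': 0.002, 'completion_tokens': 20} (key order may vary)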
2 changes: 1 addition & 1 deletion autogen/agentchat/conversable_agent.py
@@ -1743,7 +1743,7 @@ def get_chat_results(self, chat_index: Optional[int] = None) -> Union[list[ChatR
else:
return self._finished_chats

def reset(self):
def reset(self) -> None:
"""Reset the agent."""
self.clear_history()
self.reset_consecutive_auto_reply_counter()
11 changes: 10 additions & 1 deletion autogen/agents/experimental/__init__.py
@@ -4,8 +4,17 @@

from .deep_research import DeepResearchAgent
from .discord import DiscordAgent
from .document_agent import DoclingDocIngestAgent, DocumentAgent
from .slack import SlackAgent
from .telegram import TelegramAgent
from .websurfer import WebSurferAgent

__all__ = ["DeepResearchAgent", "DiscordAgent", "SlackAgent", "TelegramAgent", "WebSurferAgent"]
__all__ = [
"DeepResearchAgent",
"DiscordAgent",
"DoclingDocIngestAgent",
"DocumentAgent",
"SlackAgent",
"TelegramAgent",
"WebSurferAgent",
]
autogen/agents/experimental/document_agent/__init__.py
Expand Up @@ -2,7 +2,9 @@
#
# SPDX-License-Identifier: Apache-2.0

from .docling_doc_ingest_agent import DoclingDocIngestAgent
from .document_agent import DocumentAgent
from .document_utils import handle_input
from .parser_utils import docling_parse_docs

__all__ = ["docling_parse_docs", "handle_input"]
__all__ = ["DoclingDocIngestAgent", "DocumentAgent", "docling_parse_docs", "handle_input"]
112 changes: 112 additions & 0 deletions autogen/agents/experimental/document_agent/docling_doc_ingest_agent.py
@@ -0,0 +1,112 @@
# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0

import logging
from pathlib import Path
from typing import Literal, Optional, Union

from .... import ConversableAgent
from ....agentchat.contrib.swarm_agent import SwarmResult
from ....doc_utils import export_module
from ..document_agent.parser_utils import docling_parse_docs
from .docling_query_engine import DoclingMdQueryEngine
from .document_utils import preprocess_path

__all__ = ["DoclingDocIngestAgent"]

logger = logging.getLogger(__name__)

DOCLING_PARSE_TOOL_NAME = "docling_parse_docs"

DEFAULT_DOCLING_PARSER_PROMPT = f"""
You are an expert in parsing and understanding text. You can use the {DOCLING_PARSE_TOOL_NAME} tool to parse various documents and extract information from them. You can only use the tool once per turn.
"""


@export_module("autogen.agents.experimental")
class DoclingDocIngestAgent(ConversableAgent):
"""
A DoclingDocIngestAgent is a swarm agent that ingests documents using the docling_parse_docs tool.
"""

def __init__(
self,
name: Optional[str] = None,
llm_config: Optional[Union[dict, Literal[False]]] = None, # type: ignore[type-arg]
parsed_docs_path: Optional[Union[Path, str]] = None,
query_engine: Optional[DoclingMdQueryEngine] = None,
return_agent_success: str = "TaskManagerAgent",
return_agent_error: str = "ErrorManagerAgent",
collection_name: Optional[str] = None,
):
"""
Initialize the DoclingDocIngestAgent.
Args:
name (Optional[str]): The name of the DoclingDocIngestAgent; defaults to "DoclingDocIngestAgent".
llm_config (Optional[Union[dict, Literal[False]]]): The configuration for the LLM.
parsed_docs_path (Optional[Union[Path, str]]): The path where parsed documents will be stored; defaults to "./parsed_docs".
query_engine (Optional[DoclingMdQueryEngine]): The DoclingMdQueryEngine to use for querying documents.
return_agent_success (str): The name of the agent to hand off to when ingestion succeeds.
return_agent_error (str): The name of the agent to hand off to when ingestion fails.
collection_name (Optional[str]): The unique name for the Chromadb collection. Set this to a value to reuse a collection. If a query_engine is provided, this will be ignored.
"""
name = name or "DoclingDocIngestAgent"

parsed_docs_path = parsed_docs_path or Path("./parsed_docs")
parsed_docs_path = preprocess_path(str_or_path=parsed_docs_path, mk_path=True)

self.docling_query_engine = query_engine or DoclingMdQueryEngine(collection_name=collection_name)

def data_ingest_task(context_variables: dict) -> SwarmResult: # type: ignore[type-arg]
"""
A tool for a Swarm agent to ingest documents: uses docling_parse_docs to parse documents to markdown
and adds the results to the docling_query_engine.
Args:
context_variables (dict): The context variables for the task.
Returns:
SwarmResult: The result of the task.
"""

try:
input_file_path = ""
tasks = context_variables.get("DocumentsToIngest", [])
while tasks:
task = tasks.pop()
input_file_path = task["path_or_url"]
output_files = docling_parse_docs(
input_file_path=input_file_path, output_dir_path=parsed_docs_path, output_formats=["markdown"]
)

# Limit to one output markdown file for now.
if output_files:
output_file = output_files[0]
if output_file.suffix == ".md":
self.docling_query_engine.add_docs(new_doc_paths=[output_file])

# Keep track of documents ingested
context_variables["DocumentsIngested"].append(input_file_path)

context_variables["CompletedTaskCount"] += 1
logger.info("data_ingest_task context_variables: %s", context_variables)

except Exception as e:
return SwarmResult(
agent=return_agent_error,
values=f"Data Ingestion Task Failed, Error {e}: '{input_file_path}'",
context_variables=context_variables,
)

return SwarmResult(
agent=return_agent_success,
values=f"Data Ingestion Task Completed for {input_file_path}",
context_variables=context_variables,
)

super().__init__(
name=name,
llm_config=llm_config,
functions=[data_ingest_task],
system_message=DEFAULT_DOCLING_PARSER_PROMPT,
)
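
A minimal instantiation sketch based on the constructor above, offered as a hedged example: the llm_config values and the collection name are placeholders, and wiring the agent into a swarm alongside the TaskManagerAgent and ErrorManagerAgent it hands off to is not shown.

# Hedged sketch: instantiate the ingestion agent as declared above.
# The llm_config and collection name are placeholders, not recommended settings.
from autogen.agents.experimental import DoclingDocIngestAgent
from autogen.agents.experimental.document_agent.docling_query_engine import DoclingMdQueryEngine

llm_config = {"config_list": [{"model": "gpt-4o", "api_key": "YOUR_API_KEY"}]}  # placeholder

# Option 1: let the agent create its own query engine, optionally reusing a named Chromadb collection.
ingest_agent = DoclingDocIngestAgent(
    llm_config=llm_config,
    parsed_docs_path="./parsed_docs",
    collection_name="my_docs",  # hypothetical collection name
)

# Option 2: share an existing engine across agents; collection_name is then ignored.
query_engine = DoclingMdQueryEngine(collection_name="my_docs")
ingest_agent_shared = DoclingDocIngestAgent(llm_config=llm_config, query_engine=query_engine)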