Added --test, consolidate files, reports working (#83)

Significant-Gravitas · Jul 11, 2023 · 8df8290 · 8df8290
1 parent 437e066
commit 8df8290
Show file tree

Hide file tree

Showing 18 changed files with 277 additions and 90 deletions.
diff --git a/agbenchmark/RegressionManager.py → agbenchmark/ReportManager.py b/agbenchmark/RegressionManager.py → agbenchmark/ReportManager.py
@@ -1,12 +1,17 @@
 import json
-from typing import Union
+import os
+import sys
+import time
+from datetime import datetime
+from typing import Any, Dict, Union
 
 
-class RegressionManager:
+class ReportManager:
     """Abstracts interaction with the regression tests file"""
 
     def __init__(self, filename: str):
         self.filename = filename
+        self.start_time = time.time()
         self.load()
 
     def load(self) -> None:
@@ -40,6 +45,18 @@ def remove_test(self, test_name: str) -> None:
             del self.tests[test_name]
             self.save()
 
+    def end_info_report(self, config: Dict[str, Any]) -> None:
+        command = " ".join(sys.argv)
+        self.tests = {
+            "command": command.split(os.sep)[-1],
+            "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
+            "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
+            "tests": self.tests,
+            "config": config,
+        }
+
+        self.save()
+
     def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
         if isinstance(value, str):
             return value.replace("\\\\", "/")  # escape \ with \\

diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
@@ -3,6 +3,7 @@
 import subprocess
 import sys
 import time
+from pathlib import Path
 from typing import Any, Dict
 
 from dotenv import load_dotenv
@@ -21,6 +22,7 @@ def run_agent(
     """Calling to get a response"""
 
     if MOCK_FLAG:
+        print("ITS A MOCK TEST", challenge_location)
         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_out", challenge_location
         )
@@ -30,19 +32,13 @@ def run_agent(
             f"Running Python function '{config['entry_path']}' with timeout {timeout}"
         )
 
-        # Get the current working directory
-        cwd = os.path.join(os.getcwd(), config["home_path"])
-
-        # Add current directory to Python's import path
-        sys.path.append(cwd)
-
         command = [sys.executable, config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             universal_newlines=True,
-            cwd=cwd,
+            cwd=os.getcwd(),
         )
 
         start_time = time.time()
@@ -79,7 +75,9 @@ def run_agent(
 def copy_artifacts_into_workspace(
     workspace: str, artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
-    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
+    # this file is at agbenchmark/agent_interface.py, so parent.parent is the repository root
+    script_dir = Path(__file__).resolve().parent.parent
+    source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)
 
     # Check if source_dir exists, if not then return immediately.
     if not os.path.exists(source_dir):

diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
@@ -1,4 +1,5 @@
 import json
+from pathlib import Path
 from typing import List, Optional
 
 from pydantic import BaseModel
@@ -32,7 +33,12 @@ def serialize(self, path: str) -> None:
 
     @staticmethod
     def deserialize(path: str) -> "ChallengeData":
+        # this script is at root/agbenchmark/challenges/define_task_types.py, so three parents up is root
+        script_dir = Path(__file__).resolve().parent.parent.parent
+        path = str(script_dir / path)
+
         print("Deserializing", path)
+
         with open(path, "r") as file:
             data = json.load(file)
         return ChallengeData(**data)
diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json
@@ -2,7 +2,7 @@
   "name": "TestSearch",
   "category": ["interface"],
   "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
-  "dependencies": [],
+  "dependencies": ["TestWriteFile"],
   "ground": {
     "answer": "This is a Heading\nThis is a paragraph.",
     "should_contain": ["Heading", "paragraph"],

diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py
@@ -19,7 +19,7 @@
 IMPROVE = os.getenv("IMPROVE", "False")
 
 
-json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
+json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)
 
 
 def get_test_path(json_file: str) -> str:
@@ -55,7 +55,7 @@ def generate_tests() -> None:
             )
             sys.path.append(str(custom_python_location))
 
-            for (module_loader, name, ispkg) in pkgutil.iter_modules(
+            for module_loader, name, ispkg in pkgutil.iter_modules(
                 [str(custom_python_location)]
             ):
                 module = importlib.import_module(name)

diff --git a/config.json → agbenchmark/config.json b/config.json → agbenchmark/config.json
@@ -1,6 +1,5 @@
 {
   "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-  "entry_path": "benchmarks.py",
-  "home_path": "agent/mini-agi",
+  "entry_path": "agbenchmark/benchmarks.py",
   "cutoff": 60
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
@@ -6,9 +6,10 @@
 
 import pytest
 
-from agbenchmark.RegressionManager import RegressionManager
+from agbenchmark.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
+    INFO_TESTS_PATH,
     REGRESSION_TESTS_PATH,
     get_regression_data,
 )
@@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
     return request.param
 
 
-regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
+regression_manager = ReportManager(REGRESSION_TESTS_PATH)
+info_manager = ReportManager(INFO_TESTS_PATH)
 
 
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
@@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         print("pytest_runtest_makereport", test_details)
         if call.excinfo is None:
             regression_manager.add_test(item.nodeid.split("::")[1], test_details)
+            test_details["success"] = True
         else:
             regression_manager.remove_test(item.nodeid.split("::")[1])
+            test_details["success"] = False
+            test_details["fail_reason"] = str(call.excinfo.value)
 
+        info_manager.add_test(item.nodeid.split("::")[1], test_details)
 
-def pytest_sessionfinish() -> None:
-    """Called at the end of the session to save regression tests"""
+
+def pytest_sessionfinish(session: Any) -> None:
+    """Called at the end of the session to save the regression tests and the info report"""
+    with open(CONFIG_PATH, "r") as f:
+        config = json.load(f)
+
+    info_manager.end_info_report(config)
     regression_manager.save()
 
 

diff --git a/regression_tests.json → agbenchmark/regression_tests.json b/regression_tests.json → agbenchmark/regression_tests.json
@@ -1,90 +1,99 @@
 {
-    "TestBasicMemory": {
+    "TestReadFile": {
         "difficulty": "basic",
         "dependencies": [
-            "TestReadFile",
             "TestWriteFile"
         ],
-        "test": "agbenchmark/challenges/memory/m1"
-    },
-    "TestBasicRetrieval": {
-        "difficulty": "basic",
-        "dependencies": [
-            "TestWriteFile",
-            "TestSearch"
-        ],
-        "test": "agbenchmark/challenges/retrieval/r1"
-    },
-    "TestCreateSimpleWebServer": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/code/d3"
+        "test": "agbenchmark/challenges/interface/read_file",
+        "success": true
     },
-    "TestDebugSimpleTypoWithGuidance": {
+    "TestBasicMemory": {
         "difficulty": "basic",
         "dependencies": [
             "TestReadFile",
             "TestWriteFile"
         ],
-        "test": "agbenchmark/challenges/code/d1"
+        "test": "agbenchmark/challenges/memory/m1",
+        "success": true
     },
-    "TestDebugSimpleTypoWithoutGuidance": {
-        "difficulty": "medium",
+    "TestBasicRetrieval": {
+        "difficulty": "basic",
         "dependencies": [
-            "TestDebugSimpleTypoWithGuidance"
+            "TestWriteFile",
+            "TestSearch"
         ],
-        "test": "agbenchmark/challenges/code/d2"
+        "test": "agbenchmark/challenges/retrieval/r1",
+        "success": true
     },
-    "TestReadFile": {
+    "TestRememberMultipleIds": {
         "difficulty": "basic",
         "dependencies": [
-            "TestWriteFile"
+            "TestBasicMemory"
         ],
-        "test": "agbenchmark/challenges/interface/read_file"
+        "test": "agbenchmark/challenges/memory/m2",
+        "success": true
     },
-    "TestRememberMultipleIds": {
+    "TestRetrieval2": {
         "difficulty": "basic",
         "dependencies": [
-            "TestBasicMemory"
+            "TestBasicRetrieval"
         ],
-        "test": "agbenchmark/challenges/memory/m2"
+        "test": "agbenchmark/challenges/retrieval/r2",
+        "success": true
     },
     "TestRememberMultipleIdsWithNoise": {
         "difficulty": "medium",
         "dependencies": [
             "TestRememberMultipleIds"
         ],
-        "test": "agbenchmark/challenges/memory/m3"
+        "test": "agbenchmark/challenges/memory/m3",
+        "success": true
     },
-    "TestRememberMultiplePhrasesWithNoise": {
-        "difficulty": "medium",
+    "TestRetrieval3": {
+        "difficulty": "basic",
         "dependencies": [
-            "TestRememberMultipleIdsWithNoise"
+            "TestRetrieval2"
         ],
-        "test": "agbenchmark/challenges/memory/m4"
+        "test": "agbenchmark/challenges/retrieval/r3",
+        "success": true
     },
-    "TestRetrieval2": {
-        "difficulty": "basic",
+    "TestRememberMultiplePhrasesWithNoise": {
+        "difficulty": "medium",
         "dependencies": [
-            "TestBasicRetrieval"
+            "TestRememberMultipleIdsWithNoise"
         ],
-        "test": "agbenchmark/challenges/retrieval/r2"
+        "test": "agbenchmark/challenges/memory/m4",
+        "success": true
     },
-    "TestRetrieval3": {
+    "TestSearch": {
         "difficulty": "basic",
         "dependencies": [
-            "TestRetrieval2"
+            "TestWriteFile"
         ],
-        "test": "agbenchmark/challenges/retrieval/r3"
+        "test": "agbenchmark/challenges/interface/search",
+        "success": true
     },
-    "TestSearch": {
+    "TestWriteFile": {
         "difficulty": "basic",
         "dependencies": [],
-        "test": "agbenchmark/challenges/interface/search"
+        "test": "agbenchmark/challenges/interface/write_file",
+        "success": true
     },
-    "TestWriteFile": {
+    "TestDebugSimpleTypoWithGuidance": {
         "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/challenges/interface/write_file"
+        "dependencies": [
+            "TestReadFile",
+            "TestWriteFile"
+        ],
+        "test": "agbenchmark/challenges/code/d1",
+        "success": true
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+        "difficulty": "medium",
+        "dependencies": [
+            "TestDebugSimpleTypoWithGuidance"
+        ],
+        "test": "agbenchmark/challenges/code/d2",
+        "success": true
     }
 }