Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

Commit

Permalink
Added --test, consolidate files, reports working (#83)
Browse files Browse the repository at this point in the history
  • Loading branch information
SilenNaihin authored Jul 11, 2023
1 parent 437e066 commit 8df8290
Show file tree
Hide file tree
Showing 18 changed files with 277 additions and 90 deletions.
21 changes: 19 additions & 2 deletions agbenchmark/RegressionManager.py → agbenchmark/ReportManager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import json
from typing import Union
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union


class RegressionManager:
class ReportManager:
"""Abstracts interaction with the regression tests file"""

def __init__(self, filename: str):
    """Store the backing file name, start the run timer and call ``load()``.

    Args:
        filename: Path of the file this manager works with.
    """
    # Timestamp is captured before loading; end_info_report() subtracts it
    # from the current time to report how long the run took.
    self.start_time = time.time()
    self.filename = filename
    self.load()

def load(self) -> None:
Expand Down Expand Up @@ -40,6 +45,18 @@ def remove_test(self, test_name: str) -> None:
del self.tests[test_name]
self.save()

def end_info_report(self, config: Dict[str, Any]) -> None:
    """Wrap the collected test results in a run summary and save it.

    ``self.tests`` is replaced by a dictionary containing the invoking
    command, the completion timestamp, the wall-clock time elapsed since
    ``self.start_time``, the previously collected results (under the
    ``"tests"`` key) and the supplied ``config``. ``self.save()`` is then
    called.

    Args:
        config: Benchmark configuration to embed in the report.
    """
    # Only the executable (argv[0]) should lose its directory prefix.
    # Splitting the whole joined command line on os.sep would also cut off
    # everything before the last separator found inside any *argument*
    # (e.g. "--test challenges/m1" would be recorded as just "m1").
    argv = list(sys.argv)
    if argv:
        argv[0] = os.path.basename(argv[0])
    command = " ".join(argv)

    elapsed = round(time.time() - self.start_time, 2)
    self.tests = {
        "command": command,
        "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
        "time_elapsed": str(elapsed) + " seconds",
        "tests": self.tests,
        "config": config,
    }

    self.save()

def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/") # escape \ with \\
Expand Down
14 changes: 6 additions & 8 deletions agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import subprocess
import sys
import time
from pathlib import Path
from typing import Any, Dict

from dotenv import load_dotenv
Expand All @@ -21,6 +22,7 @@ def run_agent(
"""Calling to get a response"""

if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
Expand All @@ -30,19 +32,13 @@ def run_agent(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)

# Get the current working directory
cwd = os.path.join(os.getcwd(), config["home_path"])

# Add current directory to Python's import path
sys.path.append(cwd)

command = [sys.executable, config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=cwd,
cwd=os.getcwd(),
)

start_time = time.time()
Expand Down Expand Up @@ -79,7 +75,9 @@ def run_agent(
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
# this file is at agbenchmark\agent_interface.py
script_dir = Path(__file__).resolve().parent.parent
source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)

# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
Expand Down
6 changes: 6 additions & 0 deletions agbenchmark/challenges/define_task_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from pathlib import Path
from typing import List, Optional

from pydantic import BaseModel
Expand Down Expand Up @@ -32,7 +33,12 @@ def serialize(self, path: str) -> None:

@staticmethod
def deserialize(path: str) -> "ChallengeData":
    """Load a challenge definition from a JSON file.

    Args:
        path: Location of the JSON file. It is joined onto the directory
            three levels above this module (the repository root); with
            ``pathlib`` an absolute ``path`` replaces that prefix.

    Returns:
        A ``ChallengeData`` built from the top-level keys of the JSON object.
    """
    # this script is in root/agbenchmark/challenges/define_task_types.py
    script_dir = Path(__file__).resolve().parent.parent.parent
    path = str(script_dir / path)

    print("Deserializing", path)

    # JSON text is UTF-8; do not rely on the platform's locale encoding,
    # which would mis-decode non-ASCII task text on e.g. Windows.
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return ChallengeData(**data)
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/search/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "TestSearch",
"category": ["interface"],
"task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "This is a Heading\nThis is a paragraph.",
"should_contain": ["Heading", "paragraph"],
Expand Down
4 changes: 2 additions & 2 deletions agbenchmark/challenges/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
IMPROVE = os.getenv("IMPROVE", "False")


json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)


def get_test_path(json_file: str) -> str:
Expand Down Expand Up @@ -55,7 +55,7 @@ def generate_tests() -> None:
)
sys.path.append(str(custom_python_location))

for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
Expand Down
3 changes: 1 addition & 2 deletions config.json → agbenchmark/config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
{
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "benchmarks.py",
"home_path": "agent/mini-agi",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}
19 changes: 15 additions & 4 deletions agbenchmark/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

import pytest

from agbenchmark.RegressionManager import RegressionManager
from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
INFO_TESTS_PATH,
REGRESSION_TESTS_PATH,
get_regression_data,
)
Expand Down Expand Up @@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
return request.param


regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
regression_manager = ReportManager(REGRESSION_TESTS_PATH)
info_manager = ReportManager(INFO_TESTS_PATH)


def pytest_runtest_makereport(item: Any, call: Any) -> None:
Expand All @@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
print("pytest_runtest_makereport", test_details)
if call.excinfo is None:
regression_manager.add_test(item.nodeid.split("::")[1], test_details)
test_details["success"] = True
else:
regression_manager.remove_test(item.nodeid.split("::")[1])
test_details["success"] = False
test_details["fail_reason"] = str(call.excinfo.value)

info_manager.add_test(item.nodeid.split("::")[1], test_details)

def pytest_sessionfinish() -> None:
"""Called at the end of the session to save regression tests"""

def pytest_sessionfinish(session: Any) -> None:
    """Write the info report and the regression file when the session ends.

    The JSON file at ``CONFIG_PATH`` is read and passed to
    ``info_manager.end_info_report``; afterwards ``regression_manager.save()``
    is called. ``session`` is accepted but not used.
    """
    with open(CONFIG_PATH, "r") as config_file:
        loaded_config = json.load(config_file)

    # Info report first, then the regression file.
    info_manager.end_info_report(loaded_config)
    regression_manager.save()


Expand Down
99 changes: 54 additions & 45 deletions regression_tests.json → agbenchmark/regression_tests.json
Original file line number Diff line number Diff line change
@@ -1,90 +1,99 @@
{
"TestBasicMemory": {
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1"
},
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1"
},
"TestCreateSimpleWebServer": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d3"
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1"
"test": "agbenchmark/challenges/memory/m1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/code/d2"
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
},
"TestReadFile": {
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
"TestBasicMemory"
],
"test": "agbenchmark/challenges/interface/read_file"
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRememberMultipleIds": {
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/memory/m2"
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3"
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
"TestRetrieval2"
],
"test": "agbenchmark/challenges/memory/m4"
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestBasicRetrieval"
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/retrieval/r2"
"test": "agbenchmark/challenges/memory/m4",
"success": true
},
"TestRetrieval3": {
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
"TestWriteFile"
],
"test": "agbenchmark/challenges/retrieval/r3"
"test": "agbenchmark/challenges/interface/search",
"success": true
},
"TestSearch": {
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/search"
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestWriteFile": {
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file"
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2",
"success": true
}
}
Loading

0 comments on commit 8df8290

Please sign in to comment.