Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

added --test, consolidate files, reports working #83

Merged
merged 4 commits into from
Jul 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions agbenchmark/RegressionManager.py → agbenchmark/ReportManager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import json
from typing import Union
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union


class RegressionManager:
class ReportManager:
"""Abstracts interaction with the regression tests file"""

def __init__(self, filename: str):
self.filename = filename
self.start_time = time.time()
self.load()

def load(self) -> None:
Expand Down Expand Up @@ -40,6 +45,18 @@ def remove_test(self, test_name: str) -> None:
del self.tests[test_name]
self.save()

def end_info_report(self, config: Dict[str, Any]) -> None:
    """Wrap the collected tests in a run summary and persist it.

    Replaces ``self.tests`` with a dict that records the invoking command,
    the completion timestamp, the wall-clock time elapsed since
    ``self.start_time``, the previously collected tests and *config*, then
    calls ``self.save()``.

    Args:
        config: Benchmark configuration stored under the ``"config"`` key.
    """
    # Strip the directory from the program path only. Splitting the joined
    # command line on os.sep would also cut at any path separator that
    # appears inside a later argument (e.g. "--test a/b" -> "b").
    argv = list(sys.argv) or [""]
    command = " ".join([os.path.basename(argv[0]), *argv[1:]])

    self.tests = {
        "command": command,
        "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
        "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
        "tests": self.tests,
        "config": config,
    }

    self.save()

def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/") # escape \ with \\
Expand Down
14 changes: 6 additions & 8 deletions agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import subprocess
import sys
import time
from pathlib import Path
from typing import Any, Dict

from dotenv import load_dotenv
Expand All @@ -21,6 +22,7 @@ def run_agent(
"""Calling to get a response"""

if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
Expand All @@ -30,19 +32,13 @@ def run_agent(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)

# Get the current working directory
cwd = os.path.join(os.getcwd(), config["home_path"])

# Add current directory to Python's import path
sys.path.append(cwd)

command = [sys.executable, config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=cwd,
cwd=os.getcwd(),
)

start_time = time.time()
Expand Down Expand Up @@ -79,7 +75,9 @@ def run_agent(
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
# this file is at agbenchmark\agent_interface.py
script_dir = Path(__file__).resolve().parent.parent
source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name)

# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
Expand Down
6 changes: 6 additions & 0 deletions agbenchmark/challenges/define_task_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from pathlib import Path
from typing import List, Optional

from pydantic import BaseModel
Expand Down Expand Up @@ -32,7 +33,12 @@ def serialize(self, path: str) -> None:

@staticmethod
def deserialize(path: str) -> "ChallengeData":
    """Load a ``ChallengeData`` from the JSON file at *path*.

    *path* is joined onto the third ancestor directory of this module
    before opening, so relative paths do not depend on the current
    working directory.

    Args:
        path: Location of the JSON file, relative to that directory.

    Returns:
        A ``ChallengeData`` built from the file's top-level JSON object.
    """
    # this script is in root/agbenchmark/challenges/define_task_types.py
    script_dir = Path(__file__).resolve().parent.parent.parent
    path = str(script_dir / path)

    print("Deserializing", path)

    # JSON text is UTF-8 by specification; do not rely on the
    # locale-dependent default encoding of open().
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return ChallengeData(**data)
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/search/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "TestSearch",
"category": ["interface"],
"task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "This is a Heading\nThis is a paragraph.",
"should_contain": ["Heading", "paragraph"],
Expand Down
4 changes: 2 additions & 2 deletions agbenchmark/challenges/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
IMPROVE = os.getenv("IMPROVE", "False")


json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True)
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)


def get_test_path(json_file: str) -> str:
Expand Down Expand Up @@ -55,7 +55,7 @@ def generate_tests() -> None:
)
sys.path.append(str(custom_python_location))

for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
Expand Down
3 changes: 1 addition & 2 deletions config.json → agbenchmark/config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
{
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "benchmarks.py",
"home_path": "agent/mini-agi",
"entry_path": "agbenchmark/benchmarks.py",
"cutoff": 60
}
19 changes: 15 additions & 4 deletions agbenchmark/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

import pytest

from agbenchmark.RegressionManager import RegressionManager
from agbenchmark.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
INFO_TESTS_PATH,
REGRESSION_TESTS_PATH,
get_regression_data,
)
Expand Down Expand Up @@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None:
return request.param


regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
regression_manager = ReportManager(REGRESSION_TESTS_PATH)
info_manager = ReportManager(INFO_TESTS_PATH)


def pytest_runtest_makereport(item: Any, call: Any) -> None:
Expand All @@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
print("pytest_runtest_makereport", test_details)
if call.excinfo is None:
regression_manager.add_test(item.nodeid.split("::")[1], test_details)
test_details["success"] = True
else:
regression_manager.remove_test(item.nodeid.split("::")[1])
test_details["success"] = False
test_details["fail_reason"] = str(call.excinfo.value)

info_manager.add_test(item.nodeid.split("::")[1], test_details)

def pytest_sessionfinish() -> None:
"""Called at the end of the session to save regression tests"""

def pytest_sessionfinish(session: Any) -> None:
    """Finalize the info report and the regression file at session end.

    Reads the JSON config at ``CONFIG_PATH``, passes it to
    ``info_manager.end_info_report`` and then saves ``regression_manager``.
    The *session* argument is accepted but not used.
    """
    with open(CONFIG_PATH, "r") as config_file:
        benchmark_config = json.loads(config_file.read())

    info_manager.end_info_report(benchmark_config)
    regression_manager.save()


Expand Down
99 changes: 54 additions & 45 deletions regression_tests.json → agbenchmark/regression_tests.json
Original file line number Diff line number Diff line change
@@ -1,90 +1,99 @@
{
"TestBasicMemory": {
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1"
},
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1"
},
"TestCreateSimpleWebServer": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d3"
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1"
"test": "agbenchmark/challenges/memory/m1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/code/d2"
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
},
"TestReadFile": {
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
"TestBasicMemory"
],
"test": "agbenchmark/challenges/interface/read_file"
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRememberMultipleIds": {
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/memory/m2"
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3"
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
"TestRetrieval2"
],
"test": "agbenchmark/challenges/memory/m4"
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestBasicRetrieval"
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/retrieval/r2"
"test": "agbenchmark/challenges/memory/m4",
"success": true
},
"TestRetrieval3": {
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
"TestWriteFile"
],
"test": "agbenchmark/challenges/retrieval/r3"
"test": "agbenchmark/challenges/interface/search",
"success": true
},
"TestSearch": {
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/search"
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestWriteFile": {
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file"
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2",
"success": true
}
}
Loading