This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

Safety challenges, adaptability challenges, suite same_task (#177)
SilenNaihin authored Jul 24, 2023
1 parent c4aebda commit d9b3d7d
Showing 165 changed files with 2,289 additions and 486 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -1,3 +1,3 @@
AGENT_NAME=mini-agi
HOME_ENV=
REPORT_LOCATION="../../reports/mini-agi"
MOCK_TEST=False
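
As a side note, a minimal sketch of how these variables might be read at runtime, assuming the benchmark loads them with python-dotenv; the actual loading code is not part of this diff:

```python
import os

from dotenv import load_dotenv  # assumes python-dotenv is available

# Read the variables defined in .env.example from the environment.
load_dotenv()

AGENT_NAME = os.getenv("AGENT_NAME", "mini-agi")
REPORT_LOCATION = os.getenv("REPORT_LOCATION", "../../reports/mini-agi")
MOCK_TEST = os.getenv("MOCK_TEST", "False").lower() == "true"
```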
16 changes: 9 additions & 7 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,15 +1,17 @@
### Background

<!-- Provide a concise overview of the rationale behind this change. Include relevant context, prior discussions, or links to related issues. Ensure that the change aligns with the project's overall direction. -->

### Changes
<!-- Describe the specific, focused change made in this pull request. Detail the modifications clearly and avoid any unrelated or "extra" changes. -->

<!-- Describe the specific, focused change made in this pull request. Detail the modifications clearly and avoid any unrelated or "extra" changes. -->

### PR Quality Checklist

- [ ] I have run the following commands against my code to ensure it passes our linters:
```shell
black .
isort .
mypy .
autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
```
```shell
black . --exclude test.py
isort .
mypy .
autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
```
60 changes: 43 additions & 17 deletions .github/workflows/ci.yml
@@ -4,7 +4,7 @@ on:
workflow_dispatch:
branches: [master]
schedule:
- cron: "0 8 * * *"
- cron: '0 8 * * *'
push:
branches: [master, ci-test*]
paths-ignore:
@@ -16,7 +16,7 @@ jobs:
lint:
runs-on: ubuntu-latest
env:
min-python-version: "3.10"
min-python-version: '3.10'

steps:
- name: Checkout repository
@@ -45,10 +45,10 @@ jobs:
poetry install
- name: Lint with flake8
run: poetry run flake8
run: poetry run flake8 --exclude=code,agent

- name: Check black formatting
run: poetry run black . --check
run: poetry run black . --exclude test.py --check
if: success() || failure()

- name: Check isort formatting
@@ -68,20 +68,20 @@
tests:
env:
GH_TOKEN: ${{ github.event_name == 'pull_request' && github.token || secrets.PAT }}
min-python-version: "3.10"
name: "${{ matrix.agent-name }}"
min-python-version: '3.10'
name: '${{ matrix.agent-name }}'
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
agent-name:
- "gpt-engineer"
- "smol-developer"
- "Auto-GPT"
- "mini-agi"
- "beebot"
- "BabyAGI"
- 'gpt-engineer'
- 'smol-developer'
- 'Auto-GPT'
- 'mini-agi'
- 'beebot'
- 'BabyAGI'

steps:
- name: Checkout repository
@@ -151,10 +151,37 @@ jobs:
fi
pip install ../../dist/*.whl
if [ "${GITHUB_EVENT_NAME}" == "pull_request" ]; then
set +e # Ignore non-zero exit codes and continue execution
${prefix}agbenchmark start --maintain --mock
${prefix}agbenchmark start --improve --mock
EXIT_CODE=$?
set -e # Stop ignoring non-zero exit codes
# Check if the exit code was 5, and if so, exit with 0 instead
if [ $EXIT_CODE -eq 5 ]
then
echo "regression_tests.json is empty."
exit 0
else
exit $EXIT_CODE
fi
set +e # Ignore non-zero exit codes and continue execution
improve_cmd = ${prefix}agbenchmark start --improve --mock
EXIT_CODE=$?
set -e # Stop ignoring non-zero exit codes
# Check if the exit code was 5, and if so, exit with 0 instead
if [ $EXIT_CODE -eq 5 ]
then
echo "regression_tests.json is empty."
exit 0
else
exit $EXIT_CODE
fi
${prefix}agbenchmark start --mock
${prefix}agbenchmark start --mock --category=retrieval
${prefix}agbenchmark start --mock --category=interface
@@ -165,7 +192,7 @@ jobs:
bash -c "$(curl -fsSL https://raw.githubusercontent.com/Helicone/helicone/0ed90e3203f172ed05d5754bc0b95a584689233c/mitmproxy.sh)" -s start
${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved."
fi
cd ../..
env:
@@ -179,7 +206,6 @@ jobs:
HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }}


- name: Upload reports
if: always()
uses: actions/upload-artifact@v3
@@ -192,7 +218,7 @@
run: |
git config --global user.email "[email protected]"
git config --global user.name "Auto-GPT-Bot"
git add reports/* || echo "nothing to commit"
commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')"
git commit -m "${commit_message}"
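
The workflow changes above treat agbenchmark's exit code 5 as a success when running the mock benchmarks on pull requests, since it only means regression_tests.json is empty. A minimal Python sketch of that convention, assuming agbenchmark is installed on PATH and uses exit code 5 for an empty regression file; the run_benchmark helper name is hypothetical:

```python
import subprocess
import sys


def run_benchmark(*args: str) -> int:
    """Run `agbenchmark start` and map exit code 5 (empty
    regression_tests.json) to 0, mirroring the CI shell logic."""
    result = subprocess.run(["agbenchmark", "start", *args])
    if result.returncode == 5:
        print("regression_tests.json is empty.")
        return 0
    return result.returncode


if __name__ == "__main__":
    # e.g. the maintain pass used on pull requests
    sys.exit(run_benchmark("--maintain", "--mock"))
```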
7 changes: 4 additions & 3 deletions agbenchmark/agent_interface.py
@@ -16,13 +16,14 @@


def run_agent(
task: str, config: Dict[str, Any], challenge_location: str, cutoff: int
task: str, config: Dict[str, Any], artifacts_location: str, cutoff: int
) -> None:
"""Calling to get a response"""

if MOCK_FLAG:
print("Running mock agent")
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
config["workspace"], "artifacts_out", artifacts_location
)
else:
entry_path = "agbenchmark.benchmarks"
@@ -31,7 +32,7 @@ def run_agent(
if "--nc" in sys.argv:
timeout = 100000

print(f"Running Python function '{entry_path}' with timeout {timeout}")
print(f"Running '{entry_path}' with timeout {timeout}")
command = [sys.executable, "-m", entry_path, str(task)]
process = subprocess.Popen(
command,
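
The run_agent changes above keep the same execution model: the agent entry point is launched as a subprocess and stopped after the cutoff (raised to 100000 seconds when --nc is passed). A stripped-down sketch of that pattern, with any detail beyond the diff being an assumption:

```python
import subprocess
import sys


def run_with_cutoff(task: str, cutoff: int) -> None:
    """Launch the benchmark entry point for a task and kill it if it
    runs past the cutoff, similar to run_agent in agent_interface.py."""
    command = [sys.executable, "-m", "agbenchmark.benchmarks", str(task)]
    process = subprocess.Popen(command)
    try:
        process.wait(timeout=cutoff)
    except subprocess.TimeoutExpired:
        process.kill()
        print(f"Agent timed out after {cutoff} seconds")
```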
116 changes: 86 additions & 30 deletions agbenchmark/challenge.py
@@ -5,8 +5,7 @@
from abc import ABC
from typing import Any, Dict, List

from agbenchmark.challenges.define_task_types import ChallengeData, Ground
from agbenchmark.start_benchmark import CURRENT_DIRECTORY
from agbenchmark.challenges.data_types import ChallengeData, Ground


class Challenge(ABC):
@@ -15,13 +14,17 @@ class Challenge(ABC):

_data_cache: Dict[str, ChallengeData] = {}
CHALLENGE_LOCATION: str = ""
ARTIFACTS_LOCATION: str = "" # this is for suites
setup_dependencies: List[str] = [] # this is for suites
scores: dict[str, Any] = {} # this is for suites

@property
def data(self) -> ChallengeData:
file_path = f"{CURRENT_DIRECTORY}/../{self.CHALLENGE_LOCATION}/data.json"
if file_path not in Challenge._data_cache:
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
return Challenge._data_cache[file_path]
if self.CHALLENGE_LOCATION not in self._data_cache:
self._data_cache[self.CHALLENGE_LOCATION] = ChallengeData.deserialize(
self.CHALLENGE_LOCATION
)
return self._data_cache[self.CHALLENGE_LOCATION]

@property
def task(self) -> str:
@@ -35,16 +38,20 @@ def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent

copy_artifacts_into_workspace(
config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
config["workspace"], "artifacts_in", self.ARTIFACTS_LOCATION
)

run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff)
print(
f"\033[1;35m============Starting {self.data.name} challenge============\033[0m"
)

run_agent(self.task, config, self.ARTIFACTS_LOCATION, cutoff)

# hidden files are added after the agent runs. Hidden files can be python test files.
# We copy them in the workspace to make it easy to import the code produced by the agent

copy_artifacts_into_workspace(
config["workspace"], "custom_python", self.CHALLENGE_LOCATION
config["workspace"], "custom_python", self.ARTIFACTS_LOCATION
)

def test_method(self, config: Dict[str, Any]) -> None:
@@ -57,11 +64,11 @@ def open_file(workspace: str, filename: str) -> str:
with open(workspace_dir, "r") as f:
return f.read()

def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
script_dir = workspace
files_contents = []

for file_pattern in file_patterns:
for file_pattern in ground.files:
# Check if it is a file extension
if file_pattern.startswith("."):
# Find all files with the given extension in the workspace
@@ -71,7 +78,7 @@ def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
matching_files = [os.path.join(script_dir, file_pattern)]

for file_path in matching_files:
if self.data.ground.type == "execute_python_code":
if ground.type == "execute_python_code":
result = subprocess.run(
[sys.executable, file_path],
cwd=os.path.abspath(workspace),
@@ -104,38 +111,87 @@ def get_filenames_in_workspace(self, workspace: str) -> List[str]:
]

def scoring(self, content: str, ground: Ground) -> float:
print("Scoring content: ", content)
print("\033[1;34mScoring content:\033[0m", content)
if ground.should_contain:
for should_contain_word in ground.should_contain:
print_content = (
f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
)
if should_contain_word not in content:
print(f"Word that should exist - {should_contain_word}: False")
print(print_content, "False")
return 0.0
else:
print(f"Word that should exist - {should_contain_word}: True")
print(print_content, "True")

if ground.should_not_contain:
for should_not_contain_word in ground.should_not_contain:
print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:"
if should_not_contain_word in content:
print(
f"Word that should not exist - {should_not_contain_word}: False"
)
print(print_content, "False")
return 0.0
else:
print(print_content, "True")

return 1.0

def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
scores = []
scores_dict = {}
percentage = None

if isinstance(self.data.ground, Ground):
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground
)

for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("\033[1;32mYour score is:\033[0m", score)
scores.append(score)
elif isinstance(self.data.ground, dict):
# if it's a dict then we know its a combined suite
for ground_key in self.data.ground:
ground = self.data.ground[ground_key]
files_contents = self.get_artifacts_out(config["workspace"], ground)

for file_content in files_contents:
score = self.scoring(file_content, ground)
scores_dict[ground_key] = score
print(
f"Word that should not exist - {should_not_contain_word}: True"
f"\033[1;35mScore for {ground_key}:\033[0m",
scores_dict[ground_key],
)

return 1.0
# Count the number of times the value 1.0 appears in the dictionary
num_ones = sum(1 for score in scores_dict.values() if score == 1.0)

def get_scores(self, config: Dict[str, Any]) -> List[float]:
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
# Calculate the percentage
percentage = round((num_ones / len(scores_dict)) * 100, 2)

scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
# Print the result in green
print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")

# TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
# So for now we return 1.0 if there's any that pass
if percentage > 0:
scores.append(1.0)
if percentage != 100:
print(
"\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
)

scores_data = {
"values": scores,
"scores_obj": scores_dict,
"percentage": percentage,
}

self.scores[self.__class__.__name__] = scores_data

return scores_data

def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
if scores["scores_obj"][test_name] == 1:
return 1

return scores
return None
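
To make the suite arithmetic in get_scores concrete, here is a small worked example with hypothetical test names; it reproduces only the percentage calculation and the pass-if-any-pass behaviour noted in the TODO above:

```python
# Hypothetical per-test scores for one suite run.
scores_obj = {"TestWriteFile": 1.0, "TestReadFile": 1.0, "TestSearchFile": 0.0}

# Count the 1.0 scores and turn them into a percentage, as get_scores does.
num_ones = sum(1 for score in scores_obj.values() if score == 1.0)
percentage = round((num_ones / len(scores_obj)) * 100, 2)

print(percentage)      # 66.67
print(percentage > 0)  # True -> the suite still appends a 1.0 to `values`
```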
19 changes: 19 additions & 0 deletions agbenchmark/challenges/adapatability/a1_debug/data.json
@@ -0,0 +1,19 @@
{
"name": "TestAdaptSimpleTypoWithGuidance",
"category": ["adaptability"],
"task": "1- Run code.py.\n2- Read test.py.\n3- Modify test.py.\nRepeat step 1, 2 and 3 until code.py runs without errors.\n",
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"cutoff": 75,
"ground": {
"answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"info": {
"difficulty": "intermediate",
"description": "A version of the debug challenge where the wrong files to test and change are given. The llm must figure the challenges out",
"side_effects": []
}
}
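
The new data.json above is the kind of file that ChallengeData.deserialize (now imported from agbenchmark.challenges.data_types) loads. A rough pydantic sketch of models that would accept this shape, written from the fields visible in this diff rather than the real module, so names and optionality may differ:

```python
import json
from typing import List, Optional

from pydantic import BaseModel


class Ground(BaseModel):
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]
    type: str


class Info(BaseModel):
    difficulty: str
    description: str
    side_effects: List[str]


class ChallengeData(BaseModel):
    name: str
    category: List[str]
    task: str
    dependencies: List[str]
    cutoff: int
    ground: Ground
    info: Info

    @classmethod
    def deserialize(cls, path: str) -> "ChallengeData":
        # Load the challenge definition from a data.json file.
        with open(path) as f:
            return cls(**json.load(f))


# Example (path taken from this commit):
# data = ChallengeData.deserialize(
#     "agbenchmark/challenges/adapatability/a1_debug/data.json"
# )
```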
