This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

Safety challenges, adaptability challenges, suite same_task (#177)
SilenNaihin authored Jul 24, 2023
1 parent c4aebda commit d9b3d7d
Showing 165 changed files with 2,289 additions and 486 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -1,3 +1,3 @@
AGENT_NAME=mini-agi
HOME_ENV=
REPORT_LOCATION="../../reports/mini-agi"
MOCK_TEST=False
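
As a side note, a minimal sketch of how these variables might be read at runtime, assuming the benchmark loads them with python-dotenv; the actual loading code is not part of this diff:

```python
import os

from dotenv import load_dotenv  # assumes python-dotenv is available

# Read the variables defined in .env.example from the environment.
load_dotenv()

AGENT_NAME = os.getenv("AGENT_NAME", "mini-agi")
REPORT_LOCATION = os.getenv("REPORT_LOCATION", "../../reports/mini-agi")
MOCK_TEST = os.getenv("MOCK_TEST", "False").lower() == "true"
```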
16 changes: 9 additions & 7 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,15 +1,17 @@
### Background

<!-- Provide a concise overview of the rationale behind this change. Include relevant context, prior discussions, or links to related issues. Ensure that the change aligns with the project's overall direction. -->

### Changes
<!-- Describe the specific, focused change made in this pull request. Detail the modifications clearly and avoid any unrelated or "extra" changes. -->

<!-- Describe the specific, focused change made in this pull request. Detail the modifications clearly and avoid any unrelated or "extra" changes. -->

### PR Quality Checklist

- [ ] I have run the following commands against my code to ensure it passes our linters:
```shell
black .
isort .
mypy .
autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
```
```shell
black . --exclude test.py
isort .
mypy .
autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark
```
60 changes: 43 additions & 17 deletions .github/workflows/ci.yml
@@ -4,7 +4,7 @@ on:
workflow_dispatch:
branches: [master]
schedule:
- cron: "0 8 * * *"
- cron: '0 8 * * *'
push:
branches: [master, ci-test*]
paths-ignore:
@@ -16,7 +16,7 @@ jobs:
lint:
runs-on: ubuntu-latest
env:
min-python-version: "3.10"
min-python-version: '3.10'

steps:
- name: Checkout repository
@@ -45,10 +45,10 @@ jobs:
poetry install
- name: Lint with flake8
run: poetry run flake8
run: poetry run flake8 --exclude=code,agent

- name: Check black formatting
run: poetry run black . --check
run: poetry run black . --exclude test.py --check
if: success() || failure()

- name: Check isort formatting
@@ -68,20 +68,20 @@
tests:
env:
GH_TOKEN: ${{ github.event_name == 'pull_request' && github.token || secrets.PAT }}
min-python-version: "3.10"
name: "${{ matrix.agent-name }}"
min-python-version: '3.10'
name: '${{ matrix.agent-name }}'
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
agent-name:
- "gpt-engineer"
- "smol-developer"
- "Auto-GPT"
- "mini-agi"
- "beebot"
- "BabyAGI"
- 'gpt-engineer'
- 'smol-developer'
- 'Auto-GPT'
- 'mini-agi'
- 'beebot'
- 'BabyAGI'

steps:
- name: Checkout repository
@@ -151,10 +151,37 @@ jobs:
fi
pip install ../../dist/*.whl
if [ "${GITHUB_EVENT_NAME}" == "pull_request" ]; then
set +e # Ignore non-zero exit codes and continue execution
${prefix}agbenchmark start --maintain --mock
${prefix}agbenchmark start --improve --mock
EXIT_CODE=$?
set -e # Stop ignoring non-zero exit codes
# Check if the exit code was 5, and if so, exit with 0 instead
if [ $EXIT_CODE -eq 5 ]
then
echo "regression_tests.json is empty."
exit 0
else
exit $EXIT_CODE
fi
set +e # Ignore non-zero exit codes and continue execution
improve_cmd = ${prefix}agbenchmark start --improve --mock
EXIT_CODE=$?
set -e # Stop ignoring non-zero exit codes
# Check if the exit code was 5, and if so, exit with 0 instead
if [ $EXIT_CODE -eq 5 ]
then
echo "regression_tests.json is empty."
exit 0
else
exit $EXIT_CODE
fi
${prefix}agbenchmark start --mock
${prefix}agbenchmark start --mock --category=retrieval
${prefix}agbenchmark start --mock --category=interface
@@ -165,7 +192,7 @@ jobs:
bash -c "$(curl -fsSL https://raw.githubusercontent.com/Helicone/helicone/0ed90e3203f172ed05d5754bc0b95a584689233c/mitmproxy.sh)" -s start
${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved."
fi
cd ../..
env:
@@ -179,7 +206,6 @@ jobs:
HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }}


- name: Upload reports
if: always()
uses: actions/upload-artifact@v3
@@ -192,7 +218,7 @@
run: |
git config --global user.email "[email protected]"
git config --global user.name "Auto-GPT-Bot"
git add reports/* || echo "nothing to commit"
commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')"
git commit -m "${commit_message}"
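
The workflow changes above treat agbenchmark's exit code 5 as a success when running the mock benchmarks on pull requests, since it only means regression_tests.json is empty. A minimal Python sketch of that convention, assuming agbenchmark is installed on PATH and uses exit code 5 for an empty regression file; the run_benchmark helper name is hypothetical:

```python
import subprocess
import sys


def run_benchmark(*args: str) -> int:
    """Run `agbenchmark start` and map exit code 5 (empty
    regression_tests.json) to 0, mirroring the CI shell logic."""
    result = subprocess.run(["agbenchmark", "start", *args])
    if result.returncode == 5:
        print("regression_tests.json is empty.")
        return 0
    return result.returncode


if __name__ == "__main__":
    # e.g. the maintain pass used on pull requests
    sys.exit(run_benchmark("--maintain", "--mock"))
```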
7 changes: 4 additions & 3 deletions agbenchmark/agent_interface.py
@@ -16,13 +16,14 @@


def run_agent(
task: str, config: Dict[str, Any], challenge_location: str, cutoff: int
task: str, config: Dict[str, Any], artifacts_location: str, cutoff: int
) -> None:
"""Calling to get a response"""

if MOCK_FLAG:
print("Running mock agent")
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
config["workspace"], "artifacts_out", artifacts_location
)
else:
entry_path = "agbenchmark.benchmarks"
@@ -31,7 +32,7 @@ def run_agent(
if "--nc" in sys.argv:
timeout = 100000

print(f"Running Python function '{entry_path}' with timeout {timeout}")
print(f"Running '{entry_path}' with timeout {timeout}")
command = [sys.executable, "-m", entry_path, str(task)]
process = subprocess.Popen(
command,
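
The run_agent changes above keep the same execution model: the agent entry point is launched as a subprocess and stopped after the cutoff (raised to 100000 seconds when --nc is passed). A stripped-down sketch of that pattern, with any detail beyond the diff being an assumption:

```python
import subprocess
import sys


def run_with_cutoff(task: str, cutoff: int) -> None:
    """Launch the benchmark entry point for a task and kill it if it
    runs past the cutoff, similar to run_agent in agent_interface.py."""
    command = [sys.executable, "-m", "agbenchmark.benchmarks", str(task)]
    process = subprocess.Popen(command)
    try:
        process.wait(timeout=cutoff)
    except subprocess.TimeoutExpired:
        process.kill()
        print(f"Agent timed out after {cutoff} seconds")
```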
116 changes: 86 additions & 30 deletions agbenchmark/challenge.py
@@ -5,8 +5,7 @@
from abc import ABC
from typing import Any, Dict, List

from agbenchmark.challenges.define_task_types import ChallengeData, Ground
from agbenchmark.start_benchmark import CURRENT_DIRECTORY
from agbenchmark.challenges.data_types import ChallengeData, Ground


class Challenge(ABC):
@@ -15,13 +14,17 @@ class Challenge(ABC):

_data_cache: Dict[str, ChallengeData] = {}
CHALLENGE_LOCATION: str = ""
ARTIFACTS_LOCATION: str = "" # this is for suites
setup_dependencies: List[str] = [] # this is for suites
scores: dict[str, Any] = {} # this is for suites

@property
def data(self) -> ChallengeData:
file_path = f"{CURRENT_DIRECTORY}/../{self.CHALLENGE_LOCATION}/data.json"
if file_path not in Challenge._data_cache:
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
return Challenge._data_cache[file_path]
if self.CHALLENGE_LOCATION not in self._data_cache:
self._data_cache[self.CHALLENGE_LOCATION] = ChallengeData.deserialize(
self.CHALLENGE_LOCATION
)
return self._data_cache[self.CHALLENGE_LOCATION]

@property
def task(self) -> str:
@@ -35,16 +38,20 @@ def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent

copy_artifacts_into_workspace(
config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
config["workspace"], "artifacts_in", self.ARTIFACTS_LOCATION
)

run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff)
print(
f"\033[1;35m============Starting {self.data.name} challenge============\033[0m"
)

run_agent(self.task, config, self.ARTIFACTS_LOCATION, cutoff)

# hidden files are added after the agent runs. Hidden files can be python test files.
# We copy them in the workspace to make it easy to import the code produced by the agent

copy_artifacts_into_workspace(
config["workspace"], "custom_python", self.CHALLENGE_LOCATION
config["workspace"], "custom_python", self.ARTIFACTS_LOCATION
)

def test_method(self, config: Dict[str, Any]) -> None:
@@ -57,11 +64,11 @@ def open_file(workspace: str, filename: str) -> str:
with open(workspace_dir, "r") as f:
return f.read()

def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
script_dir = workspace
files_contents = []

for file_pattern in file_patterns:
for file_pattern in ground.files:
# Check if it is a file extension
if file_pattern.startswith("."):
# Find all files with the given extension in the workspace
@@ -71,7 +78,7 @@ def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
matching_files = [os.path.join(script_dir, file_pattern)]

for file_path in matching_files:
if self.data.ground.type == "execute_python_code":
if ground.type == "execute_python_code":
result = subprocess.run(
[sys.executable, file_path],
cwd=os.path.abspath(workspace),
@@ -104,38 +111,87 @@ def get_filenames_in_workspace(self, workspace: str) -> List[str]:
]

def scoring(self, content: str, ground: Ground) -> float:
print("Scoring content: ", content)
print("\033[1;34mScoring content:\033[0m", content)
if ground.should_contain:
for should_contain_word in ground.should_contain:
print_content = (
f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
)
if should_contain_word not in content:
print(f"Word that should exist - {should_contain_word}: False")
print(print_content, "False")
return 0.0
else:
print(f"Word that should exist - {should_contain_word}: True")
print(print_content, "True")

if ground.should_not_contain:
for should_not_contain_word in ground.should_not_contain:
print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:"
if should_not_contain_word in content:
print(
f"Word that should not exist - {should_not_contain_word}: False"
)
print(print_content, "False")
return 0.0
else:
print(print_content, "True")

return 1.0

def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
scores = []
scores_dict = {}
percentage = None

if isinstance(self.data.ground, Ground):
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground
)

for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("\033[1;32mYour score is:\033[0m", score)
scores.append(score)
elif isinstance(self.data.ground, dict):
# if it's a dict then we know its a combined suite
for ground_key in self.data.ground:
ground = self.data.ground[ground_key]
files_contents = self.get_artifacts_out(config["workspace"], ground)

for file_content in files_contents:
score = self.scoring(file_content, ground)
scores_dict[ground_key] = score
print(
f"Word that should not exist - {should_not_contain_word}: True"
f"\033[1;35mScore for {ground_key}:\033[0m",
scores_dict[ground_key],
)

return 1.0
# Count the number of times the value 1.0 appears in the dictionary
num_ones = sum(1 for score in scores_dict.values() if score == 1.0)

def get_scores(self, config: Dict[str, Any]) -> List[float]:
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
# Calculate the percentage
percentage = round((num_ones / len(scores_dict)) * 100, 2)

scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
# Print the result in green
print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")

# TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
# So for now we return 1.0 if there's any that pass
if percentage > 0:
scores.append(1.0)
if percentage != 100:
print(
"\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
)

scores_data = {
"values": scores,
"scores_obj": scores_dict,
"percentage": percentage,
}

self.scores[self.__class__.__name__] = scores_data

return scores_data

def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
if scores["scores_obj"][test_name] == 1:
return 1

return scores
return None
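
To make the suite arithmetic in get_scores concrete, here is a small worked example with hypothetical test names; it reproduces only the percentage calculation and the pass-if-any-pass behaviour noted in the TODO above:

```python
# Hypothetical per-test scores for one suite run.
scores_obj = {"TestWriteFile": 1.0, "TestReadFile": 1.0, "TestSearchFile": 0.0}

# Count the 1.0 scores and turn them into a percentage, as get_scores does.
num_ones = sum(1 for score in scores_obj.values() if score == 1.0)
percentage = round((num_ones / len(scores_obj)) * 100, 2)

print(percentage)      # 66.67
print(percentage > 0)  # True -> the suite still appends a 1.0 to `values`
```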
19 changes: 19 additions & 0 deletions agbenchmark/challenges/adapatability/a1_debug/data.json
@@ -0,0 +1,19 @@
{
"name": "TestAdaptSimpleTypoWithGuidance",
"category": ["adaptability"],
"task": "1- Run code.py.\n2- Read test.py.\n3- Modify test.py.\nRepeat step 1, 2 and 3 until code.py runs without errors.\n",
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"cutoff": 75,
"ground": {
"answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"info": {
"difficulty": "intermediate",
"description": "A version of the debug challenge where the wrong files to test and change are given. The llm must figure the challenges out",
"side_effects": []
}
}
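
The new data.json above is the kind of file that ChallengeData.deserialize (now imported from agbenchmark.challenges.data_types) loads. A rough pydantic sketch of models that would accept this shape, written from the fields visible in this diff rather than the real module, so names and optionality may differ:

```python
import json
from typing import List, Optional

from pydantic import BaseModel


class Ground(BaseModel):
    answer: str
    should_contain: Optional[List[str]] = None
    should_not_contain: Optional[List[str]] = None
    files: List[str]
    type: str


class Info(BaseModel):
    difficulty: str
    description: str
    side_effects: List[str]


class ChallengeData(BaseModel):
    name: str
    category: List[str]
    task: str
    dependencies: List[str]
    cutoff: int
    ground: Ground
    info: Info

    @classmethod
    def deserialize(cls, path: str) -> "ChallengeData":
        # Load the challenge definition from a data.json file.
        with open(path) as f:
            return cls(**json.load(f))


# Example (path taken from this commit):
# data = ChallengeData.deserialize(
#     "agbenchmark/challenges/adapatability/a1_debug/data.json"
# )
```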
