Significant-Gravitas · SilenNaihin · Jul 15, 2023 · Jul 15, 2023 · Jul 15, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
         if: success() || failure()
 
   tests:
-    name: ${{ matrix.agent-name }}
+    name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})"
     runs-on: ubuntu-latest
     timeout-minutes: 10
     env:
@@ -77,6 +77,8 @@ jobs:
           - "smol-developer"
           - "Auto-GPT"
           - "mini-agi"
+        cache-enabled: [ true, false ]
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -156,13 +158,14 @@ jobs:
           PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards.
           HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }}
           REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt
-          HELICONE_CACHE_ENABLED: true
+          HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }}
           HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
 
 
       - name: Upload reports
         if: always()
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.agent-name }}
+          name: >- # artifact names must not contain ':'
+            ${{ matrix.agent-name }} (Cache ${{ matrix.cache-enabled }})
           path: agent/${{ matrix.agent-name }}/agbenchmark
diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json
@@ -1,6 +1,6 @@
 {
   "name": "TestDebugSimpleTypoWithGuidance",
-  "category": ["code"],
+  "category": ["code", "iterate"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": ["TestReadFile", "TestWriteFile"],
   "ground": {

diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json
@@ -1,6 +1,6 @@
 {
   "name": "TestDebugSimpleTypoWithoutGuidance",
-  "category": ["code"],
+  "category": ["code", "iterate"],
   "task": "Make test.py run without errors.",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
   "ground": {

diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json
@@ -1,6 +1,6 @@
 {
   "name": "TestBasicCodeGeneration",
-  "category": ["code", "iterate"],
+  "category": ["code"],
   "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
   "dependencies": ["TestWriteFile"],
   "ground": {

diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5/artifacts_out/__init__.py
diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5/artifacts_out/code.py
@@ -0,0 +1,23 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def three_sum(nums: List[int], target: int) -> Optional[List[int]]:
+    """Return sorted indices of three elements that sum to target, or None."""
+    # Sort by value while carrying each element's original index along.
+    pairs = sorted((num, index) for index, num in enumerate(nums))
+    for i in range(len(pairs) - 2):
+        # Skip a repeated value: compare values only, since the (value, index)
+        # tuples themselves are never equal.
+        if i > 0 and pairs[i][0] == pairs[i - 1][0]:
+            continue
+        lo, hi = i + 1, len(pairs) - 1
+        while lo < hi:
+            total = pairs[i][0] + pairs[lo][0] + pairs[hi][0]
+            if total < target:
+                lo += 1
+            elif total > target:
+                hi -= 1
+            else:
+                return sorted([pairs[i][1], pairs[lo][1], pairs[hi][1]])
+    return None
diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5/custom_python/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import three_sum
+from typing import List
+
+
+def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None:
+    """Print three_sum(nums, target) and assert it equals expected_result."""
+    actual = three_sum(nums, target)
+    print(actual)
+    failure_message = f"AssertionError: Expected the output to be {expected_result}"
+    assert actual == expected_result, failure_message
+
+
+if __name__ == "__main__":
+    # Each case is (nums, target, expected sorted indices). Cases run in
+    # order, and the first failing assertion aborts the script.
+    cases = [
+        # trivial case: the answer is the first three numbers
+        ([2, 7, 11, 15], 20, [0, 1, 2]),
+
+        # zero as an element, and the same value at two different indices
+        ([2, 7, 0, 15, 12, 0], 2, [0, 2, 5]),
+
+        # first and last index usage, with a negative number
+        ([-6, 7, 11, 4], 9, [0, 2, 3]),
+    ]
+
+    # test_three_sum prints each result before asserting on it, so output
+    # appears even for a failing case.
+    for nums, target, expected_result in cases:
+        test_three_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json
@@ -0,0 +1,18 @@
+{
+  "name": "TestThreeSum",
+  "category": ["code", "iterate"],
+  "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
+  "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"],
+  "ground": {
+    "answer": "The three_sum function coded properly.",
+    "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "info": {
+    "difficulty": "intermediate",
+    "description": "Tests ability for the agent to create the three_sum function.",
+    "side_effects": []
+  }
+}
diff --git a/agent/gpt-engineer b/agent/gpt-engineer