add support for run-bug-run runbugrun #39 WIP #166

Merged · 25 commits · Mar 19, 2025
Changes from 1 commit
run bug run tests and prompts
cadddr authored and andre15silva committed Dec 28, 2024
commit 1b101dcab84c7fe873365e366a3f96f5b935f265
64 changes: 50 additions & 14 deletions elleelleaime/core/benchmarks/runbugrun/runbugrun.py
@@ -5,7 +5,7 @@

import subprocess
import logging

from tqdm import tqdm
import pandas as pd

class RunBugRun(Benchmark):
@@ -25,32 +25,68 @@ def initialize(self) -> None:
        logging.info("Initializing RunBugRun benchmark...")

        python_path = Path(self.get_path(), 'python_valid0.jsonl')
        test_path = Path(self.get_path(), 'tests_all.jsonl')

        python_df = pd.read_json(python_path, lines=True).set_index('problem_id')
        test_df = pd.read_json(test_path, lines=True).set_index('id')

        subprocess.run(
            f"mkdir -p {self.path}/buggy",
            shell=True,
            capture_output=True,
            check=True,
        )

        subprocess.run(
            f"mkdir -p {self.path}/fixed",
            shell=True,
            capture_output=True,
            check=True,
        )

        # WIP: limit to the first 10 unique buggy submissions
        buggy_submissions = python_df.drop_duplicates(subset=['buggy_submission_id']).head(10)

        for prob_id, (buggy_submission_id, buggy_code, fixed_submission_id, fixed_code) \
                in tqdm(
                    buggy_submissions[['buggy_submission_id', 'buggy_code', 'fixed_submission_id', 'fixed_code']].iterrows(),
                    total=len(buggy_submissions)
                ):

            buggy_file = Path(self.path, 'buggy', f'{prob_id}_{buggy_submission_id}.py')
            # use the buggy id for both files to keep the correspondence between them
            fixed_file = Path(self.path, 'fixed', f'{prob_id}_{buggy_submission_id}.py')

            with open(buggy_file, 'w') as f:
                f.write(buggy_code)
                f.write('\n')

            with open(fixed_file, 'w') as f:
                f.write(fixed_code)
                f.write('\n')

            # The buggy/fixed files were already written above, so we only diff them here
            run = subprocess.run(
                f"cd {self.get_path()} && "
                f"diff --unified {fixed_file.relative_to(self.path)} {buggy_file.relative_to(self.path)}",
                shell=True,
                capture_output=True,
            )
            # diff exits with 1 when the files differ; only codes > 1 signal an error
            if run.returncode > 1:
                print(run)

            diff = PatchSet(run.stdout.decode("utf-8"))
            # Change the source file path to point to the buggy version
            diff[0].source_file = f"{buggy_file.relative_to(self.path)}"

            failing_tests = {}

            for test_id, (test_input, test_output) in test_df[test_df.problem_id == prob_id][['input', 'output']].iterrows():
                error_code, result = RunBugRunBug.execute_test_case(buggy_file, test_input)

                if error_code:
                    cause = f"""Function with input {test_input.replace('"', "'")} failed with error: {result}"""
                elif result != test_output.strip():
                    cause = f"""Expected function with input {test_input.replace('"', "'")} to output {test_output.replace('"', "'").replace("'", r"\'")} but got {result}"""
                else:
                    continue  # skip passing test cases

                failing_tests[f"{test_input} -> {test_output}"] = cause

            self.add_bug(RunBugRunBug(self, f"{prob_id}_{buggy_submission_id}", str(diff), failing_tests))
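For readers skimming the hunk above: `initialize` writes each buggy/fixed pair to disk, shells out to `diff --unified`, and parses the output with `unidiff` to obtain the ground-truth patch. Below is a minimal standalone sketch of that step; the working directory, ids, and file contents are hypothetical.

```python
import subprocess
from pathlib import Path

from unidiff import PatchSet

workdir = Path("/tmp/rbr_demo")
(workdir / "buggy").mkdir(parents=True, exist_ok=True)
(workdir / "fixed").mkdir(parents=True, exist_ok=True)

# Hypothetical problem/submission pair
buggy_file = workdir / "buggy" / "p00001_s1.py"
fixed_file = workdir / "fixed" / "p00001_s1.py"
buggy_file.write_text("print(int(input()) + 1)\n")
fixed_file.write_text("print(int(input()) + 2)\n")

# diff exits with 1 when the files differ, so check=True must not be used
run = subprocess.run(
    ["diff", "--unified",
     str(fixed_file.relative_to(workdir)),
     str(buggy_file.relative_to(workdir))],
    cwd=workdir,
    capture_output=True,
)

diff = PatchSet(run.stdout.decode("utf-8"))
# Point the source side at the buggy file, as the benchmark code does
diff[0].source_file = str(buggy_file.relative_to(workdir))
print(str(diff))
```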

78 changes: 69 additions & 9 deletions elleelleaime/core/benchmarks/runbugrun/runbugrunbug.py
@@ -1,25 +1,85 @@
import subprocess
import shutil
from pathlib import Path

from elleelleaime.core.benchmarks.benchmark import Benchmark
from elleelleaime.core.benchmarks.bug import RichBug
from elleelleaime.core.benchmarks.test_result import TestResult
from elleelleaime.core.benchmarks.compile_result import CompileResult

class RunBugRunBug(RichBug):
"""
The class for representing RunBugRun bugs
"""

def __init__(self, benchmark: Benchmark, bid: str, ground_truth: str) -> None:
super().__init__(benchmark, bid, ground_truth, True)

    def checkout(self, path: str, fixed: bool = False) -> bool:
        # Remove the directory if it exists
        shutil.rmtree(path, ignore_errors=True)
        # Make the directory
        subprocess.run(
            f"mkdir -p {path}",
            shell=True,
            capture_output=True,
            check=True,
        )

        # Checking out a bug amounts to copying its source file from the benchmark
        cmd = f"cd {self.benchmark.get_path()}; cp {'fixed' if fixed else 'buggy'}/{self.identifier}.py {path}"
        run = subprocess.run(cmd, shell=True, capture_output=True, check=True)

        # Copy test files
        # cmd = f"cd {self.benchmark.get_path()}; mkdir -p {path}/java_testcases/junit; cp java_testcases/junit/{self.identifier}_TEST.java {path}/java_testcases/junit; cp java_testcases/junit/QuixFixOracleHelper.java {path}/java_testcases/junit"
        # run = subprocess.run(cmd, shell=True, capture_output=True, check=True)
        return run.returncode == 0

    def compile(self, path: str) -> CompileResult:
        file_path = Path(path, f"{self.get_identifier()}.py")
        assert file_path.exists()

        with open(file_path) as f:
            bug_code = f.read()
        assert bug_code

        try:
            # Python "compilation" is a syntax check via the builtin compile()
            compile(bug_code, file_path, 'exec')
            return CompileResult(True)
        except (SyntaxError, ValueError):
            return CompileResult(False)

    def test(self, path: str) -> TestResult:
        file_path = Path(path, f"{self.get_identifier()}.py")
        assert file_path.exists()

        for test_case in self.failing_tests:
            test_input, test_output = test_case.split(' -> ')

            error_code, result = RunBugRunBug.execute_test_case(file_path, test_input)
            if error_code:
                return TestResult(False)
            elif result != test_output.strip():
                return TestResult(False)

        return TestResult(True)

    @staticmethod
    def execute_test_case(code_path, test_input):
        if test_input.strip():
            cmd = f"""echo "{test_input}" | python {code_path}"""
        else:
            cmd = f"python {code_path}"

        run = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            check=False,
        )

        return run.returncode, (
            run.stderr.decode("utf-8").strip()
            if run.returncode
            else run.stdout.decode("utf-8").strip()
        )

    def get_src_test_dir(self, path: str) -> str:
        pass
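To make the test protocol concrete: a RunBugRun test case is a stdin/stdout pair, and `execute_test_case` pipes the input into the candidate program and compares what it prints. The sketch below reproduces the same behavior with `input=` instead of an `echo ... |` shell pipeline, which sidesteps quoting issues when inputs contain quotes; the script path and values are hypothetical.

```python
import subprocess
from pathlib import Path

def run_case(code_path: Path, test_input: str) -> tuple[int, str]:
    # Feed the test input via stdin rather than `echo "..." | python ...`
    run = subprocess.run(
        ["python", str(code_path)],
        input=test_input.encode(),
        capture_output=True,
        check=False,
    )
    stream = run.stderr if run.returncode else run.stdout
    return run.returncode, stream.decode("utf-8").strip()

# Hypothetical candidate program that doubles its input
script = Path("/tmp/double.py")
script.write_text("print(int(input()) * 2)\n")
assert run_case(script, "21") == (0, "42")
```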

2 changes: 2 additions & 0 deletions elleelleaime/core/utils/benchmarks.py
@@ -3,6 +3,7 @@
from elleelleaime.core.benchmarks.humanevaljava.humanevaljava import HumanEvalJava
from elleelleaime.core.benchmarks.quixbugs.quixbugs import QuixBugs
from elleelleaime.core.benchmarks.gitbugjava.gitbugjava import GitBugJava
from elleelleaime.core.benchmarks.runbugrun.runbugrun import RunBugRun

from typing import Optional

@@ -11,6 +12,7 @@
    "HumanEvalJava": HumanEvalJava,
    "QuixBugs": QuixBugs,
    "GitBugJava": GitBugJava,
    "RunBugRun": RunBugRun,
}


Empty file.
75 changes: 75 additions & 0 deletions elleelleaime/core/utils/python/python.py
@@ -0,0 +1,75 @@
from typing import Optional, Tuple
from uuid import uuid4
from pathlib import Path
import getpass, tempfile, shutil
import ast

from elleelleaime.core.benchmarks.bug import Bug


def extract_functions(source_code):
    # Parse the source code into an AST
    tree = ast.parse(source_code)

    # Extract all top-level function definitions
    functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)]

    # Convert the function nodes back to source code
    function_sources = [ast.get_source_segment(source_code, func) for func in functions]

    return function_sources


def extract_single_function(bug: Bug) -> Optional[Tuple[str, str]]:
    """
    Extracts the buggy and fixed code of single-function bugs.
    Returns None if the bug is not single-function.

    Args:
        bug (Bug): The bug to extract the code from

    Returns:
        Optional[Tuple[str, str]]: None if the bug is not single-function, otherwise a tuple of the form (buggy_code, fixed_code)
    """
    buggy_path = Path(
        tempfile.gettempdir(),
        f"elleelleaime-{getpass.getuser()}",
        bug.get_identifier(),
        str(uuid4()),
    )
    fixed_path = Path(
        tempfile.gettempdir(),
        f"elleelleaime-{getpass.getuser()}",
        bug.get_identifier(),
        str(uuid4()),
    )

    try:
        # Checkout the buggy and fixed versions of the bug
        bug.checkout(str(buggy_path), fixed=False)
        bug.checkout(str(fixed_path), fixed=True)

        with open(Path(buggy_path, f"{bug.get_identifier()}.py")) as f:
            buggy_code = f.read()

        with open(Path(fixed_path, f"{bug.get_identifier()}.py")) as f:
            fixed_code = f.read()

        buggy_functions = extract_functions(buggy_code)
        fixed_functions = extract_functions(fixed_code)

        assert len(buggy_functions) == len(fixed_functions)

        # if len(buggy_functions) == len(fixed_functions) == 1:
        #     return buggy_functions[0], fixed_functions[0]

        # most RunBugRun programs are straight-through scripts, not functions,
        # so return the whole files rather than a single function
        return buggy_code, fixed_code

    finally:
        # Remove the checked-out bugs
        shutil.rmtree(buggy_path, ignore_errors=True)
        shutil.rmtree(fixed_path, ignore_errors=True)
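A quick illustration of what `extract_functions` returns; the input module here is invented for the example.

```python
import ast

source = """def add(a, b):
    return a + b

x = 1

def double(n):
    return n * 2
"""

tree = ast.parse(source)
functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)]
print([ast.get_source_segment(source, f) for f in functions])
# ['def add(a, b):\n    return a + b', 'def double(n):\n    return n * 2']
```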
2 changes: 2 additions & 0 deletions elleelleaime/sample/registry.py
@@ -1,6 +1,7 @@
from .strategy import PromptingStrategy
from .strategies.infilling import InfillingPrompting
from .strategies.instruct import InstructPrompting
from .strategies.instruct_python import InstructPromptingPython


class PromptStrategyRegistry:
@@ -11,6 +12,7 @@ class PromptStrategyRegistry:
    __STRATEGIES: dict[str, type] = {
        "infilling": InfillingPrompting,
        "instruct": InstructPrompting,
        "instruct_python": InstructPromptingPython,
    }

    @classmethod
93 changes: 93 additions & 0 deletions elleelleaime/sample/strategies/instruct_python.py
@@ -0,0 +1,93 @@
from typing import Optional, Tuple
from unidiff import PatchSet

from elleelleaime.sample.strategy import PromptingStrategy
from elleelleaime.core.benchmarks.bug import RichBug
from elleelleaime.core.utils.python.python import (
    extract_single_function,
    # extract_failing_test_cases,
)


class InstructPromptingPython(PromptingStrategy):
    """
    Implements instruction prompting strategies.
    """

    def __init__(self, **kwargs):
        super().__init__("instruct_python")

    def instruct(
        self, bug: RichBug
    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """
        Builds an instruction prompt for the given bug.

        Args:
            bug: The bug to generate the prompt for.
        Returns:
            Tuple: A tuple of the form (buggy_code, fixed_code, prompt).
        """
        result = extract_single_function(bug)
        if result is None:
            return None, None, None

        buggy_code, fixed_code = result

        failing_test_causes = bug.get_failing_tests()

        failing_tests_string = ""
        for test_case, cause in failing_test_causes.items():
            failing_tests_string += f"""Test `{test_case}`:
```python
assert result == {test_case.split(' -> ')[-1]}
```
Test `{test_case}` error:
```
{cause}
```

"""

        prompt = f"""You are an automatic program repair tool. Your task is to fix the provided buggy code.

The following code contains a buggy function:
```python
{buggy_code}
```

The code fails the following tests.

{failing_tests_string}
Please provide a fixed version of the buggy function, and only that function, inside a code block.
"""

        return buggy_code, fixed_code, prompt

    def prompt(self, bug: RichBug) -> dict[str, Optional[str]]:
        """
        Returns the prompt for the given bug.

        :param bug: The bug to generate the prompt for.
        """
        result = {
            "identifier": bug.get_identifier(),
            "buggy_code": None,
            "fixed_code": None,
            "prompt_strategy": self.strategy_name,
            "prompt": None,
            "ground_truth": bug.get_ground_truth(),
        }

        diff = PatchSet(bug.get_ground_truth())

        # This strategy only supports single-file prompts
        if len(diff) != 1:
            return result

        (
            result["buggy_code"],
            result["fixed_code"],
            result["prompt"],
        ) = self.instruct(bug)
        return result
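For a sense of what the failing-tests loop above emits: given one hypothetical failing case `3 -> 6` whose buggy run printed `5`, the rendered block can be reproduced as below (string concatenation stands in for the triple-quoted f-string so the embedded fence markers stay readable; the cause string is invented, mirroring the dict built in runbugrun.py).

```python
causes = {"3 -> 6": "Expected function with input 3 to output 6 but got 5"}

failing_tests_string = ""
for test_case, cause in causes.items():
    expected = test_case.split(' -> ')[-1]
    failing_tests_string += (
        f"Test `{test_case}`:\n"
        "```python\n"
        f"assert result == {expected}\n"
        "```\n"
        f"Test `{test_case}` error:\n"
        "```\n"
        f"{cause}\n"
        "```\n\n"
    )
print(failing_tests_string)
```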