add support for run-bug-run runbugrun #39 WIP #166

Merged · 25 commits · Mar 19, 2025
Changes from 1 commit
run bug run tests and prompts
cadddr authored and andre15silva committed Dec 28, 2024
commit 1b101dcab84c7fe873365e366a3f96f5b935f265
64 changes: 50 additions & 14 deletions elleelleaime/core/benchmarks/runbugrun/runbugrun.py
@@ -5,7 +5,7 @@

import subprocess
import logging

from tqdm import tqdm
import pandas as pd

class RunBugRun(Benchmark):
@@ -25,32 +25,68 @@ def initialize(self) -> None:
        logging.info("Initializing RunBugRun benchmark...")

        python_path = Path(self.get_path(), 'python_valid0.jsonl')
        test_path = Path(self.get_path(), 'tests_all.jsonl')

        python_df = pd.read_json(python_path, lines=True).set_index('problem_id')
        test_df = pd.read_json(test_path, lines=True).set_index('id')

        subprocess.run(
            f"mkdir -p {self.path}/buggy",
            shell=True,
            capture_output=True,
            check=True,
        )

        subprocess.run(
            f"mkdir -p {self.path}/fixed",
            shell=True,
            capture_output=True,
            check=True,
        )

        # WIP: limit to the first 10 unique buggy submissions
        buggy_submissions = python_df.drop_duplicates(subset=['buggy_submission_id']).head(10)

        for prob_id, (buggy_submission_id, buggy_code, fixed_submission_id, fixed_code) \
                in tqdm(
                    buggy_submissions[['buggy_submission_id', 'buggy_code', 'fixed_submission_id', 'fixed_code']].iterrows(),
                    total=len(buggy_submissions)
                ):

            buggy_file = Path(self.path, 'buggy', f'{prob_id}_{buggy_submission_id}.py')
            # use the buggy id for both files to keep the correspondence between them
            fixed_file = Path(self.path, 'fixed', f'{prob_id}_{buggy_submission_id}.py')

            with open(buggy_file, 'w') as f:
                f.write(buggy_code)
                f.write('\n')

            with open(fixed_file, 'w') as f:
                f.write(fixed_code)
                f.write('\n')

            # The buggy/fixed files were already written above, so we only diff them here
            run = subprocess.run(
                f"cd {self.get_path()} && "
                f"diff --unified {fixed_file.relative_to(self.path)} {buggy_file.relative_to(self.path)}",
                shell=True,
                capture_output=True,
            )
            # diff exits with 1 when the files differ; only codes > 1 signal an error
            if run.returncode > 1:
                print(run)

            diff = PatchSet(run.stdout.decode("utf-8"))
            # Change the source file path to point to the buggy version
            diff[0].source_file = f"{buggy_file.relative_to(self.path)}"

            failing_tests = {}

            for test_id, (test_input, test_output) in test_df[test_df.problem_id == prob_id][['input', 'output']].iterrows():
                error_code, result = RunBugRunBug.execute_test_case(buggy_file, test_input)

                if error_code:
                    cause = f"""Function with input {test_input.replace('"', "'")} failed with error: {result}"""
                elif result != test_output.strip():
                    cause = f"""Expected function with input {test_input.replace('"', "'")} to output {test_output.replace('"', "'").replace("'", r"\'")} but got {result}"""
                else:
                    continue  # skip passing test cases

                failing_tests[f"{test_input} -> {test_output}"] = cause

            self.add_bug(RunBugRunBug(self, f"{prob_id}_{buggy_submission_id}", str(diff), failing_tests))
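For readers skimming the hunk above: `initialize` writes each buggy/fixed pair to disk, shells out to `diff --unified`, and parses the output with `unidiff` to obtain the ground-truth patch. Below is a minimal standalone sketch of that step; the working directory, ids, and file contents are hypothetical.

```python
import subprocess
from pathlib import Path

from unidiff import PatchSet

workdir = Path("/tmp/rbr_demo")
(workdir / "buggy").mkdir(parents=True, exist_ok=True)
(workdir / "fixed").mkdir(parents=True, exist_ok=True)

# Hypothetical problem/submission pair
buggy_file = workdir / "buggy" / "p00001_s1.py"
fixed_file = workdir / "fixed" / "p00001_s1.py"
buggy_file.write_text("print(int(input()) + 1)\n")
fixed_file.write_text("print(int(input()) + 2)\n")

# diff exits with 1 when the files differ, so check=True must not be used
run = subprocess.run(
    ["diff", "--unified",
     str(fixed_file.relative_to(workdir)),
     str(buggy_file.relative_to(workdir))],
    cwd=workdir,
    capture_output=True,
)

diff = PatchSet(run.stdout.decode("utf-8"))
# Point the source side at the buggy file, as the benchmark code does
diff[0].source_file = str(buggy_file.relative_to(workdir))
print(str(diff))
```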

78 changes: 69 additions & 9 deletions elleelleaime/core/benchmarks/runbugrun/runbugrunbug.py
@@ -1,25 +1,85 @@
import subprocess
import shutil
from pathlib import Path

from elleelleaime.core.benchmarks.benchmark import Benchmark
from elleelleaime.core.benchmarks.bug import RichBug
from elleelleaime.core.benchmarks.test_result import TestResult
from elleelleaime.core.benchmarks.compile_result import CompileResult

class RunBugRunBug(RichBug):
"""
The class for representing RunBugRun bugs
"""

def __init__(self, benchmark: Benchmark, bid: str, ground_truth: str) -> None:
super().__init__(benchmark, bid, ground_truth, True)

    def checkout(self, path: str, fixed: bool = False) -> bool:
        # Remove the directory if it exists
        shutil.rmtree(path, ignore_errors=True)
        # Make the directory
        subprocess.run(
            f"mkdir -p {path}",
            shell=True,
            capture_output=True,
            check=True,
        )

        # Checking out a bug amounts to copying its source file from the benchmark
        cmd = f"cd {self.benchmark.get_path()}; cp {'fixed' if fixed else 'buggy'}/{self.identifier}.py {path}"
        run = subprocess.run(cmd, shell=True, capture_output=True, check=True)

        # Copy test files
        # cmd = f"cd {self.benchmark.get_path()}; mkdir -p {path}/java_testcases/junit; cp java_testcases/junit/{self.identifier}_TEST.java {path}/java_testcases/junit; cp java_testcases/junit/QuixFixOracleHelper.java {path}/java_testcases/junit"
        # run = subprocess.run(cmd, shell=True, capture_output=True, check=True)
        return run.returncode == 0

    def compile(self, path: str) -> CompileResult:
        file_path = Path(path, f"{self.get_identifier()}.py")
        assert file_path.exists()

        with open(file_path) as f:
            bug_code = f.read()
        assert bug_code

        try:
            # Python "compilation" is a syntax check via the builtin compile()
            compile(bug_code, file_path, 'exec')
            return CompileResult(True)
        except (SyntaxError, ValueError):
            return CompileResult(False)

    def test(self, path: str) -> TestResult:
        file_path = Path(path, f"{self.get_identifier()}.py")
        assert file_path.exists()

        for test_case in self.failing_tests:
            test_input, test_output = test_case.split(' -> ')

            error_code, result = RunBugRunBug.execute_test_case(file_path, test_input)
            if error_code:
                return TestResult(False)
            elif result != test_output.strip():
                return TestResult(False)

        return TestResult(True)

    @staticmethod
    def execute_test_case(code_path, test_input):
        if test_input.strip():
            cmd = f"""echo "{test_input}" | python {code_path}"""
        else:
            cmd = f"python {code_path}"

        run = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            check=False,
        )

        return run.returncode, (
            run.stderr.decode("utf-8").strip()
            if run.returncode
            else run.stdout.decode("utf-8").strip()
        )

    def get_src_test_dir(self, path: str) -> str:
        pass
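To make the test protocol concrete: a RunBugRun test case is a stdin/stdout pair, and `execute_test_case` pipes the input into the candidate program and compares what it prints. The sketch below reproduces the same behavior with `input=` instead of an `echo ... |` shell pipeline, which sidesteps quoting issues when inputs contain quotes; the script path and values are hypothetical.

```python
import subprocess
from pathlib import Path

def run_case(code_path: Path, test_input: str) -> tuple[int, str]:
    # Feed the test input via stdin rather than `echo "..." | python ...`
    run = subprocess.run(
        ["python", str(code_path)],
        input=test_input.encode(),
        capture_output=True,
        check=False,
    )
    stream = run.stderr if run.returncode else run.stdout
    return run.returncode, stream.decode("utf-8").strip()

# Hypothetical candidate program that doubles its input
script = Path("/tmp/double.py")
script.write_text("print(int(input()) * 2)\n")
assert run_case(script, "21") == (0, "42")
```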

2 changes: 2 additions & 0 deletions elleelleaime/core/utils/benchmarks.py
@@ -3,6 +3,7 @@
from elleelleaime.core.benchmarks.humanevaljava.humanevaljava import HumanEvalJava
from elleelleaime.core.benchmarks.quixbugs.quixbugs import QuixBugs
from elleelleaime.core.benchmarks.gitbugjava.gitbugjava import GitBugJava
from elleelleaime.core.benchmarks.runbugrun.runbugrun import RunBugRun

from typing import Optional

@@ -11,6 +12,7 @@
    "HumanEvalJava": HumanEvalJava,
    "QuixBugs": QuixBugs,
    "GitBugJava": GitBugJava,
    "RunBugRun": RunBugRun,
}


Empty file.
75 changes: 75 additions & 0 deletions elleelleaime/core/utils/python/python.py
@@ -0,0 +1,75 @@
from typing import Optional, Tuple
from uuid import uuid4
from pathlib import Path
import getpass, tempfile, shutil
import ast

from elleelleaime.core.benchmarks.bug import Bug


def extract_functions(source_code):
    # Parse the source code into an AST
    tree = ast.parse(source_code)

    # Extract all top-level function definitions
    functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)]

    # Convert the function nodes back to source code
    function_sources = [ast.get_source_segment(source_code, func) for func in functions]

    return function_sources


def extract_single_function(bug: Bug) -> Optional[Tuple[str, str]]:
    """
    Extracts the buggy and fixed code of single-function bugs.
    Returns None if the bug is not single-function.

    Args:
        bug (Bug): The bug to extract the code from

    Returns:
        Optional[Tuple[str, str]]: None if the bug is not single-function, otherwise a tuple of the form (buggy_code, fixed_code)
    """
    buggy_path = Path(
        tempfile.gettempdir(),
        f"elleelleaime-{getpass.getuser()}",
        bug.get_identifier(),
        str(uuid4()),
    )
    fixed_path = Path(
        tempfile.gettempdir(),
        f"elleelleaime-{getpass.getuser()}",
        bug.get_identifier(),
        str(uuid4()),
    )

    try:
        # Checkout the buggy and fixed versions of the bug
        bug.checkout(str(buggy_path), fixed=False)
        bug.checkout(str(fixed_path), fixed=True)

        with open(Path(buggy_path, f"{bug.get_identifier()}.py")) as f:
            buggy_code = f.read()

        with open(Path(fixed_path, f"{bug.get_identifier()}.py")) as f:
            fixed_code = f.read()

        buggy_functions = extract_functions(buggy_code)
        fixed_functions = extract_functions(fixed_code)

        assert len(buggy_functions) == len(fixed_functions)

        # if len(buggy_functions) == len(fixed_functions) == 1:
        #     return buggy_functions[0], fixed_functions[0]

        # most RunBugRun programs are straight-through scripts, not functions,
        # so return the whole files rather than a single function
        return buggy_code, fixed_code

    finally:
        # Remove the checked-out bugs
        shutil.rmtree(buggy_path, ignore_errors=True)
        shutil.rmtree(fixed_path, ignore_errors=True)
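A quick illustration of what `extract_functions` returns; the input module here is invented for the example.

```python
import ast

source = """def add(a, b):
    return a + b

x = 1

def double(n):
    return n * 2
"""

tree = ast.parse(source)
functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)]
print([ast.get_source_segment(source, f) for f in functions])
# ['def add(a, b):\n    return a + b', 'def double(n):\n    return n * 2']
```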
2 changes: 2 additions & 0 deletions elleelleaime/sample/registry.py
@@ -1,6 +1,7 @@
from .strategy import PromptingStrategy
from .strategies.infilling import InfillingPrompting
from .strategies.instruct import InstructPrompting
from .strategies.instruct_python import InstructPromptingPython


class PromptStrategyRegistry:
@@ -11,6 +12,7 @@ class PromptStrategyRegistry:
    __STRATEGIES: dict[str, type] = {
        "infilling": InfillingPrompting,
        "instruct": InstructPrompting,
        "instruct_python": InstructPromptingPython,
    }

    @classmethod
93 changes: 93 additions & 0 deletions elleelleaime/sample/strategies/instruct_python.py
@@ -0,0 +1,93 @@
from typing import Optional, Tuple
from unidiff import PatchSet

from elleelleaime.sample.strategy import PromptingStrategy
from elleelleaime.core.benchmarks.bug import RichBug
from elleelleaime.core.utils.python.python import (
    extract_single_function,
    # extract_failing_test_cases,
)


class InstructPromptingPython(PromptingStrategy):
    """
    Implements instruction prompting strategies.
    """

    def __init__(self, **kwargs):
        super().__init__("instruct_python")

    def instruct(
        self, bug: RichBug
    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """
        Builds an instruction prompt for the given bug.

        Args:
            bug: The bug to generate the prompt for.
        Returns:
            Tuple: A tuple of the form (buggy_code, fixed_code, prompt).
        """
        result = extract_single_function(bug)
        if result is None:
            return None, None, None

        buggy_code, fixed_code = result

        failing_test_causes = bug.get_failing_tests()

        failing_tests_string = ""
        for test_case, cause in failing_test_causes.items():
            failing_tests_string += f"""Test `{test_case}`:
```python
assert result == {test_case.split(' -> ')[-1]}
```
Test `{test_case}` error:
```
{cause}
```

"""

        prompt = f"""You are an automatic program repair tool. Your task is to fix the provided buggy code.

The following code contains a buggy function:
```python
{buggy_code}
```

The code fails the following tests.

{failing_tests_string}
Please provide a fixed version of the buggy function, and only that function, inside a code block.
"""

        return buggy_code, fixed_code, prompt

    def prompt(self, bug: RichBug) -> dict[str, Optional[str]]:
        """
        Returns the prompt for the given bug.

        :param bug: The bug to generate the prompt for.
        """
        result = {
            "identifier": bug.get_identifier(),
            "buggy_code": None,
            "fixed_code": None,
            "prompt_strategy": self.strategy_name,
            "prompt": None,
            "ground_truth": bug.get_ground_truth(),
        }

        diff = PatchSet(bug.get_ground_truth())

        # This strategy only supports single-file prompts
        if len(diff) != 1:
            return result

        (
            result["buggy_code"],
            result["fixed_code"],
            result["prompt"],
        ) = self.instruct(bug)
        return result
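For a sense of what the failing-tests loop above emits: given one hypothetical failing case `3 -> 6` whose buggy run printed `5`, the rendered block can be reproduced as below (string concatenation stands in for the triple-quoted f-string so the embedded fence markers stay readable; the cause string is invented, mirroring the dict built in runbugrun.py).

```python
causes = {"3 -> 6": "Expected function with input 3 to output 6 but got 5"}

failing_tests_string = ""
for test_case, cause in causes.items():
    expected = test_case.split(' -> ')[-1]
    failing_tests_string += (
        f"Test `{test_case}`:\n"
        "```python\n"
        f"assert result == {expected}\n"
        "```\n"
        f"Test `{test_case}` error:\n"
        "```\n"
        f"{cause}\n"
        "```\n\n"
    )
print(failing_tests_string)
```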