-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathcreate_dataset.py
More file actions
114 lines (89 loc) · 3.83 KB
/
create_dataset.py
File metadata and controls
114 lines (89 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
import json
import csv
from pathlib import Path
def build_problem_statement(sample):
    """Build a test-driven problem statement from a sample's failing tests.

    The run to inspect is selected by the sample's ``strategy``: index 0 of
    ``actions_runs`` when it equals ``"FAIL_PASS"``, index 1 otherwise. Every
    test in the first entry of that run whose first result is ``"Failure"``
    is listed with its class name, test name, error type and message.

    Args:
        sample: Mapping describing one bug. The ``strategy``,
            ``actions_runs`` and (optionally) ``project_name`` keys are read.

    Returns:
        The problem statement as a single string.

    Raises:
        ValueError: If the selected run is missing or empty, if it contains
            no failing tests, or if any other error occurs while reading the
            sample. The message is prefixed with
            ``"Error building problem statement: "`` and the underlying
            exception is attached as ``__cause__``.
    """
    try:
        run_index = 0 if sample.get("strategy") == "FAIL_PASS" else 1
        actions_runs = sample.get("actions_runs")
        # Check the length first so a missing run is reported with a
        # meaningful message instead of "list index out of range".
        if (
            not actions_runs
            or len(actions_runs) <= run_index
            or not actions_runs[run_index]
        ):
            raise ValueError("No test results found in the sample data")

        failing_tests = [
            test
            for test in actions_runs[run_index][0]["tests"]
            if test["results"][0]["result"] == "Failure"
        ]
        if not failing_tests:
            raise ValueError("No failing tests found in the sample data")

        # Collect the pieces and join once rather than growing a string.
        parts = []
        if project_name := sample.get("project_name"):
            parts.append(f"Project: {project_name}\n\n")
        parts.append(
            "The test suite has uncovered a bug in the code. Below are the failing tests "
            "that pinpoint the incorrect behavior. Each test failure provides clues about "
            "what's wrong and how the code should behave.\n\n"
            "Current test failures:\n"
        )

        # One three-line entry per failing test, with placeholders for any
        # field the test record does not carry.
        for test in failing_tests:
            test_name = test.get("name", "Unknown test")
            test_class = test.get("classname", "Unknown class")
            test_result = test.get("results", [{}])[0]
            error_type = test_result.get("type", "Unknown error")
            error_msg = test_result.get("message", "No error message provided")
            parts.append(
                f"\nTest: {test_class}#{test_name}\n"
                f"Type: {error_type}\n"
                f"Message: {error_msg}\n"
            )

        # Closing instruction: the tests double as the verification tool.
        parts.append(
            "\nFix the bug in the code to make the failing tests pass. The tests act as both a bug report "
            "and a verification tool."
        )
        return "".join(parts)
    except Exception as e:
        # Callers only ever see ValueError; chain explicitly so the original
        # failure stays visible in the traceback.
        raise ValueError(f"Error building problem statement: {e}") from e
# Build dataset.csv from the bug files under data/bugs. Both locations are
# resolved relative to the parent of this script's directory.
bugs_dir = Path(__file__).parent.parent / "data" / "bugs"
output_file = Path(__file__).parent.parent / "dataset.csv"

# Rows destined for the CSV: one dict per bug that is not ignored.
entries = []

# Keys of bugs to leave out, read from ignored.txt next to this script.
# Each non-blank line is compared against "<file stem>-<commit hash prefix>".
ignored_path = Path(__file__).parent / "ignored.txt"
with open(ignored_path, "r", encoding="utf-8") as f:
    ignored_bugs = {line.strip() for line in f if line.strip()}

# Each *.json file is read as one JSON document per line, and the file stem
# becomes the id prefix. Sorting keeps the row order independent of the
# filesystem's listing order so repeated runs produce the same CSV.
for json_file in sorted(bugs_dir.glob("*.json")):
    pid = json_file.stem
    # JSON text is UTF-8; do not rely on the platform default encoding.
    with json_file.open(encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A stray blank line is not a record; json.loads would fail on it.
                continue
            bug_info = json.loads(line)
            bid = bug_info["commit_hash"][:12]
            bug_key = f"{pid}-{bid}"
            # The ignore list is matched against the key before lowercasing.
            if bug_key in ignored_bugs:
                print(f"Ignoring {bug_key}")
                continue
            instance_id = bug_key.lower()
            image_tag = f"gitbugjava.eval.x86_64.{instance_id}:msbench-0.1.0"
            entries.append(
                {
                    "instance_id": instance_id,
                    "problem_statement": build_problem_statement(bug_info),
                    "image_tag": image_tag,
                    "bug_patch": bug_info["bug_patch"],
                }
            )

# newline="" lets the csv module control line endings; UTF-8 keeps any
# non-ASCII characters in the patch text writable on every platform.
with output_file.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f, fieldnames=["instance_id", "problem_statement", "image_tag", "bug_patch"]
    )
    writer.writeheader()
    writer.writerows(entries)
print(f"Created dataset with {len(entries)} entries at {output_file}")