Skip to content

Commit e6cdf18

Browse files
authored
[Evaluation] Log empty patch stats for SWE-Bench (OpenHands#2776)
* bump swebench version since the fix PR is merged * add empty generation stats from latest PR * delete eval_outputs if it already exists * handle non-string patch
1 parent 0b8d357 commit e6cdf18

3 files changed

Lines changed: 26 additions & 1 deletion

File tree

evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515

1616

1717
def process_git_patch(patch):
18+
if not isinstance(patch, str):
19+
return ''
20+
1821
if not patch.strip():
1922
# skip empty patches
2023
return ''

evaluation/swe_bench/scripts/eval/update_output_with_eval.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
df = pd.read_json(args.input_file, lines=True)
1616

1717
output_md_filepath = os.path.join(dirname, 'README.md')
18-
instance_id_to_status = defaultdict(lambda: {'resolved': False})
18+
instance_id_to_status = defaultdict(
19+
lambda: {'resolved': False, 'empty_generation': False}
20+
)
1921
if os.path.exists(report_json):
2022
with open(report_json, 'r') as f:
2123
report = json.load(f)
@@ -25,7 +27,9 @@
2527
"This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
2628
"## Summary\n"
2729
f"- total instances: {report['total_instances']}\n"
30+
f"- submitted instances: {report['submitted_instances']}\n"
2831
f"- completed instances: {report['completed_instances']}\n"
32+
f"- empty patch instances: {report['empty_patch_instances']}\n"
2933
f"- resolved instances: {report['resolved_instances']}\n"
3034
f"- unresolved instances: {report['unresolved_instances']}\n"
3135
f"- error instances: {report['error_instances']}\n"
@@ -53,6 +57,19 @@
5357
f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
5458
)
5559

60+
output_md += '\n## Empty Patch Instances\n'
61+
for instance_id in report['empty_patch_ids']:
62+
instance_id_to_status[instance_id]['empty_generation'] = True
63+
output_md += (
64+
f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
65+
)
66+
67+
output_md += '\n## Incomplete Instances\n'
68+
for instance_id in report['incomplete_ids']:
69+
output_md += (
70+
f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
71+
)
72+
5673
# Apply the status to the dataframe
5774
def apply_report(row):
5875
instance_id = row['instance_id']

evaluation/swe_bench/scripts/eval_infer.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ if [ -z "$INSTANCE_ID" ]; then
9797

9898
# move the eval results to the target directory
9999
mkdir -p $RESULT_OUTPUT_DIR
100+
# rm eval_outputs directory if it exists
101+
if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then
102+
rm -rf $RESULT_OUTPUT_DIR/eval_outputs
103+
fi
104+
100105
mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
101106
mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
102107
echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt

0 commit comments

Comments
 (0)