[Evaluation] Log empty patch stats for SWE-Bench (OpenHands#2776)

xingyaoww · web-flow · commit e6cdf18d3b09 · 2024-07-05T07:03:27.000+08:00
* bump swebench version since the fix PR is merged

* add empty generation stats from latest pr

* delete eval_outputs if it already exists

* handle non-string patch
diff --git a/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py b/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py
@@ -15,6 +15,9 @@
 
 
 def process_git_patch(patch):
+    if not isinstance(patch, str):
+        return ''
+
     if not patch.strip():
         # skip empty patches
         return ''
diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py
@@ -15,7 +15,9 @@
 df = pd.read_json(args.input_file, lines=True)
 
 output_md_filepath = os.path.join(dirname, 'README.md')
-instance_id_to_status = defaultdict(lambda: {'resolved': False})
+instance_id_to_status = defaultdict(
+    lambda: {'resolved': False, 'empty_generation': False}
+)
 if os.path.exists(report_json):
     with open(report_json, 'r') as f:
         report = json.load(f)
@@ -25,7 +27,9 @@
         "This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
         "## Summary\n"
         f"- total instances: {report['total_instances']}\n"
+        f"- submitted instances: {report['submitted_instances']}\n"
         f"- completed instances: {report['completed_instances']}\n"
+        f"- empty patch instances: {report['empty_patch_instances']}\n"
         f"- resolved instances: {report['resolved_instances']}\n"
         f"- unresolved instances: {report['unresolved_instances']}\n"
         f"- error instances: {report['error_instances']}\n"
@@ -53,6 +57,19 @@
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )
 
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in report['empty_patch_ids']:
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += (
+            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
+        )
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in report['incomplete_ids']:
+        output_md += (
+            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
+        )
+
     # Apply the status to the dataframe
     def apply_report(row):
         instance_id = row['instance_id']
diff --git a/evaluation/swe_bench/scripts/eval_infer.sh b/evaluation/swe_bench/scripts/eval_infer.sh
@@ -97,6 +97,11 @@ if [ -z "$INSTANCE_ID" ]; then
 
     # move the eval results to the target directory
     mkdir -p $RESULT_OUTPUT_DIR
+    # rm eval_outputs directory if it exists
+    if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then
+        rm -rf $RESULT_OUTPUT_DIR/eval_outputs
+    fi
+
     mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
     mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
     echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt