Skip to content

Commit d0384ca

Browse files
authored
Two fixes to swe bench eval (OpenHands#2831)
* Two fixes to swe bench eval
* Add error message
* Change dumping of metadata
1 parent 3a3694c commit d0384ca

15 files changed

Lines changed: 19 additions & 18 deletions

File tree

evaluation/EDA/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def process_instance(
163163
'instance_id': instance['text'].strip(),
164164
'instance': instance,
165165
'instruction': instruction,
166-
'metadata': metadata,
166+
'metadata': metadata.model_dump(),
167167
'history': [
168168
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
169169
],

evaluation/agent_bench/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def process_instance(
189189
'instance_id': inst_id,
190190
'instance': instance.to_dict(),
191191
'instruction': instruction,
192-
'metadata': metadata,
192+
'metadata': metadata.model_dump(),
193193
'history': histories,
194194
'metrics': metrics,
195195
'error': state.last_error if state and state.last_error else None,

evaluation/biocoder/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ def process_instance(
202202
'biocoder_instance': instance.to_dict(),
203203
'instruction': instruction,
204204
'generated': test_result['metadata']['1_copy_change_code'],
205-
'metadata': metadata,
205+
'metadata': metadata.model_dump(),
206206
'history': [
207207
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
208208
],

evaluation/bird/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def execute_sql(db_path, sql):
249249
output = {
250250
'task_id': instance.task_id,
251251
'instruction': instruction,
252-
'metadata': metadata,
252+
'metadata': metadata.model_dump(),
253253
'history': [
254254
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
255255
],

evaluation/gaia/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def process_instance(
171171
'instance_id': instance['task_id'],
172172
'instance': instance,
173173
'instruction': instance['Question'],
174-
'metadata': metadata,
174+
'metadata': metadata.model_dump(),
175175
'history': [
176176
(event_to_dict(action), event_to_dict(obs))
177177
for action, obs in state.history

evaluation/gorilla/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
150150
'hallucination': hallucination,
151151
'answer_id': 'None',
152152
'model_id': metadata['model_name'],
153-
'metadata': metadata,
153+
'metadata': metadata.model_dump(),
154154
'history': [
155155
(event_to_dict(action), event_to_dict(obs))
156156
for action, obs in state.history

evaluation/gpqa/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def process_instance(
236236
'task_id': instance.task_id,
237237
'instance_id': instance.instance_id,
238238
'instruction': instruction,
239-
'metadata': metadata,
239+
'metadata': metadata.model_dump(),
240240
'history': [
241241
(event_to_dict(action), event_to_dict(obs))
242242
for action, obs in state.history

evaluation/humanevalfix/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def process_instance(
206206
output = {
207207
'task_id': instance.task_id,
208208
'instruction': instruction,
209-
'metadata': metadata,
209+
'metadata': metadata.model_dump(),
210210
'history': [
211211
(event_to_dict(action), event_to_dict(obs))
212212
for action, obs in state.history

evaluation/logic_reasoning/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def process_instance(
222222
'id': instance['id'],
223223
'instance': instance,
224224
'instruction': instruction,
225-
# 'metadata': metadata,
225+
# 'metadata': metadata.model_dump(),
226226
'history': [
227227
(event_to_dict(action), event_to_dict(obs))
228228
for action, obs in state.history

evaluation/miniwob/run_infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def process_instance(
114114
output = {
115115
'instance_id': env_id,
116116
'instruction': instruction,
117-
'metadata': metadata,
117+
'metadata': metadata.model_dump(),
118118
'history': [
119119
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
120120
],

0 commit comments

Comments (0)