Skip to content

Commit 563ebd4

Browse files
authored
Fix: Add missing arguments for SSHBox in evaluation (OpenHands#3075)
* Fix WebArena evaluation script to connect to SSH session
* Update run_infer.py
* Add missing arguments for DockerSSHBox
1 parent 1eb3bde commit 563ebd4

5 files changed

Lines changed: 43 additions & 5 deletions

File tree

evaluation/agent_bench/run_infer.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,14 @@ def process_instance(
9999
# create sandbox and run the agent
100100
# =============================================
101101

102-
sandbox = DockerSSHBox()
102+
sandbox = DockerSSHBox(
103+
config=config.sandbox,
104+
persist_sandbox=False,
105+
workspace_mount_path=config.workspace_mount_path,
106+
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
107+
cache_dir=config.cache_dir,
108+
run_as_devin=config.run_as_devin,
109+
)
103110
sandbox.execute(f'cd {inst_id}')
104111

105112
init_cmd = instance.init

evaluation/logic_reasoning/run_infer.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,15 @@ def process_instance(
173173

174174
# use a session id for concurrent evaluation
175175
sid = instance['id'] + '_' + str(os.getpid())
176-
sandbox = DockerSSHBox(sid=sid)
176+
sandbox = DockerSSHBox(
177+
config=config.sandbox,
178+
persist_sandbox=False,
179+
workspace_mount_path=config.workspace_mount_path,
180+
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
181+
cache_dir=config.cache_dir,
182+
run_as_devin=config.run_as_devin,
183+
sid=sid,
184+
)
177185
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
178186

179187
# Here's how you can run the agent (similar to the `main` function) and get the final task state

evaluation/mint/run_infer.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,15 @@ def process_instance(
101101

102102
# use a session id for concurrent processing
103103
sid = instance.task_id + '_' + str(os.getpid())
104-
sandbox = DockerSSHBox(sid=sid)
104+
sandbox = DockerSSHBox(
105+
config=config.sandbox,
106+
persist_sandbox=False,
107+
workspace_mount_path=config.workspace_mount_path,
108+
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
109+
cache_dir=config.cache_dir,
110+
run_as_devin=config.run_as_devin,
111+
sid=sid,
112+
)
105113

106114
requirements_host_src = 'evaluation/mint/requirements.txt'
107115
requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'

evaluation/ml_bench/run_infer.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,15 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
112112

113113
# Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
114114
sid = str(instance['id']) + '_' + str(os.getpid())
115-
sandbox = DockerSSHBox(sid=sid)
115+
sandbox = DockerSSHBox(
116+
config=config.sandbox,
117+
persist_sandbox=False,
118+
workspace_mount_path=config.workspace_mount_path,
119+
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
120+
cache_dir=config.cache_dir,
121+
run_as_devin=config.run_as_devin,
122+
sid=sid,
123+
)
116124

117125
# Set up the task environment
118126
sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')

evaluation/webarena/run_infer.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,14 @@
3434
def get_sandbox():
3535
global docker_ssh_box
3636
if docker_ssh_box is None:
37-
docker_ssh_box = DockerSSHBox()
37+
docker_ssh_box = DockerSSHBox(
38+
config=config.sandbox,
39+
persist_sandbox=False,
40+
workspace_mount_path=config.workspace_mount_path,
41+
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
42+
cache_dir=config.cache_dir,
43+
run_as_devin=config.run_as_devin,
44+
)
3845
return docker_ssh_box
3946

4047

0 commit comments

Comments
 (0)