Skip to content

Commit 836864f

Browse files
[feat] Integrate BrowserGym (OpenHands#1452)
* add a single-threaded server serving browsergym * update poetry * update browser page content * add import to make sure browsergym environments are registered properly * remove flask server, use multiprocess impl and Pipe * fix * refactor BrowserEnv * update browser action and obs to include more complete info * fix screenshot * update poetry lock * add playwright install to workflow * update * add better html to text conversion * update for better text conversion to maintain parity with the current handling of browseurlaction * update * update poetry * update multiprocessing mp * fix multiprocessing * update * update github workflow --------- Co-authored-by: Xingyao Wang <[email protected]>
1 parent 0d77f49 commit 836864f

11 files changed

Lines changed: 877 additions & 285 deletions

File tree

.github/workflows/dummy-agent-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ jobs:
1515
run: |
1616
curl -sSL https://install.python-poetry.org | python3 -
1717
poetry install --without evaluation
18+
poetry run playwright install --with-deps chromium
1819
wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
1920
- name: Run tests
2021
run: |

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ build-frontend:
159159
# Start backend
160160
start-backend:
161161
@echo "$(YELLOW)Starting backend...$(RESET)"
162-
@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude workspace/*
162+
@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
163163

164164
# Start frontend
165165
start-frontend:
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
---
2+
sidebar_label: browser_env
3+
title: opendevin.browser.browser_env
4+
---
5+
6+
## BrowserEnv Objects
7+
8+
```python
9+
class BrowserEnv()
10+
```
11+
12+
#### image\_to\_png\_base64\_url
13+
14+
```python
15+
@staticmethod
16+
def image_to_png_base64_url(image: np.ndarray | Image.Image)
17+
```
18+
19+
Convert a numpy array or PIL image to a base64-encoded PNG string (the bare base64 payload, without a `data:` URL prefix).
20+

docs/modules/python/sidebar.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,13 @@
8080
"label": "opendevin.action",
8181
"type": "category"
8282
},
83+
{
84+
"items": [
85+
"python/opendevin/browser/browser_env"
86+
],
87+
"label": "opendevin.browser",
88+
"type": "category"
89+
},
8390
{
8491
"items": [
8592
"python/opendevin/controller/agent_controller"

opendevin/action/browse.py

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
1-
import base64
21
import os
32
from dataclasses import dataclass
43
from typing import TYPE_CHECKING
54

6-
from playwright.async_api import async_playwright
7-
85
from opendevin.observation import BrowserOutputObservation
96
from opendevin.schema import ActionType
107

@@ -25,29 +22,21 @@ async def run(self, controller: 'AgentController') -> BrowserOutputObservation:
2522
if not asked_url.startswith('http'):
2623
asked_url = os.path.abspath(os.curdir) + self.url
2724
try:
28-
async with async_playwright() as p:
29-
browser = await p.chromium.launch()
30-
page = await browser.new_page()
31-
response = await page.goto(asked_url)
32-
try:
33-
# domcontentloaded: Wait for the DOMContentLoaded event to be fired.
34-
# load: Wait for the load event to be fired.
35-
# networkidle: Wait until there are no more network connections
36-
await page.wait_for_load_state('networkidle', timeout=3000)
37-
except TimeoutError:
38-
pass
39-
# content = await page.content()
40-
inner_text = await page.evaluate('() => document.body.innerText')
41-
screenshot_bytes = await page.screenshot(full_page=True)
42-
await browser.close()
43-
44-
screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8')
45-
return BrowserOutputObservation(
46-
content=inner_text, # HTML content of the page
47-
screenshot=screenshot_base64, # Base64-encoded screenshot
48-
url=asked_url,
49-
status_code=response.status if response else 0, # HTTP status code
50-
)
25+
# action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
26+
action_str = f'goto("{asked_url}")'
27+
# obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
28+
obs = controller.browser.step(action_str)
29+
return BrowserOutputObservation(
30+
content=obs['text_content'], # text content of the page
31+
open_pages_urls=obs['open_pages_urls'], # list of open pages
32+
active_page_index=obs['active_page_index'], # index of the active page
33+
dom_object=obs['dom_object'], # DOM object
34+
axtree_object=obs['axtree_object'], # accessibility tree object
35+
last_browser_action=obs['last_action'], # last browser env action performed
36+
focused_element_bid=obs['focused_element_bid'], # focused element bid
37+
screenshot=obs['screenshot'], # base64-encoded screenshot, png
38+
url=asked_url,
39+
)
5140
except Exception as e:
5241
return BrowserOutputObservation(
5342
content=str(e), screenshot='', error=True, url=asked_url

opendevin/browser/__init__.py

Whitespace-only changes.

opendevin/browser/browser_env.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import atexit
2+
import base64
3+
import io
4+
import multiprocessing
5+
import time
6+
import uuid
7+
8+
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
9+
import gymnasium as gym
10+
import html2text
11+
import numpy as np
12+
from browsergym.utils.obs import flatten_dom_to_str
13+
from PIL import Image
14+
15+
from opendevin.logger import opendevin_logger as logger
16+
17+
18+
class BrowserException(Exception):
    """Error raised when the browser environment reports a failed action."""
20+
21+
class BrowserEnv:
    """Drive a BrowserGym ``browsergym/openended`` environment in a child process.

    The parent (agent) side talks to the child over a multiprocessing pipe.
    Every request carries a unique id, so a late reply to an earlier request
    that already timed out is discarded instead of being mistaken for the
    reply to the current request.
    """

    def __init__(self):
        # HTML -> text converter used to build obs['text_content'].
        self.html_text_converter = html2text.HTML2Text()
        # ignore links and images
        self.html_text_converter.ignore_links = True
        self.html_text_converter.ignore_images = True
        # use alt text for images
        self.html_text_converter.images_to_alt = True
        # disable auto text wrapping
        self.html_text_converter.body_width = 0
        # Initialize browser environment process
        # NOTE(review): this forces the *process-wide* start method to 'spawn'
        # on every construction; kept as-is because other code may rely on it.
        multiprocessing.set_start_method('spawn', force=True)
        self.browser_side, self.agent_side = multiprocessing.Pipe()
        self.process = multiprocessing.Process(target=self.browser_process)
        logger.info('Starting browser env...')
        self.process.start()
        # Make sure the child process is shut down when the interpreter exits.
        atexit.register(self.close)

    def browser_process(self):
        """Serve action requests from the pipe until a shutdown is requested.

        This is the target of the child process. Each request is a
        ``(request_id, {'action': str})`` tuple; the reply is
        ``(request_id, observation_dict)``. A request id of ``'SHUTDOWN'``
        ends the loop. The gym environment is always closed on the way out,
        including on ``KeyboardInterrupt`` and on unexpected errors.
        """
        env = gym.make(
            'browsergym/openended',
            start_url='about:blank',
            wait_for_user_message=False,
            headless=True,
            disable_env_checker=True,
        )
        try:
            env.reset()
            logger.info('Browser env started.')
            while True:
                if not self.browser_side.poll(timeout=0.01):
                    continue
                unique_request_id, action_data = self.browser_side.recv()
                # shutdown the browser environment
                if unique_request_id == 'SHUTDOWN':
                    return
                try:
                    obs = self._run_action(env, action_data['action'])
                except Exception as e:
                    # One failing action must not kill the browser process:
                    # report it through the same channel as BrowserGym's own
                    # action errors so that step() raises BrowserException
                    # immediately instead of timing out.
                    logger.error(f'Browser action failed: {e}')
                    obs = {'last_action_error': f'{type(e).__name__}: {e}'}
                self.browser_side.send((unique_request_id, obs))
        except KeyboardInterrupt:
            logger.info('Browser env process interrupted by user.')
        finally:
            env.close()

    def _run_action(self, env, action: str) -> dict:
        """Execute one action in ``env`` and return a pipe-friendly observation."""
        obs, *_ = env.step(action)
        # add text content of the page
        html_str = flatten_dom_to_str(obs['dom_object'])
        obs['text_content'] = self.html_text_converter.handle(html_str)
        # make observation serializable
        obs['screenshot'] = self.image_to_png_base64_url(obs['screenshot'])
        # presumably numpy scalars/0-d arrays; .item() yields plain Python values
        obs['active_page_index'] = obs['active_page_index'].item()
        obs['elapsed_time'] = obs['elapsed_time'].item()
        return obs

    def step(self, action_str: str, timeout: float = 10) -> dict:
        """Send one action to the browser process and wait for its observation.

        Args:
            action_str: BrowserGym action string, e.g. ``goto("https://...")``.
            timeout: Maximum number of seconds to wait for the reply.

        Returns:
            The observation dict produced by the browser process.

        Raises:
            TimeoutError: If no matching reply arrives within ``timeout``.
            BrowserException: If the observation reports ``last_action_error``.
        """
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {'action': action_str}))
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + timeout
        while True:
            remaining = deadline - time.monotonic()
            if remaining < 0:
                raise TimeoutError('Browser environment took too long to respond.')
            # Block until data arrives or the deadline passes (no 10 ms busy wake-ups).
            if not self.agent_side.poll(timeout=remaining):
                continue
            response_id, obs = self.agent_side.recv()
            if response_id != unique_request_id:
                # Stale reply to an earlier request that already timed out.
                continue
            if obs['last_action_error']:
                raise BrowserException(obs['last_action_error'])
            return obs

    def close(self, timeout: float = 5):
        """Shut down the browser process; safe to call more than once.

        A graceful ``SHUTDOWN`` request is sent first. If the process has not
        exited after ``timeout`` seconds (for example because it is stuck
        inside an action) it is terminated, so this method -- which is also
        registered with ``atexit`` -- cannot block interpreter exit forever.

        Args:
            timeout: Seconds to wait for a graceful shutdown before terminating.
        """
        if not self.process.is_alive():
            return
        try:
            self.agent_side.send(('SHUTDOWN', None))
            self.process.join(timeout)
        except OSError:
            # The pipe is already unusable; fall through to terminate().
            pass
        if self.process.is_alive():
            self.process.terminate()
            self.process.join()

    @staticmethod
    def image_to_png_base64_url(image: 'np.ndarray | Image.Image') -> str:
        """Encode a numpy array or PIL image as a base64 PNG string.

        Despite the method name, the result is the bare base64 payload; no
        ``data:image/png;base64,`` prefix is added. Images in ``RGBA`` or
        ``LA`` mode are converted to ``RGB`` before encoding.
        """
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        if image.mode in ('RGBA', 'LA'):
            image = image.convert('RGB')
        buffered = io.BytesIO()
        image.save(buffered, format='PNG')
        return base64.b64encode(buffered.getvalue()).decode()

opendevin/controller/agent_controller.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
)
1313
from opendevin.action.tasks import TaskStateChangedAction
1414
from opendevin.agent import Agent
15+
from opendevin.browser.browser_env import BrowserEnv
1516
from opendevin.controller.action_manager import ActionManager
1617
from opendevin.exceptions import (
1718
AgentMalformedActionError,
@@ -43,6 +44,7 @@ class AgentController:
4344
max_iterations: int
4445
action_manager: ActionManager
4546
callbacks: List[Callable]
47+
browser: BrowserEnv
4648

4749
delegate: 'AgentController | None' = None
4850
state: State | None = None
@@ -67,6 +69,9 @@ def __init__(
6769
self.callbacks = callbacks
6870
# Initialize agent-required plugins for sandbox (if any)
6971
self.action_manager.init_sandbox_plugins(agent.sandbox_plugins)
72+
# Initialize browser environment
73+
self.browser = BrowserEnv()
74+
7075

7176
if isinstance(agent, CodeActAgent) and not isinstance(self.action_manager.sandbox, DockerSSHBox):
7277
logger.warning('CodeActAgent requires DockerSSHBox as sandbox! Using other sandbox that are not stateful (LocalBox, DockerExecBox) will not work properly.')

opendevin/observation/browse.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from dataclasses import dataclass
1+
from dataclasses import dataclass, field
22

33
from opendevin.schema import ObservationType
44

@@ -16,6 +16,25 @@ class BrowserOutputObservation(Observation):
1616
status_code: int = 200
1717
error: bool = False
1818
observation: str = ObservationType.BROWSE
19+
# do not include in the memory
20+
open_pages_urls: list = field(default_factory=list)
21+
active_page_index: int = -1
22+
dom_object: dict = field(default_factory=dict)
23+
axtree_object: dict = field(default_factory=dict)
24+
last_browser_action: str = ''
25+
focused_element_bid: str = ''
26+
27+
def to_memory(self) -> dict:
    """Return the memory dict of this observation without the bulky browser fields.

    The DOM/accessibility trees and page bookkeeping fields are dropped from
    ``extras`` because they are currently too big for LLMs.
    """
    # TODO: find a more elegant way to handle this
    oversized_fields = (
        'dom_object',
        'axtree_object',
        'open_pages_urls',
        'active_page_index',
        'last_browser_action',
        'focused_element_bid',
    )
    memory_dict = super().to_memory()
    extras = memory_dict['extras']
    for field_name in oversized_fields:
        # pop with a default: a field missing from extras is not an error
        extras.pop(field_name, None)
    return memory_dict
1938

2039
@property
2140
def message(self) -> str:

0 commit comments

Comments
 (0)