tests: more Agentskills tests; updated .gitignore (OpenHands#2307)

tobitege · enyst · web-flow · commit b431fce93804 · 2024-06-07T16:29:03.000Z
* added tests related to backticks
* updated .gitignore
* added extra linter test for OpenHands#2210
* hotfix for integration test

---------

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
diff --git a/.gitignore b/.gitignore
@@ -161,9 +161,14 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 .vscode/
+.cursorignore
 
 # evaluation
+evaluation/evaluation_outputs
+evaluation/outputs
+evaluation/swe_bench/eval_workspace*
 evaluation/SWE-bench/data
+evaluation/webarena/scripts/webarena_env.sh
 
 # frontend
 
@@ -176,6 +181,8 @@ frontend/yarn.lock
 
 # testing
 frontend/coverage
+test_results*
+/_test_files_tmp/
 
 # production
 frontend/build
@@ -204,9 +211,3 @@ cache
 # configuration
 config.toml
 config.toml.bak
-evaluation/swe_bench/eval_workspace*
-evaluation/outputs
-evaluation/evaluation_outputs
-test_results*
-/_test_files_tmp/
-evaluation/webarena/scripts/webarena_env.sh
diff --git a/agenthub/browsing_agent/browsing_agent.py b/agenthub/browsing_agent/browsing_agent.py
@@ -117,10 +117,7 @@ def step(self, state: State) -> Action:
         error_prefix = ''
         last_obs = None
         last_action = None
-        if len(state.history) == 1:
-            # initialize and retrieve the first observation by issuing an noop OP
-            # TODO: need more elegant way of doing this
-            return BrowseInteractiveAction(browser_actions='noop()')
+
         for prev_action, obs in state.history:
             if isinstance(prev_action, BrowseInteractiveAction):
                 prev_actions.append(prev_action.browser_actions)
@@ -133,7 +130,7 @@ def step(self, state: State) -> Action:
                 # agent has responded, task finish.
                 return AgentFinishAction(outputs={'content': prev_action.content})
 
-        prev_action_str = '\n'.join(prev_actions[1:])
+        prev_action_str = '\n'.join(prev_actions)
         # if the final BrowserInteractiveAction exec BrowserGym's send_msg_to_user,
         # we should also send a message back to the user in OpenDevin and call it a day
         if (
diff --git a/opendevin/runtime/plugins/agent_skills/agentskills.py b/opendevin/runtime/plugins/agent_skills/agentskills.py
@@ -34,6 +34,9 @@
 
 ENABLE_AUTO_LINT = os.getenv('ENABLE_AUTO_LINT', 'false').lower() == 'true'
 
+# This is also used in unit tests!
+MSG_FILE_UPDATED = '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]'
+
 # OPENAI
 OPENAI_API_KEY = os.getenv(
     'OPENAI_API_KEY', os.getenv('SANDBOX_ENV_OPENAI_API_KEY', '')
@@ -311,6 +314,7 @@ def edit_file(start: int, end: int, content: str) -> None:
 
         lint_error = _lint_file(CURRENT_FILE)
         if lint_error:
+            # only change any literal strings here in combination with unit tests!
             print(
                 '[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]'
             )
@@ -351,9 +355,7 @@ def edit_file(start: int, end: int, content: str) -> None:
         f'[File: {os.path.abspath(CURRENT_FILE)} ({n_total_lines} lines total after edit)]'
     )
     _print_window(CURRENT_FILE, CURRENT_LINE, WINDOW)
-    print(
-        '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]'
-    )
+    print(MSG_FILE_UPDATED)
 
 
 @update_pwd_decorator
diff --git a/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_001.log b/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_001.log
@@ -114,7 +114,7 @@ Don't execute multiple actions at once if you need feedback from the page.
 ----------
 
 # Current Accessibility Tree:
-RootWebArea '', focused
+
 
 # Previous Actions
 
diff --git a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_003.log b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_003.log
@@ -121,7 +121,7 @@ RootWebArea 'The Ultimate Answer', focused
 	[10] button 'Click me', clickable
 
 # Previous Actions
-noop()
+goto('http://localhost:8000')
 
 Here is an example with chain of thought of a valid action when clicking on a button:
 "
diff --git a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_004.log b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_004.log
@@ -122,7 +122,7 @@ RootWebArea 'The Ultimate Answer', focused
 	StaticText 'The answer is OpenDevin is all you need!'
 
 # Previous Actions
-noop()
+goto('http://localhost:8000')
 click("10")
 
 Here is an example with chain of thought of a valid action when clicking on a button:
diff --git a/tests/unit/test_agent_skill.py b/tests/unit/test_agent_skill.py
@@ -6,6 +6,8 @@
 import pytest
 
 from opendevin.runtime.plugins.agent_skills.agentskills import (
+    MSG_FILE_UPDATED,
+    _print_window,
     create_file,
     edit_file,
     find_file,
@@ -274,6 +276,127 @@ def test_scroll_down_edge(tmp_path):
     assert result.split('\n') == expected.split('\n')
 
 
+def test_print_window_internal(tmp_path):
+    test_file_path = tmp_path / 'a.txt'
+    create_file(str(test_file_path))
+    open_file(str(test_file_path))
+    with open(test_file_path, 'w') as file:
+        for i in range(1, 101):
+            file.write(f'Line `{i}`\n')
+
+    # Define the parameters for the test
+    current_line = 50
+    window = 2
+
+    # Test _print_window especially with backticks
+    with io.StringIO() as buf:
+        with contextlib.redirect_stdout(buf):
+            _print_window(str(test_file_path), current_line, window, return_str=False)
+        result = buf.getvalue()
+        expected = (
+            '(49 more lines above)\n'
+            '50|Line `50`\n'
+            '51|Line `51`\n'
+            '(49 more lines below)\n'
+        )
+        assert result == expected
+
+
+def test_edit_file_window(tmp_path, monkeypatch):
+    # Set environment variable via monkeypatch does NOT work!
+    monkeypatch.setattr(
+        'opendevin.runtime.plugins.agent_skills.agentskills.ENABLE_AUTO_LINT', True
+    )
+
+    content = """def any_int(a, b, c):
+    return isinstance(a, int) and isinstance(b, int) and isinstance(c, int)
+
+def test_any_int():
+    assert any_int(1, 2, 3) == True
+    assert any_int(1.5, 2, 3) == False
+    assert any_int(1, 2.5, 3) == False
+    assert any_int(1, 2, 3.5) == False
+    assert any_int(1.0, 2, 3) == False
+    assert any_int(1, 2.0, 3) == False
+    assert any_int(1, 2, 3.0) == False
+    assert any_int(0, 0, 0) == True
+    assert any_int(-1, -2, -3) == True
+    assert any_int(1, -2, 3) == True
+    assert any_int(1.5, -2, 3) == False
+    assert any_int(1, -2.5, 3) == False
+
+def check(any_int):
+    # Check some simple cases
+    assert any_int(2, 3, 1)==True, "This prints if this assert fails 1 (good for debugging!)"
+    assert any_int(2.5, 2, 3)==False, "This prints if this assert fails 2 (good for debugging!)"
+    assert any_int(1.5, 5, 3.5)==False, "This prints if this assert fails 3 (good for debugging!)"
+    assert any_int(2, 6, 2)==False, "This prints if this assert fails 4 (good for debugging!)"
+    assert any_int(4, 2, 2)==True, "This prints if this assert fails 5 (good for debugging!)"
+    assert any_int(2.2, 2.2, 2.2)==False, "This prints if this assert fails 6 (good for debugging!)"
+    assert any_int(-4, 6, 2)==True, "This prints if this assert fails 7 (good for debugging!)"
+
+    # Check some edge cases that are easy to work out by hand.
+    assert any_int(2,1,1)==True, "This prints if this assert fails 8 (also good for debugging!)"
+    assert any_int(3,4,7)==True, "This prints if this assert fails 9 (also good for debugging!)"
+    assert any_int(3.0,4,7)==False, "This prints if this assert fails 10 (also good for debugging!)"
+
+check(any_int)"""
+
+    temp_file_path = tmp_path / 'error-test.py'
+    temp_file_path.write_text(content)
+
+    open_file(str(temp_file_path))
+
+    with io.StringIO() as buf:
+        with contextlib.redirect_stdout(buf):
+            edit_file(
+                start=9, end=9, content='        assert any_int(1.0, 2, 3) == False'
+            )
+        result = buf.getvalue()
+        expected = (
+            '[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]\n'
+            'ERRORS:\n'
+            + str(temp_file_path)
+            + ':9:9: '
+            + 'E999 IndentationError: unexpected indent\n'
+            '[This is how your edit would have looked if applied]\n'
+            '-------------------------------------------------\n'
+            '(5 more lines above)\n'
+            '6|    assert any_int(1.5, 2, 3) == False\n'
+            '7|    assert any_int(1, 2.5, 3) == False\n'
+            '8|    assert any_int(1, 2, 3.5) == False\n'
+            '9|        assert any_int(1.0, 2, 3) == False\n'
+            '10|    assert any_int(1, 2.0, 3) == False\n'
+            '11|    assert any_int(1, 2, 3.0) == False\n'
+            '12|    assert any_int(0, 0, 0) == True\n'
+            '13|    assert any_int(-1, -2, -3) == True\n'
+            '14|    assert any_int(1, -2, 3) == True\n'
+            '15|    assert any_int(1.5, -2, 3) == False\n'
+            '(18 more lines below)\n'
+            '-------------------------------------------------\n'
+            '\n'
+            '[This is the original code before your edit]\n'
+            '-------------------------------------------------\n'
+            '(5 more lines above)\n'
+            '6|    assert any_int(1.5, 2, 3) == False\n'
+            '7|    assert any_int(1, 2.5, 3) == False\n'
+            '8|    assert any_int(1, 2, 3.5) == False\n'
+            '9|    assert any_int(1.0, 2, 3) == False\n'
+            '10|    assert any_int(1, 2.0, 3) == False\n'
+            '11|    assert any_int(1, 2, 3.0) == False\n'
+            '12|    assert any_int(0, 0, 0) == True\n'
+            '13|    assert any_int(-1, -2, -3) == True\n'
+            '14|    assert any_int(1, -2, 3) == True\n'
+            '15|    assert any_int(1.5, -2, 3) == False\n'
+            '(18 more lines below)\n'
+            '-------------------------------------------------\n'
+            'Your changes have NOT been applied. Please fix your edit command and try again.\n'
+            'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
+            'DO NOT re-run the same failed edit command. Running it again will lead to the same error.\n'
+        )
+        assert result == expected
+
+
 def test_edit_file(tmp_path):
     temp_file_path = tmp_path / 'a.txt'
     content = 'Line 1\nLine 2\nLine 3\nLine 4\nLine 5'
@@ -289,8 +412,7 @@ def test_edit_file(tmp_path):
             f'[File: {temp_file_path} (3 lines total after edit)]\n'
             '1|REPLACE TEXT\n'
             '2|Line 4\n'
-            '3|Line 5\n'
-            '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]\n'
+            '3|Line 5\n' + MSG_FILE_UPDATED + '\n'
         )
         assert result.split('\n') == expected.split('\n')
 
@@ -313,8 +435,7 @@ def test_edit_file_from_scratch(tmp_path):
         result = buf.getvalue()
         expected = (
             f'[File: {temp_file_path} (1 lines total after edit)]\n'
-            '1|REPLACE TEXT\n'
-            '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]\n'
+            '1|REPLACE TEXT\n' + MSG_FILE_UPDATED + '\n'
         )
         assert result.split('\n') == expected.split('\n')
 
@@ -324,6 +445,65 @@ def test_edit_file_from_scratch(tmp_path):
     assert lines[0].rstrip() == 'REPLACE TEXT'
 
 
+def test_edit_file_from_scratch_multiline_with_backticks_and_second_edit(tmp_path):
+    temp_file_path = tmp_path / 'a.txt'
+    create_file(str(temp_file_path))
+    open_file(str(temp_file_path))
+
+    with io.StringIO() as buf:
+        with contextlib.redirect_stdout(buf):
+            edit_file(
+                1,
+                1,
+                '`REPLACE TEXT1`\n`REPLACE TEXT2`\n`REPLACE TEXT3`',
+            )
+        result = buf.getvalue()
+        expected = (
+            f'[File: {temp_file_path} (3 lines total after edit)]\n'
+            '1|`REPLACE TEXT1`\n'
+            '2|`REPLACE TEXT2`\n'
+            '3|`REPLACE TEXT3`\n' + MSG_FILE_UPDATED + '\n'
+        )
+        assert result.split('\n') == expected.split('\n')
+
+    with open(temp_file_path, 'r') as file:
+        lines = file.readlines()
+    assert len(lines) == 3
+    assert lines[0].rstrip() == '`REPLACE TEXT1`'
+    assert lines[1].rstrip() == '`REPLACE TEXT2`'
+    assert lines[2].rstrip() == '`REPLACE TEXT3`'
+
+    # Check that no backticks are escaped in the edit_file call
+    assert '\\`' not in result
+
+    # Perform a second edit
+    with io.StringIO() as buf:
+        with contextlib.redirect_stdout(buf):
+            edit_file(
+                1,
+                3,
+                '`REPLACED TEXT1`\n`REPLACED TEXT2`\n`REPLACED TEXT3`',
+            )
+        second_result = buf.getvalue()
+        second_expected = (
+            f'[File: {temp_file_path} (3 lines total after edit)]\n'
+            '1|`REPLACED TEXT1`\n'
+            '2|`REPLACED TEXT2`\n'
+            '3|`REPLACED TEXT3`\n' + MSG_FILE_UPDATED + '\n'
+        )
+        assert second_result.split('\n') == second_expected.split('\n')
+
+    with open(temp_file_path, 'r') as file:
+        lines = file.readlines()
+    assert len(lines) == 3
+    assert lines[0].rstrip() == '`REPLACED TEXT1`'
+    assert lines[1].rstrip() == '`REPLACED TEXT2`'
+    assert lines[2].rstrip() == '`REPLACED TEXT3`'
+
+    # Check that no backticks are escaped in the second edit_file call
+    assert '\\`' not in second_result
+
+
 def test_edit_file_from_scratch_multiline(tmp_path):
     temp_file_path = tmp_path / 'a.txt'
     create_file(str(temp_file_path))
@@ -341,8 +521,7 @@ def test_edit_file_from_scratch_multiline(tmp_path):
             f'[File: {temp_file_path} (3 lines total after edit)]\n'
             '1|REPLACE TEXT1\n'
             '2|REPLACE TEXT2\n'
-            '3|REPLACE TEXT3\n'
-            '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]\n'
+            '3|REPLACE TEXT3\n' + MSG_FILE_UPDATED + '\n'
         )
         assert result.split('\n') == expected.split('\n')
 
@@ -550,8 +729,7 @@ def test_edit_lint_file_pass(tmp_path, monkeypatch):
         '1|\n'
         f'[File: {file_path} (2 lines total after edit)]\n'
         "1|print('hello')\n"
-        '2|\n'
-        '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]\n'
+        '2|\n' + MSG_FILE_UPDATED + '\n'
     )
     assert result.split('\n') == expected.split('\n')
 
@@ -663,7 +841,7 @@ def test_lint_file_disabled_undefined_name(tmp_path, monkeypatch, capsys):
     file_path = tmp_path / 'test_file.py'
     file_path.write_text('\n')
 
-    # Set environment variable to enable linting
+    # Set environment variable to disable linting
     monkeypatch.setattr(
         'opendevin.runtime.plugins.agent_skills.agentskills.ENABLE_AUTO_LINT', False
     )
@@ -678,8 +856,7 @@ def test_lint_file_disabled_undefined_name(tmp_path, monkeypatch, capsys):
         '1|\n'
         f'[File: {file_path} (2 lines total after edit)]\n'
         '1|undefined_name()\n'
-        '2|\n'
-        '[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]\n'
+        '2|\n' + MSG_FILE_UPDATED + '\n'
     )
     assert result.split('\n') == expected.split('\n')
 

Original file line number	Diff line number	Diff line change
`@@ -121,7 +121,7 @@ RootWebArea 'The Ultimate Answer', focused`
`121`	`121`	`[10] button 'Click me', clickable`
`122`	`122`
`123`	`123`	`# Previous Actions`
`124`		`-noop()`
	`124`	`+goto('http://localhost:8000')`
`125`	`125`
`126`	`126`	`Here is an example with chain of thought of a valid action when clicking on a button:`
`127`	`127`	`"`