
Commit e7b5ddf

Add integration test framework with mock llm (OpenHands#1301)
* Add integration test framework with mock llm
* Fix MonologueAgent and PlannerAgent tests
* Remove adhoc logging
* Use existing logs
* Fix SWEAgent and PlannerAgent
* Check-in test log files
* conftest: look up under test name folder only
* Add docstring to conftest
* Finish dev doc
* Avoid non-determinism
* Remove dependency on llm embedding model
* Init embedding model only for MonologueAgent
* Add adhoc fix for sandbox discrepancy
* Test ssh and exec sandboxes
* CI: fix missing sandbox type
* conftest: Remove hack
* Reword comment for TODO
1 parent bf5a2af commit e7b5ddf

48 files changed

Lines changed: 4053 additions & 30 deletions

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@

```yaml
name: Run Integration Tests

on: [push, pull_request]

jobs:
  on-linux:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - name: SWEAgent-py311-ssh
            python-version: "3.11"
            agent: "SWEAgent"
            embedding-model: "none"
            sandbox: "ssh"
          - name: PlannerAgent-py311-ssh
            python-version: "3.11"
            agent: "PlannerAgent"
            embedding-model: "none"
            sandbox: "ssh"
          - name: MonologueAgent-py311-ssh
            python-version: "3.11"
            agent: "MonologueAgent"
            embedding-model: "local"
            sandbox: "ssh"
          - name: CodeActAgent-py311-ssh
            python-version: "3.11"
            agent: "CodeActAgent"
            embedding-model: "none"
            sandbox: "ssh"
          - name: SWEAgent-py311-exec
            python-version: "3.11"
            agent: "SWEAgent"
            embedding-model: "none"
            sandbox: "exec"
          - name: PlannerAgent-py311-exec
            python-version: "3.11"
            agent: "PlannerAgent"
            embedding-model: "none"
            sandbox: "exec"
          - name: MonologueAgent-py311-exec
            python-version: "3.11"
            agent: "MonologueAgent"
            embedding-model: "local"
            sandbox: "exec"
          - name: CodeActAgent-py311-exec
            python-version: "3.11"
            agent: "CodeActAgent"
            embedding-model: "none"
            sandbox: "exec"
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Poetry
        run: curl -sSL https://install.python-poetry.org | python3 -
      - name: Build Environment
        run: make build
      - name: Run Integration Tests
        env:
          SANDBOX_TYPE: ${{ matrix.sandbox }}
          AGENT: ${{ matrix.agent }}
          MAX_ITERATIONS: 10
          LLM_EMBEDDING_MODEL: ${{ matrix.embedding-model }}
        run: |
          rm -rf workspace
          mkdir workspace
          WORKSPACE_BASE="$GITHUB_WORKSPACE/workspace" WORKSPACE_MOUNT_PATH="$GITHUB_WORKSPACE/workspace" poetry run pytest -s ./tests/integration
```

.github/workflows/run-tests.yml

Lines changed: 0 additions & 20 deletions
This file was deleted.
Lines changed: 3 additions & 3 deletions
```diff
@@ -1,4 +1,4 @@
-name: Build & Run Tests
+name: Run Unit Tests
 
 on: [push, pull_request]
 
@@ -26,7 +26,7 @@ jobs:
       - name: Build Environment
         run: make build
       - name: Run Tests
-        run: poetry run pytest ./tests
+        run: poetry run pytest ./tests/unit
   on-linux:
     runs-on: ubuntu-latest
     strategy:
@@ -44,4 +44,4 @@ jobs:
       - name: Build Environment
         run: make build
       - name: Run Tests
-        run: poetry run pytest ./tests
+        run: poetry run pytest ./tests/unit
```

.gitignore

Lines changed: 0 additions & 1 deletion
```diff
@@ -57,7 +57,6 @@ cover/
 *.pot
 
 # Django stuff:
-*.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
```

CONTRIBUTING.md

Lines changed: 3 additions & 2 deletions
```diff
@@ -85,5 +85,6 @@ Please refer to the README in each module:
 - [mock server](./opendevin/mock/README.md)
 
 ## Tests
-TODO: make sure code pass the test before submit.
+Please navigate to `tests` folder to see existing test suites.
+At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on CI to ensure quality of
+the project.
```

agenthub/monologue_agent/utils/memory.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -73,6 +73,11 @@ def wrapper_get_embeddings(*args, **kwargs):
             azure_endpoint=config.get('LLM_BASE_URL', required=True),
             api_version=config.get('LLM_API_VERSION', required=True),
         )
+    elif (embedding_strategy is not None) and (embedding_strategy.lower() == 'none'):
+        # TODO: this works but is not elegant enough. The incentive is when
+        # monologue agent is not used, there is no reason we need to initialize an
+        # embedding model
+        embed_model = None
     else:
         from llama_index.embeddings.huggingface import HuggingFaceEmbedding
         embed_model = HuggingFaceEmbedding(
```
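The intent of the diff above is that setting `LLM_EMBEDDING_MODEL` to `none` skips embedding-model initialization entirely for agents that never use it. A minimal sketch of that selection logic, with hypothetical names and return values standing in for the real llama_index model objects:

```python
def resolve_embedding_model(strategy):
    """Illustrative sketch (not the actual OpenDevin code): pick an embedding
    backend from the LLM_EMBEDDING_MODEL setting."""
    if strategy is not None and strategy.lower() == 'none':
        # SWEAgent/PlannerAgent runs: no embedding model is needed at all.
        return None
    if strategy in ('openai', 'azureopenai'):
        return strategy  # stand-in for the corresponding API-backed model
    # Default: stand-in for a local HuggingFace embedding model.
    return 'local-huggingface'
```

With this shape, only MonologueAgent test configurations (which set the strategy to `local`) pay the cost of loading a model.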

opendevin/config.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -95,7 +95,7 @@ def get_parser():
     parser.add_argument(
         '-c',
         '--agent-cls',
-        default='MonologueAgent',
+        default=config.get(ConfigType.AGENT),
         type=str,
         help='The agent class to use',
     )
```
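The change means the `-c` default now follows the configured `AGENT` value instead of being hard-coded. The pattern looks roughly like this (a simplified sketch: `CONFIG` is a hypothetical stand-in for opendevin's config lookup):

```python
import argparse

# Hypothetical stand-in for opendevin's config store.
CONFIG = {'AGENT': 'MonologueAgent'}

parser = argparse.ArgumentParser()
parser.add_argument(
    '-c', '--agent-cls',
    default=CONFIG['AGENT'],  # previously hard-coded to 'MonologueAgent'
    type=str,
    help='The agent class to use',
)
args = parser.parse_args([])  # no -c given: falls back to the config value
```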

opendevin/llm/llm.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,4 +1,3 @@
-
 from litellm import completion as litellm_completion
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
 from litellm.exceptions import APIConnectionError, RateLimitError, ServiceUnavailableError
```

opendevin/main.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -20,11 +20,13 @@ def read_task_from_stdin() -> str:
     return sys.stdin.read()
 
 
-async def main():
+async def main(task_str: str = ''):
     """Main coroutine to run the agent controller with task input flexibility."""
 
     # Determine the task source
-    if args.file:
+    if task_str:
+        task = task_str
+    elif args.file:
         task = read_task_from_file(args.file)
     elif args.task:
         task = args.task
```
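This signature change lets tests drive the entry point programmatically instead of going through CLI parsing. A hedged sketch of the calling pattern (a simplified stand-in for `opendevin/main.py`, not the real function body):

```python
import asyncio

async def main(task_str: str = ''):
    # Simplified stand-in: an explicit task string takes priority over
    # --file/--task command-line arguments.
    task = task_str if task_str else '<task resolved from CLI args>'
    return task

# An integration test can now pass the task directly:
result = asyncio.run(main("Write a shell script 'hello.sh' that prints 'hello'."))
```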

tests/integration/README.md

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@

## Introduction

This folder contains backend integration tests that rely on a mock LLM. It serves
two purposes:

1. Ensure the quality of development, including the OpenDevin framework and agents.
2. Help contributors learn the workflow of OpenDevin and see examples of real
interactions with a (powerful) LLM, without spending real money.

Why don't we launch an open-source model, e.g. LLAMA3? There are two reasons:

1. LLMs cannot guarantee determinism, meaning the test behavior might change.
2. CI machines are not powerful enough to run any LLM that is sophisticated enough
to finish the tasks defined in the tests.

Note: integration tests are orthogonal to evaluations/benchmarks, as they serve
different purposes. Although benchmarks could also capture bugs, some of which may
not be caught by tests, benchmarks require real LLMs, which are non-deterministic
and costly. We run the integration test suite for every single commit, which is
not possible with benchmarks.

Known limitations:

1. To avoid the potential impact of non-determinism, we remove all special
characters and numbers (often used as PIDs) when doing the comparison. If two
prompts for the same task differ only in non-alpha characters, the wrong mock
response might be picked up.
2. The agent itself must not do anything non-deterministic, including but not
limited to using randomly generated numbers.
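The alpha-only comparison described in limitation 1 can be sketched as follows (`normalize_prompt` is a hypothetical name; the real comparison lives in `conftest.py`):

```python
import re

def normalize_prompt(text: str) -> str:
    # Keep letters only, so PIDs, timestamps, and punctuation cannot make a
    # live prompt miss its recorded counterpart.
    return re.sub(r'[^a-zA-Z]', '', text)

# Two prompts that differ only in a PID compare equal:
normalize_prompt('Process 1234 started.') == normalize_prompt('Process 99 started.')  # True
```

The flip side, as noted above, is that two genuinely different prompts that differ only in non-alpha characters would also collide.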

The folder is organised as follows:

```
├── README.md
├── conftest.py
├── mock
│   ├── [AgentName]
│   │   └── [TestName]
│   │       ├── prompt_*.log
│   │       ├── response_*.log
└── [TestFiles].py
```

where `conftest.py` defines the infrastructure needed to load real-world LLM prompts
and responses for mocking purposes. Prompts and responses generated during real runs
of agents with real LLMs are stored under the `mock/AgentName/TestName` folders.
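A minimal sketch of what such a mock might look like (hypothetical class and names; the real machinery in `conftest.py` loads `prompt_*.log`/`response_*.log` pairs from disk and patches the LLM):

```python
import re

def _norm(text: str) -> str:
    # Letters-only comparison, matching the known limitation described above.
    return re.sub(r'[^a-zA-Z]', '', text)

class MockLLM:
    """Answer each live prompt with the recorded response whose normalized
    prompt text matches."""

    def __init__(self, recorded):
        # recorded: {prompt_text: response_text}, e.g. loaded from log files.
        self._responses = {_norm(p): r for p, r in recorded.items()}

    def completion(self, prompt: str) -> str:
        return self._responses[_norm(prompt)]

mock = MockLLM({'Write hello.sh (pid 1234)': 'echo hello > hello.sh'})
# A later run with a different pid still finds the recorded response:
mock.completion('Write hello.sh (pid 42)')  # -> 'echo hello > hello.sh'
```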
## Run Integration Tests

Take a look at `run-integration-tests.yml` to learn how integration tests are
launched in the CI environment. Assuming you want to use `workspace` for testing,
an example is as follows:

```bash
rm -rf workspace; AGENT=PlannerAgent \
WORKSPACE_BASE="/Users/admin/OpenDevin/workspace" WORKSPACE_MOUNT_PATH="/Users/admin/OpenDevin/workspace" MAX_ITERATIONS=10 \
poetry run pytest -s ./tests/integration
```

Note: in order to run integration tests correctly, please ensure your workspace is empty.

## Write Integration Tests

To write an integration test, there are essentially two steps:

1. Decide on your task prompt and the result you want to verify.
2. Either construct the LLM responses by hand, or run OpenDevin with a real LLM. The
system prompts and LLM responses are recorded as logs, which you can then copy to the
test folder.

The following paragraphs describe how to do this.

Your `config.toml` should look like this:

```toml
LLM_MODEL="gpt-4-turbo"
LLM_API_KEY="<your-api-key>"
LLM_EMBEDDING_MODEL="openai"
WORKSPACE_MOUNT_PATH="<absolute-path-of-your-workspace>"
```

You can choose any model you'd like to generate the mock responses. You can even
handcraft mock responses, which is especially useful when you want to test an
agent's behaviour in corner cases. If you use a very weak model (e.g. one with 8B
parameters), chances are most agents won't be able to finish the task.

```bash
# Remove logs if you are okay with losing them. This helps us locate the prompts
# and responses quickly, but is NOT a must.
rm -rf logs
# Clear the workspace, otherwise OpenDevin might not be able to reproduce your
# prompts in the CI environment. Feel free to change the workspace name and path.
# Be sure to set `WORKSPACE_MOUNT_PATH` to the same absolute path.
rm -rf workspace
mkdir workspace
# Depending on the complexity of the task you want to test, you can change the
# iteration limit. Change the agent accordingly. If you are adding a new test,
# try generating mock responses for every agent.
poetry run python ./opendevin/main.py -i 10 -t "Write a shell script 'hello.sh' that prints 'hello'." -c "MonologueAgent" -d "./workspace"
```

After running the above commands, you should be able to locate the logged real
prompts and responses. The log folder follows the `logs/llm/%y-%m-%d_%H-%M.log`
format.

Now, move all files under that folder to the
`tests/integration/mock/<AgentName>/<TestName>` folder. For example, move all files
from the `logs/llm/24-04-23_21-55/` folder to the
`tests/integration/mock/MonologueAgent/test_write_simple_script` folder.

That's it, you are good to go! When you launch an integration test, the mock
responses are loaded and used in place of a real LLM, so we get deterministic and
consistent behavior, and most importantly, without spending real money.