Skip to content

Commit 3a478c2

Browse files
rbren and enyst authored
Better LLM retry behavior (OpenHands#6557)
Co-authored-by: Engel Nyst <[email protected]>
1 parent 82b5325 commit 3a478c2

7 files changed

Lines changed: 67 additions & 59 deletions

File tree

docs/modules/usage/llms/llms.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,22 +63,22 @@ We have a few guides for running OpenHands with specific model providers:
6363
### API retries and rate limits
6464

6565
LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically
66-
retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.
66+
retry requests if it receives a Rate Limit Error (429 error code).
6767

6868
You can customize these options as you need for the provider you're using. Check their documentation, and set the
6969
following environment variables to control the number of retries and the time between retries:
7070

71-
- `LLM_NUM_RETRIES` (Default of 8)
72-
- `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
73-
- `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
71+
- `LLM_NUM_RETRIES` (Default of 4)
72+
- `LLM_RETRY_MIN_WAIT` (Default of 5 seconds)
73+
- `LLM_RETRY_MAX_WAIT` (Default of 30 seconds)
7474
- `LLM_RETRY_MULTIPLIER` (Default of 2)
7575

7676
If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:
7777

7878
```toml
7979
[llm]
80-
num_retries = 8
81-
retry_min_wait = 15
82-
retry_max_wait = 120
80+
num_retries = 4
81+
retry_min_wait = 5
82+
retry_max_wait = 30
8383
retry_multiplier = 2
8484
```

frontend/src/i18n/translation.json

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3803,6 +3803,37 @@
38033803
"pt": "Erro ao autenticar com o provedor LLM. Por favor, verifique sua chave API",
38043804
"tr": "LLM sağlayıcısı ile kimlik doğrulama hatası. Lütfen API anahtarınızı kontrol edin"
38053805
},
3806+
"STATUS$ERROR_LLM_SERVICE_UNAVAILABLE": {
3807+
"en": "The LLM provider is currently unavailable. Please try again later.",
3808+
"es": "El proveedor LLM no está actualmente disponible. Por favor, inténtelo de nuevo más tarde.",
3809+
"zh-CN": "LLM提供商当前不可用,请稍后再试。",
3810+
"zh-TW": "LLM提供商目前無法使用,請稍後再試。",
3811+
"ko-KR": "LLM 공급자가 현재 사용 불가능합니다. 나중에 다시 시도해주세요.",
3812+
"ja": "LLMプロバイダーが現在利用できません。後でもう一度試してください。",
3813+
"no": "LLM-leverandøren er nå ikke tilgjengelig. Vennligst prøv igjen senere.",
3814+
"ar": "المزود LLM غير متاح حالياً. يرجى المحاولة مرة أخرى لاحقًا.",
3815+
"de": "Der LLM-Anbieter ist derzeit nicht verfügbar. Bitte versuchen Sie es später erneut.",
3816+
"fr": "Le fournisseur LLM n'est actuellement pas disponible. Veuillez réessayer plus tard.",
3817+
"it": "Il provider LLM non è attualmente disponibile. Per favore, riprova più tardi.",
3818+
"pt": "O provedor LLM não está atualmente disponível. Por favor, tente novamente mais tarde.",
3819+
"tr": "LLM sağlayıcısı şu anda kullanılamıyor. Lütfen daha sonra tekrar deneyin."
3820+
},
3821+
"STATUS$ERROR_LLM_INTERNAL_SERVER_ERROR": {
3822+
"en": "The request failed with an internal server error.",
3823+
"es": "La solicitud falló con un error del servidor interno.",
3824+
"zh-CN": "请求失败,请稍后再试",
3825+
"zh-TW": "請求失敗,請稍後再試",
3826+
"ko-KR": "요청이 실패했습니다. 나중에 다시 시도해주세요.",
3827+
"ja": "リクエストが内部サーバーエラーで失敗しました。後でもう一度試してください。",
3828+
"no": "Forespørselen mislyktes med en intern serverfeil. Vennligst prøv igjen senere.",
3829+
"ar": "فشل الطلب بسبب خطأ داخلي في الخادم. يرجى المحاولة مرة أخرى لاحقًا.",
3830+
"de": "Die Anfrage ist mit einem internen Serverfehler fehlgeschlagen.",
3831+
"fr": "La requête a échoué avec une erreur interne du serveur.",
3832+
"it": "La richiesta non è riuscita a causa di un errore interno del server.",
3833+
"pt": "A solicitação falhou com um erro interno do servidor.",
3834+
"tr": "İstek, dahili bir sunucu hatasıyla başarısız oldu."
3835+
},
3836+
38063837
"STATUS$ERROR_RUNTIME_DISCONNECTED": {
38073838
"en": "There was an error while connecting to the runtime. Please refresh the page.",
38083839
"zh-CN": "运行时已断开连接",
@@ -3820,7 +3851,18 @@
38203851
},
38213852
"STATUS$LLM_RETRY": {
38223853
"en": "Retrying LLM request",
3823-
"zh-TW": "重新嘗試 LLM 請求中"
3854+
"es": "Reintentando solicitud LLM",
3855+
"zh-CN": "重试LLM请求",
3856+
"zh-TW": "重試LLM請求",
3857+
"ko-KR": "LLM 요청 재시도",
3858+
"ja": "LLM リクエストを再試行中",
3859+
"no": "Gjenforsøker LLM-forespørsel",
3860+
"ar": "تتم إعادة محاولة طلب LLM",
3861+
"de": "LLM-Anfrage erneut versuchen",
3862+
"fr": "Réessayer la requête LLM",
3863+
"it": "Ritenta la richiesta LLM",
3864+
"pt": "Tentando novamente a solicitação LLM",
3865+
"tr": "LLM isteğini yeniden deniyor"
38243866
},
38253867
"AGENT_ERROR$BAD_ACTION": {
38263868
"en": "Agent tried to execute a malformed action.",

openhands/controller/agent_controller.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,17 @@ async def _react_to_exception(
214214
err_id = ''
215215
if isinstance(e, litellm.AuthenticationError):
216216
err_id = 'STATUS$ERROR_LLM_AUTHENTICATION'
217+
elif isinstance(
218+
e,
219+
(
220+
litellm.ServiceUnavailableError,
221+
litellm.APIConnectionError,
222+
litellm.APIError,
223+
),
224+
):
225+
err_id = 'STATUS$ERROR_LLM_SERVICE_UNAVAILABLE'
226+
elif isinstance(e, litellm.InternalServerError):
227+
err_id = 'STATUS$ERROR_LLM_INTERNAL_SERVER_ERROR'
217228
elif isinstance(e, RateLimitError):
218229
await self.set_agent_state_to(AgentState.RATE_LIMITED)
219230
return

openhands/core/config/llm_config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,11 @@ class LLMConfig(BaseModel):
5959
aws_region_name: str | None = Field(default=None)
6060
openrouter_site_url: str = Field(default='https://docs.all-hands.dev/')
6161
openrouter_app_name: str = Field(default='OpenHands')
62-
num_retries: int = Field(default=8)
62+
# total wait time: 5 + 10 + 20 + 30 = 65 seconds
63+
num_retries: int = Field(default=4)
6364
retry_multiplier: float = Field(default=2)
64-
retry_min_wait: int = Field(default=15)
65-
retry_max_wait: int = Field(default=120)
65+
retry_min_wait: int = Field(default=5)
66+
retry_max_wait: int = Field(default=30)
6667
timeout: int | None = Field(default=None)
6768
max_message_chars: int = Field(
6869
default=30_000

openhands/llm/llm.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,7 @@
1818
from litellm import completion as litellm_completion
1919
from litellm import completion_cost as litellm_completion_cost
2020
from litellm.exceptions import (
21-
APIConnectionError,
22-
APIError,
23-
InternalServerError,
2421
RateLimitError,
25-
ServiceUnavailableError,
2622
)
2723
from litellm.types.utils import CostPerToken, ModelResponse, Usage
2824
from litellm.utils import create_pretrained_tokenizer
@@ -41,15 +37,7 @@
4137
__all__ = ['LLM']
4238

4339
# tuple of exceptions to retry on
44-
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
45-
APIConnectionError,
46-
# FIXME: APIError is useful on 502 from a proxy for example,
47-
# but it also retries on other errors that are permanent
48-
APIError,
49-
InternalServerError,
50-
RateLimitError,
51-
ServiceUnavailableError,
52-
)
40+
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (RateLimitError,)
5341

5442
# cache prompt supporting models
5543
# remove this when we gemini and deepseek are supported

tests/unit/test_llm.py

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,7 @@
33

44
import pytest
55
from litellm.exceptions import (
6-
APIConnectionError,
7-
InternalServerError,
86
RateLimitError,
9-
ServiceUnavailableError,
107
)
118

129
from openhands.core.config import LLMConfig
@@ -187,21 +184,6 @@ def test_completion_with_mocked_logger(
187184
@pytest.mark.parametrize(
188185
'exception_class,extra_args,expected_retries',
189186
[
190-
(
191-
APIConnectionError,
192-
{'llm_provider': 'test_provider', 'model': 'test_model'},
193-
2,
194-
),
195-
(
196-
InternalServerError,
197-
{'llm_provider': 'test_provider', 'model': 'test_model'},
198-
2,
199-
),
200-
(
201-
ServiceUnavailableError,
202-
{'llm_provider': 'test_provider', 'model': 'test_model'},
203-
2,
204-
),
205187
(RateLimitError, {'llm_provider': 'test_provider', 'model': 'test_model'}, 2),
206188
],
207189
)
@@ -254,22 +236,6 @@ def test_completion_rate_limit_wait_time(mock_litellm_completion, default_config
254236
), f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
255237

256238

257-
@patch('openhands.llm.llm.litellm_completion')
258-
def test_completion_exhausts_retries(mock_litellm_completion, default_config):
259-
mock_litellm_completion.side_effect = APIConnectionError(
260-
'Persistent error', llm_provider='test_provider', model='test_model'
261-
)
262-
263-
llm = LLM(config=default_config)
264-
with pytest.raises(APIConnectionError):
265-
llm.completion(
266-
messages=[{'role': 'user', 'content': 'Hello!'}],
267-
stream=False,
268-
)
269-
270-
assert mock_litellm_completion.call_count == llm.config.num_retries
271-
272-
273239
@patch('openhands.llm.llm.litellm_completion')
274240
def test_completion_operation_cancelled(mock_litellm_completion, default_config):
275241
mock_litellm_completion.side_effect = OperationCancelled('Operation cancelled')

tests/unit/test_llm_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def test_load_from_toml_llm_missing_generic(
188188
assert custom_only.model == 'custom-only-model'
189189
assert custom_only.api_key.get_secret_value() == 'custom-only-api-key'
190190
assert custom_only.embedding_model == 'local' # default value
191-
assert custom_only.num_retries == 8 # default value
191+
assert custom_only.num_retries == 4 # default value
192192

193193

194194
def test_load_from_toml_llm_invalid_config(

0 commit comments

Comments
 (0)