diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 5e8a60e765..8ea986f816 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -1349,6 +1349,32 @@ def _is_auth_error(exc: Exception) -> bool:
     return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
 
 
+def _is_unsupported_temperature_error(exc: Exception) -> bool:
+    """Detect API errors where the selected model rejects `temperature`.
+
+    Triggered by provider responses like:
+      * OpenAI / Codex Responses — ``Unsupported parameter: temperature``
+      * Copilot reasoning models — ``unsupported_parameter`` with temperature
+      * OpenRouter reasoning models — ``does not support temperature``
+      * Anthropic Opus 4.7+ via OpenAI-compat — ``temperature is not supported``
+
+    The same backend can accept temperature for some models and reject it for
+    others (e.g. gpt-5.4 accepts, gpt-5.5 rejects on the same endpoint), so we
+    react to the concrete error rather than maintaining a model allowlist.
+    """
+    err_lower = str(exc).lower()
+    if "temperature" not in err_lower:
+        return False
+    return any(marker in err_lower for marker in (
+        "unsupported parameter",
+        "unsupported_parameter",
+        "not supported",
+        "does not support",
+        "unknown parameter",
+        "unrecognized request argument",
+    ))
+
+
 def _evict_cached_clients(provider: str) -> None:
     """Drop cached auxiliary clients for a provider so fresh creds are used."""
     normalized = _normalize_aux_provider(provider)
@@ -2952,11 +2978,43 @@ def call_llm(
     if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
         kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
 
-    # Handle max_tokens vs max_completion_tokens retry, then payment fallback.
+    # Handle unsupported temperature, max_tokens vs max_completion_tokens retry,
+    # then payment fallback.
     try:
         return _validate_llm_response(
             client.chat.completions.create(**kwargs), task)
     except Exception as first_err:
+        if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
+            retry_kwargs = dict(kwargs)
+            retry_kwargs.pop("temperature", None)
+            logger.info(
+                "Auxiliary %s: provider rejected temperature; retrying once without it",
+                task or "call",
+            )
+            try:
+                return _validate_llm_response(
+                    client.chat.completions.create(**retry_kwargs), task)
+            except Exception as retry_err:
+                retry_err_str = str(retry_err)
+                # If retry still fails, fall through to the max_tokens /
+                # payment / auth chains below using the temperature-stripped
+                # kwargs. Re-raise only if the retry hit something those
+                # chains won't handle.
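+                # Example: a 402 payment error on the retry is not
+                # re-raised here; it still reaches the payment fallback
+                # below, while an unrelated 400 (say, a bad message
+                # role) re-raises immediately as the real failure.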
+                if not (
+                    _is_payment_error(retry_err)
+                    or _is_connection_error(retry_err)
+                    or _is_auth_error(retry_err)
+                    or "max_tokens" in retry_err_str
+                    or "unsupported_parameter" in retry_err_str
+                ):
+                    raise
+                first_err = retry_err
+                kwargs = retry_kwargs
+
         err_str = str(first_err)
         if "max_tokens" in err_str or "unsupported_parameter" in err_str:
             kwargs.pop("max_tokens", None)
@@ -3221,6 +3279,29 @@ async def async_call_llm(
         return _validate_llm_response(
             await client.chat.completions.create(**kwargs), task)
     except Exception as first_err:
+        if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
+            retry_kwargs = dict(kwargs)
+            retry_kwargs.pop("temperature", None)
+            logger.info(
+                "Auxiliary %s (async): provider rejected temperature; retrying once without it",
+                task or "call",
+            )
+            try:
+                return _validate_llm_response(
+                    await client.chat.completions.create(**retry_kwargs), task)
+            except Exception as retry_err:
+                retry_err_str = str(retry_err)
+                if not (
+                    _is_payment_error(retry_err)
+                    or _is_connection_error(retry_err)
+                    or _is_auth_error(retry_err)
+                    or "max_tokens" in retry_err_str
+                    or "unsupported_parameter" in retry_err_str
+                ):
+                    raise
+                first_err = retry_err
+                kwargs = retry_kwargs
+
         err_str = str(first_err)
         if "max_tokens" in err_str or "unsupported_parameter" in err_str:
             kwargs.pop("max_tokens", None)
diff --git a/tests/agent/test_unsupported_temperature_retry.py b/tests/agent/test_unsupported_temperature_retry.py
new file mode 100644
index 0000000000..1e22a4d801
--- /dev/null
+++ b/tests/agent/test_unsupported_temperature_retry.py
@@ -0,0 +1,241 @@
+"""Regression tests for the universal "unsupported temperature" retry in
+``agent.auxiliary_client``.
+
+Auxiliary callers (``flush_memories``, context compression, session search,
+web extract summarisation, etc.) hardcode ``temperature=0.3`` for historical
+reasons. Several provider/model combinations reject ``temperature`` with a
+400:
+  * OpenAI Responses (gpt-5/o-series reasoning models)
+  * Copilot Responses (reasoning models)
+  * OpenRouter reasoning models (gpt-5.5, some anthropic via OAI-compat)
+  * Anthropic Opus 4.7+ via OpenAI-compat endpoints
+  * Kimi/Moonshot (server-managed)
+
+``_fixed_temperature_for_model`` catches Kimi up front, and
+``build_chat_completion_kwargs`` drops temperature for Anthropic Opus 4.7+,
+but the same backend can accept ``temperature`` for some models and reject
+it for others (for example gpt-5.4 accepts but gpt-5.5 rejects on the same
+endpoint). An allow/deny-list is not maintainable across providers.
+
+The universal fix is reactive: when a call returns an
+``Unsupported parameter: temperature`` 400, retry once without temperature.
+These tests lock in that behaviour for both sync and async paths.
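+
+The tests stub the provider client with ``MagicMock``/``AsyncMock`` and patch
+``_resolve_task_provider_model``, ``_get_cached_client`` and
+``_validate_llm_response``, so only the retry control flow is exercised.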
+""" + +from unittest.mock import patch, MagicMock, AsyncMock + +import pytest + +from agent.auxiliary_client import ( + call_llm, + async_call_llm, + _is_unsupported_temperature_error, +) + + +class TestIsUnsupportedTemperatureError: + """The detector must match the phrasings providers actually return.""" + + @pytest.mark.parametrize("message", [ + # OpenAI / Codex Responses + "HTTP 400: Unsupported parameter: temperature", + "Error code: 400 - {'error': {'message': \"Unsupported parameter: 'temperature'\"}}", + # Copilot / OpenAI error-code form + "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}", + # OpenRouter-style + "Provider returned error: temperature is not supported for this model", + "this model does not support temperature", + # Anthropic-style via OAI-compat + "temperature: unknown parameter", + # Some gateways + "unrecognized request argument supplied: temperature", + ]) + def test_matches_real_provider_messages(self, message): + assert _is_unsupported_temperature_error(RuntimeError(message)) is True + + @pytest.mark.parametrize("message", [ + # Unrelated 400s must NOT trigger a silent-retry + "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'...", + "max_tokens is too large for this model", + "Rate limit exceeded", + "Connection reset by peer", + # Temperature value error is a different class of problem + "temperature must be between 0 and 2", + ]) + def test_does_not_match_unrelated_errors(self, message): + assert _is_unsupported_temperature_error(RuntimeError(message)) is False + + +def _dummy_response(): + # The real code calls _validate_llm_response which inspects + # response.choices[0].message. The tests here patch that out, so + # any sentinel object is fine. + return {"ok": True} + + +class TestCallLlmUnsupportedTemperatureRetry: + """``call_llm`` retries once without temperature and returns on success.""" + + def _setup(self, first_exc): + client = MagicMock() + client.base_url = "https://api.openai.com/v1" + client.chat.completions.create.side_effect = [first_exc, _dummy_response()] + return client + + @pytest.mark.parametrize("error_message", [ + "HTTP 400: Unsupported parameter: temperature", + "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}", + "Provider error: this model does not support temperature", + ]) + def test_retries_once_without_temperature(self, error_message): + client = self._setup(RuntimeError(error_message)) + + with ( + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("openai-codex", "gpt-5.5", None, None, None)), + patch("agent.auxiliary_client._get_cached_client", + return_value=(client, "gpt-5.5")), + patch("agent.auxiliary_client._validate_llm_response", + side_effect=lambda resp, _task: resp), + ): + result = call_llm( + task="flush_memories", + messages=[{"role": "user", "content": "remember this"}], + temperature=0.3, + max_tokens=500, + ) + + assert result == {"ok": True} + assert client.chat.completions.create.call_count == 2 + first_kwargs = client.chat.completions.create.call_args_list[0].kwargs + retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs + assert first_kwargs["temperature"] == 0.3 + assert "temperature" not in retry_kwargs + # other kwargs preserved + assert retry_kwargs["max_tokens"] == 500 + + def test_non_temperature_400_does_not_retry_as_temperature(self): + """Unrelated 400s (e.g. 
bad tool role) must not silently drop temp.""" + client = MagicMock() + client.base_url = "https://api.openai.com/v1" + non_temp_err = RuntimeError( + "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'..." + ) + client.chat.completions.create.side_effect = non_temp_err + + with ( + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("openai-codex", "gpt-5.5", None, None, None)), + patch("agent.auxiliary_client._get_cached_client", + return_value=(client, "gpt-5.5")), + patch("agent.auxiliary_client._validate_llm_response", + side_effect=lambda resp, _task: resp), + patch("agent.auxiliary_client._try_payment_fallback", + return_value=None), + ): + with pytest.raises(RuntimeError, match="Invalid value"): + call_llm( + task="flush_memories", + messages=[{"role": "user", "content": "x"}], + temperature=0.3, + max_tokens=500, + ) + # Should NOT have retried (non-temperature 400 doesn't match) + assert client.chat.completions.create.call_count == 1 + + def test_no_retry_when_temperature_not_in_kwargs(self): + """If caller didn't send temperature, don't invent a temperature-retry.""" + client = MagicMock() + client.base_url = "https://api.openai.com/v1" + # Provider complains about temperature even though we didn't send it. + # (Pathological but possible with misleading error text.) The guard + # ``"temperature" in kwargs`` must prevent an unnecessary retry. + err = RuntimeError("HTTP 400: Unsupported parameter: temperature") + client.chat.completions.create.side_effect = err + + with ( + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("openai-codex", "gpt-5.5", None, None, None)), + patch("agent.auxiliary_client._get_cached_client", + return_value=(client, "gpt-5.5")), + patch("agent.auxiliary_client._validate_llm_response", + side_effect=lambda resp, _task: resp), + patch("agent.auxiliary_client._try_payment_fallback", + return_value=None), + ): + with pytest.raises(RuntimeError): + call_llm( + task="flush_memories", + messages=[{"role": "user", "content": "x"}], + temperature=None, # explicit: no temperature sent + max_tokens=500, + ) + assert client.chat.completions.create.call_count == 1 + + +class TestAsyncCallLlmUnsupportedTemperatureRetry: + """``async_call_llm`` mirror of the sync retry semantics.""" + + @pytest.mark.asyncio + async def test_async_retries_once_without_temperature(self): + client = MagicMock() + client.base_url = "https://api.openai.com/v1" + client.chat.completions.create = AsyncMock(side_effect=[ + RuntimeError("HTTP 400: Unsupported parameter: temperature"), + _dummy_response(), + ]) + + with ( + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("openai-codex", "gpt-5.5", None, None, None)), + patch("agent.auxiliary_client._get_cached_client", + return_value=(client, "gpt-5.5")), + patch("agent.auxiliary_client._validate_llm_response", + side_effect=lambda resp, _task: resp), + ): + result = await async_call_llm( + task="session_search", + messages=[{"role": "user", "content": "query"}], + temperature=0.3, + max_tokens=500, + ) + + assert result == {"ok": True} + assert client.chat.completions.create.await_count == 2 + first_kwargs = client.chat.completions.create.call_args_list[0].kwargs + retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs + assert first_kwargs["temperature"] == 0.3 + assert "temperature" not in retry_kwargs + assert retry_kwargs["max_tokens"] == 500 + + @pytest.mark.asyncio + async def 
test_async_non_temperature_400_does_not_retry(self): + client = MagicMock() + client.base_url = "https://api.openai.com/v1" + client.chat.completions.create = AsyncMock( + side_effect=RuntimeError("HTTP 400: Invalid value: 'tool'"), + ) + + with ( + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("openai-codex", "gpt-5.5", None, None, None)), + patch("agent.auxiliary_client._get_cached_client", + return_value=(client, "gpt-5.5")), + patch("agent.auxiliary_client._validate_llm_response", + side_effect=lambda resp, _task: resp), + patch("agent.auxiliary_client._try_payment_fallback", + return_value=None), + ): + with pytest.raises(RuntimeError, match="Invalid value"): + await async_call_llm( + task="session_search", + messages=[{"role": "user", "content": "x"}], + temperature=0.3, + max_tokens=500, + ) + assert client.chat.completions.create.await_count == 1