fix(auxiliary): retry without temperature when any provider rejects it

Universal reactive fix for 'HTTP 400: Unsupported parameter: temperature'
across all providers/models — not just Codex Responses.

The same backend can accept temperature for some models and reject it for
others (e.g. gpt-5.4 accepts but gpt-5.5 rejects on the same OpenAI
endpoint; similar patterns on Copilot, OpenRouter reasoning routes, and
Anthropic Opus 4.7+ via OAI-compat). An allow/deny-list by model name does
not scale.

call_llm / async_call_llm now detect the concrete 'unsupported parameter:
temperature' 400 and transparently retry once without temperature. Kimi's
server-managed omission and Opus 4.7+'s proactive strip stay in place —
this is the safety net for everything else.
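
In sketch form, the reactive path is one guarded retry around the provider
call. Illustrative only: _call_with_temperature_fallback and create_fn are
made-up names, not helpers in this repo, and the real code additionally
threads a failed retry into the existing max_tokens/payment/auth chains:

    from agent.auxiliary_client import _is_unsupported_temperature_error

    def _call_with_temperature_fallback(create_fn, **kwargs):
        # Try the call as-is; if the provider rejects `temperature` with an
        # "unsupported parameter"-style 400, retry once without it.
        try:
            return create_fn(**kwargs)
        except Exception as exc:
            if "temperature" in kwargs and _is_unsupported_temperature_error(exc):
                retry_kwargs = {k: v for k, v in kwargs.items() if k != "temperature"}
                return create_fn(**retry_kwargs)
            raise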

Changes:
- agent/auxiliary_client.py: add _is_unsupported_temperature_error helper;
  wire into both sync and async call_llm paths before the existing
  max_tokens/payment/auth retry ladder
- tests/agent/test_unsupported_temperature_retry.py: 19 tests covering
  detector phrasings, sync + async retry, no-retry-without-temperature,
  and non-temperature 400s not triggering the retry
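
The new tests can be run on their own with (assuming the repo's usual
pytest / pytest-asyncio setup):

    pytest tests/agent/test_unsupported_temperature_retry.py -q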

Builds on PR #15620 (codex_responses fallback) which stripped temperature
up front for that one api_mode. This PR closes the gap for every other
provider/model combo via reactive retry.

Credit: retry approach and detector originate from @BlueBirdBack's PR #15578.

Co-authored-by: BlueBirdBack <BlueBirdBack@users.noreply.github.com>
Author:    Ash Rowan Vale 🌿
Date:      2026-04-25 05:25:09 -07:00
Committer: Teknium
Parent:    f67a61dc93
Commit:    facea84559

2 changed files with 315 additions and 1 deletion

agent/auxiliary_client.py

@@ -1349,6 +1349,32 @@ def _is_auth_error(exc: Exception) -> bool:
return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
def _is_unsupported_temperature_error(exc: Exception) -> bool:
"""Detect API errors where the selected model rejects `temperature`.
Triggered by provider responses like:
* OpenAI / Codex Responses — ``Unsupported parameter: temperature``
* Copilot reasoning models — ``unsupported_parameter`` with temperature
* OpenRouter reasoning models — ``does not support temperature``
* Anthropic Opus 4.7+ via OpenAI-compat — ``temperature is not supported``
The same backend can accept temperature for some models and reject it for
others (e.g. gpt-5.4 accepts, gpt-5.5 rejects on the same endpoint), so we
react to the concrete error rather than maintaining a model allowlist.
"""
err_lower = str(exc).lower()
if "temperature" not in err_lower:
return False
return any(marker in err_lower for marker in (
"unsupported parameter",
"unsupported_parameter",
"not supported",
"does not support",
"unknown parameter",
"unrecognized request argument",
))
def _evict_cached_clients(provider: str) -> None:
"""Drop cached auxiliary clients for a provider so fresh creds are used."""
normalized = _normalize_aux_provider(provider)
@@ -2952,11 +2978,39 @@ def call_llm(
    if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
        kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])

    # Handle max_tokens vs max_completion_tokens retry, then payment fallback.
    # Handle unsupported temperature, max_tokens vs max_completion_tokens retry,
    # then payment fallback.
    try:
        return _validate_llm_response(
            client.chat.completions.create(**kwargs), task)
    except Exception as first_err:
        if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
            retry_kwargs = dict(kwargs)
            retry_kwargs.pop("temperature", None)
            logger.info(
                "Auxiliary %s: provider rejected temperature; retrying once without it",
                task or "call",
            )
            try:
                return _validate_llm_response(
                    client.chat.completions.create(**retry_kwargs), task)
            except Exception as retry_err:
                retry_err_str = str(retry_err)
                # If retry still fails, fall through to the max_tokens /
                # payment / auth chains below using the temperature-stripped
                # kwargs. Re-raise only if the retry hit something those
                # chains won't handle.
                if not (
                    _is_payment_error(retry_err)
                    or _is_connection_error(retry_err)
                    or _is_auth_error(retry_err)
                    or "max_tokens" in retry_err_str
                    or "unsupported_parameter" in retry_err_str
                ):
                    raise
                first_err = retry_err
                kwargs = retry_kwargs
        err_str = str(first_err)
        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
            kwargs.pop("max_tokens", None)
@@ -3221,6 +3275,29 @@ async def async_call_llm(
        return _validate_llm_response(
            await client.chat.completions.create(**kwargs), task)
    except Exception as first_err:
        if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
            retry_kwargs = dict(kwargs)
            retry_kwargs.pop("temperature", None)
            logger.info(
                "Auxiliary %s (async): provider rejected temperature; retrying once without it",
                task or "call",
            )
            try:
                return _validate_llm_response(
                    await client.chat.completions.create(**retry_kwargs), task)
            except Exception as retry_err:
                retry_err_str = str(retry_err)
                if not (
                    _is_payment_error(retry_err)
                    or _is_connection_error(retry_err)
                    or _is_auth_error(retry_err)
                    or "max_tokens" in retry_err_str
                    or "unsupported_parameter" in retry_err_str
                ):
                    raise
                first_err = retry_err
                kwargs = retry_kwargs
        err_str = str(first_err)
        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
            kwargs.pop("max_tokens", None)

tests/agent/test_unsupported_temperature_retry.py

@@ -0,0 +1,237 @@
"""Regression tests for the universal "unsupported temperature" retry in
``agent.auxiliary_client``.
Auxiliary callers (``flush_memories``, context compression, session search,
web extract summarisation, etc.) hardcode ``temperature=0.3`` for historical
reasons. Several provider/model combinations reject ``temperature`` with a
400:
* OpenAI Responses (gpt-5/o-series reasoning models)
* Copilot Responses (reasoning models)
* OpenRouter reasoning models (gpt-5.5, some anthropic via OAI-compat)
* Anthropic Opus 4.7+ via OpenAI-compat endpoints
* Kimi/Moonshot (server-managed)
``_fixed_temperature_for_model`` catches Kimi up front, and
``build_chat_completion_kwargs`` drops temperature for Anthropic Opus 4.7+,
but the same backend can accept ``temperature`` for some models and reject
it for others (for example gpt-5.4 accepts but gpt-5.5 rejects on the same
endpoint). An allow/deny-list is not maintainable across providers.
The universal fix is reactive: when a call returns an
``Unsupported parameter: temperature`` 400, retry once without temperature.
These tests lock in that behaviour for both sync and async paths.
"""
from unittest.mock import patch, MagicMock, AsyncMock
import pytest
from agent.auxiliary_client import (
call_llm,
async_call_llm,
_is_unsupported_temperature_error,
)
class TestIsUnsupportedTemperatureError:
    """The detector must match the phrasings providers actually return."""

    @pytest.mark.parametrize("message", [
        # OpenAI / Codex Responses
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'message': \"Unsupported parameter: 'temperature'\"}}",
        # Copilot / OpenAI error-code form
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        # OpenRouter-style
        "Provider returned error: temperature is not supported for this model",
        "this model does not support temperature",
        # Anthropic-style via OAI-compat
        "temperature: unknown parameter",
        # Some gateways
        "unrecognized request argument supplied: temperature",
    ])
    def test_matches_real_provider_messages(self, message):
        assert _is_unsupported_temperature_error(RuntimeError(message)) is True

    @pytest.mark.parametrize("message", [
        # Unrelated 400s must NOT trigger a silent-retry
        "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'...",
        "max_tokens is too large for this model",
        "Rate limit exceeded",
        "Connection reset by peer",
        # Temperature value error is a different class of problem
        "temperature must be between 0 and 2",
    ])
    def test_does_not_match_unrelated_errors(self, message):
        assert _is_unsupported_temperature_error(RuntimeError(message)) is False


def _dummy_response():
    # The real code calls _validate_llm_response which inspects
    # response.choices[0].message. The tests here patch that out, so
    # any sentinel object is fine.
    return {"ok": True}
class TestCallLlmUnsupportedTemperatureRetry:
    """``call_llm`` retries once without temperature and returns on success."""

    def _setup(self, first_exc):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create.side_effect = [first_exc, _dummy_response()]
        return client

    @pytest.mark.parametrize("error_message", [
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        "Provider error: this model does not support temperature",
    ])
    def test_retries_once_without_temperature(self, error_message):
        client = self._setup(RuntimeError(error_message))
        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
        ):
            result = call_llm(
                task="flush_memories",
                messages=[{"role": "user", "content": "remember this"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert client.chat.completions.create.call_count == 2

        first_kwargs = client.chat.completions.create.call_args_list[0].kwargs
        retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        # other kwargs preserved
        assert retry_kwargs["max_tokens"] == 500

    def test_non_temperature_400_does_not_retry_as_temperature(self):
        """Unrelated 400s (e.g. bad tool role) must not silently drop temp."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        non_temp_err = RuntimeError(
            "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'..."
        )
        client.chat.completions.create.side_effect = non_temp_err
        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError, match="Invalid value"):
                call_llm(
                    task="flush_memories",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )

        # Should NOT have retried (non-temperature 400 doesn't match)
        assert client.chat.completions.create.call_count == 1

    def test_no_retry_when_temperature_not_in_kwargs(self):
        """If caller didn't send temperature, don't invent a temperature-retry."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        # Provider complains about temperature even though we didn't send it.
        # (Pathological but possible with misleading error text.) The guard
        # ``"temperature" in kwargs`` must prevent an unnecessary retry.
        err = RuntimeError("HTTP 400: Unsupported parameter: temperature")
        client.chat.completions.create.side_effect = err
        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError):
                call_llm(
                    task="flush_memories",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=None,  # explicit: no temperature sent
                    max_tokens=500,
                )

        assert client.chat.completions.create.call_count == 1
class TestAsyncCallLlmUnsupportedTemperatureRetry:
    """``async_call_llm`` mirror of the sync retry semantics."""

    @pytest.mark.asyncio
    async def test_async_retries_once_without_temperature(self):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create = AsyncMock(side_effect=[
            RuntimeError("HTTP 400: Unsupported parameter: temperature"),
            _dummy_response(),
        ])
        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
        ):
            result = await async_call_llm(
                task="session_search",
                messages=[{"role": "user", "content": "query"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert client.chat.completions.create.await_count == 2

        first_kwargs = client.chat.completions.create.call_args_list[0].kwargs
        retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        assert retry_kwargs["max_tokens"] == 500

    @pytest.mark.asyncio
    async def test_async_non_temperature_400_does_not_retry(self):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create = AsyncMock(
            side_effect=RuntimeError("HTTP 400: Invalid value: 'tool'"),
        )
        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError, match="Invalid value"):
                await async_call_llm(
                    task="session_search",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )

        assert client.chat.completions.create.await_count == 1