"""Regression tests for the universal "unsupported temperature" retry in
``agent.auxiliary_client``.

Auxiliary callers (context compression, session search,
web extract summarisation, etc.) hardcode ``temperature=0.3`` for historical
reasons. Several provider/model combinations reject ``temperature`` with a
400:

* OpenAI Responses (gpt-5/o-series reasoning models)
* Copilot Responses (reasoning models)
* OpenRouter reasoning models (gpt-5.5, some anthropic via OAI-compat)
* Anthropic Opus 4.7+ via OpenAI-compat endpoints
* Kimi/Moonshot (server-managed)

``_fixed_temperature_for_model`` catches Kimi up front, and
``build_chat_completion_kwargs`` drops temperature for Anthropic Opus 4.7+,
but the same backend can accept ``temperature`` for some models and reject
it for others (for example gpt-5.4 accepts but gpt-5.5 rejects on the same
endpoint). An allow/deny-list is not maintainable across providers.

The universal fix is reactive: when a call returns an
``Unsupported parameter: temperature`` 400, retry once without temperature.
These tests lock in that behaviour for both sync and async paths.
"""

from unittest.mock import patch, MagicMock, AsyncMock

import pytest

from agent.auxiliary_client import (
    call_llm,
    async_call_llm,
    _is_unsupported_temperature_error,
)
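

# ---------------------------------------------------------------------------
# Illustrative sketch only -- NOT the shipped code path. This is the retry
# shape the tests below lock in, reduced to its essence; ``create`` stands in
# for ``client.chat.completions.create``. The real ``call_llm`` /
# ``async_call_llm`` wrap this with provider resolution, client caching, and
# response validation, and may be structured differently.
def _retry_without_temperature_sketch(create, **kwargs):
    """Call ``create``; on an unsupported-temperature 400, retry exactly once
    with ``temperature`` dropped. Any other error propagates unchanged."""
    try:
        return create(**kwargs)
    except Exception as exc:
        if "temperature" in kwargs and _is_unsupported_temperature_error(exc):
            # Drop only the rejected parameter; everything else is preserved.
            retry_kwargs = {k: v for k, v in kwargs.items() if k != "temperature"}
            return create(**retry_kwargs)
        raise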


class TestIsUnsupportedTemperatureError:
    """The detector must match the phrasings providers actually return."""

    @pytest.mark.parametrize("message", [
        # OpenAI / Codex Responses
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'message': \"Unsupported parameter: 'temperature'\"}}",
        # Copilot / OpenAI error-code form
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        # OpenRouter-style
        "Provider returned error: temperature is not supported for this model",
        "this model does not support temperature",
        # Anthropic-style via OAI-compat
        "temperature: unknown parameter",
        # Some gateways
        "unrecognized request argument supplied: temperature",
    ])
    def test_matches_real_provider_messages(self, message):
        assert _is_unsupported_temperature_error(RuntimeError(message)) is True

    @pytest.mark.parametrize("message", [
        # Unrelated 400s must NOT trigger a silent retry
        "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'...",
        "max_tokens is too large for this model",
        "Rate limit exceeded",
        "Connection reset by peer",
        # A temperature *value* error is a different class of problem
        "temperature must be between 0 and 2",
    ])
    def test_does_not_match_unrelated_errors(self, message):
        assert _is_unsupported_temperature_error(RuntimeError(message)) is False
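

# ---------------------------------------------------------------------------
# Illustrative sketch only -- an assumed implementation, not the shipped
# detector. A substring heuristic consistent with every case parametrized
# above: the message must mention ``temperature`` alongside an
# "unsupported/unknown parameter" phrasing. The real
# ``_is_unsupported_temperature_error`` may be stricter or structured
# differently; this merely documents the contract the tests above pin down.
def _detector_sketch(exc: Exception) -> bool:
    msg = str(exc).lower()
    if "temperature" not in msg:
        return False
    return any(phrase in msg for phrase in (
        "unsupported parameter",      # "Unsupported parameter: temperature"
        "unsupported_parameter",      # {'code': 'unsupported_parameter', ...}
        "not supported",              # "temperature is not supported ..."
        "does not support",           # "this model does not support temperature"
        "unknown parameter",          # "temperature: unknown parameter"
        "unrecognized request argument",
    ))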


def _dummy_response():
    # The real code calls _validate_llm_response, which inspects
    # response.choices[0].message. The tests here patch that out, so
    # any sentinel object is fine.
    return {"ok": True}


class TestCallLlmUnsupportedTemperatureRetry:
    """``call_llm`` retries once without temperature and returns on success."""

    def _setup(self, first_exc):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create.side_effect = [first_exc, _dummy_response()]
        return client

    @pytest.mark.parametrize("error_message", [
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        "Provider error: this model does not support temperature",
    ])
    def test_retries_once_without_temperature(self, error_message):
        client = self._setup(RuntimeError(error_message))

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
        ):
            result = call_llm(
                task="compression",
                messages=[{"role": "user", "content": "remember this"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert client.chat.completions.create.call_count == 2
        first_kwargs = client.chat.completions.create.call_args_list[0].kwargs
        retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        # other kwargs preserved on the retry
        assert retry_kwargs["max_tokens"] == 500

    def test_non_temperature_400_does_not_retry_as_temperature(self):
        """Unrelated 400s (e.g. a bad tool role) must not silently drop temp."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        non_temp_err = RuntimeError(
            "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'..."
        )
        client.chat.completions.create.side_effect = non_temp_err

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError, match="Invalid value"):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )

        # Should NOT have retried (a non-temperature 400 does not match)
        assert client.chat.completions.create.call_count == 1

    def test_no_retry_when_temperature_not_in_kwargs(self):
        """If the caller didn't send temperature, don't invent a temperature retry."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        # Provider complains about temperature even though we didn't send it
        # (pathological, but possible with misleading error text). The guard
        # ``"temperature" in kwargs`` must prevent an unnecessary retry.
        err = RuntimeError("HTTP 400: Unsupported parameter: temperature")
        client.chat.completions.create.side_effect = err

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=None,  # explicit: no temperature sent
                    max_tokens=500,
                )

        assert client.chat.completions.create.call_count == 1


class TestAsyncCallLlmUnsupportedTemperatureRetry:
    """``async_call_llm`` mirrors the sync retry semantics."""

    @pytest.mark.asyncio
    async def test_async_retries_once_without_temperature(self):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create = AsyncMock(side_effect=[
            RuntimeError("HTTP 400: Unsupported parameter: temperature"),
            _dummy_response(),
        ])

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
        ):
            result = await async_call_llm(
                task="session_search",
                messages=[{"role": "user", "content": "query"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert client.chat.completions.create.await_count == 2
        first_kwargs = client.chat.completions.create.call_args_list[0].kwargs
        retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        assert retry_kwargs["max_tokens"] == 500

    @pytest.mark.asyncio
    async def test_async_non_temperature_400_does_not_retry(self):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create = AsyncMock(
            side_effect=RuntimeError("HTTP 400: Invalid value: 'tool'"),
        )

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError, match="Invalid value"):
                await async_call_llm(
                    task="session_search",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )

        assert client.chat.completions.create.await_count == 1