Mirror of https://github.com/NousResearch/hermes-agent.git (synced 2026-04-28 15:01:34 +08:00)
The AIAgent.flush_memories pre-compression save, the gateway _flush_memories_for_session, and everything feeding them are obsolete now that the background memory/skill review handles persistent memory extraction.

Problems with flush_memories:
- Pre-dates the background review loop. It was the only memory-save path when introduced; the background review now fires every 10 user turns on CLI and gateway alike, which is far more frequent than compression or session reset ever triggered flush.
- Blocking and synchronous. Pre-compression flush ran on the live agent before compression, blocking the user-visible response.
- Cache-breaking. Flush built a temporary conversation prefix (system prompt + memory-only tool list) that diverged from the live conversation's cached prefix, invalidating prompt caching. The gateway variant spawned a fresh AIAgent with its own clean prompt for each finalized session; still cache-breaking, just in a different process.
- Redundant. Background review runs in the live conversation's session context, gets the same content, writes to the same memory store, and doesn't break the cache. Everything flush_memories claimed to preserve is already covered.

What this removes:
- AIAgent.flush_memories() method (~248 LOC in run_agent.py)
- Pre-compression flush call in _compress_context
- flush_memories call sites in cli.py (/new + exit)
- GatewayRunner._flush_memories_for_session + _async_flush_memories (and the 3 call sites: session expiry watcher, /new, /resume)
- 'flush_memories' entry from DEFAULT_CONFIG auxiliary tasks, hermes tools UI task list, auxiliary_client docstrings
- _memory_flush_min_turns config + init
- #15631's headroom-deduction math in _check_compression_model_feasibility (headroom was only needed because flush dragged the full main-agent system prompt along; the compression summariser sends a single user-role prompt, so new_threshold = aux_context is safe again)
- The dedicated test files and assertions that exercised flush-specific paths

What this renames (with read-time backcompat on sessions.json):
- SessionEntry.memory_flushed -> SessionEntry.expiry_finalized. The session-expiry watcher still uses the flag to avoid re-running finalize/eviction on the same expired session; the new name reflects what it now actually gates. from_dict() reads 'expiry_finalized' first and falls back to the legacy 'memory_flushed' key so existing sessions.json files upgrade seamlessly (see the sketch below).

Supersedes #15631 and #15638.

Tested: 383 targeted tests pass across run_agent/, agent/, cli/, and gateway/ session-boundary suites. No behavior regressions; background memory review continues to handle persistent memory extraction on both CLI and gateway.
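For reference, a minimal sketch of the read-time backcompat described under "What this renames", assuming SessionEntry exposes a from_dict() constructor over the serialized dict; every name here other than expiry_finalized / memory_flushed is illustrative, not a verbatim copy of the repo code:

    @classmethod
    def from_dict(cls, data: dict) -> "SessionEntry":
        # Prefer the new key; fall back to the legacy 'memory_flushed' flag so
        # sessions.json files written before the rename upgrade seamlessly.
        finalized = data.get("expiry_finalized")
        if finalized is None:
            finalized = data.get("memory_flushed", False)
        return cls(
            session_id=data.get("session_id", ""),  # illustrative field only
            expiry_finalized=bool(finalized),
        )

On this sketch, new writes would emit only the 'expiry_finalized' key, so the legacy flag ages out of sessions.json as sessions are rewritten.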
238 lines · 10 KiB · Python
"""Regression tests for the universal "unsupported temperature" retry in
|
|
``agent.auxiliary_client``.
|
|
|
|
Auxiliary callers (context compression, session search,
|
|
web extract summarisation, etc.) hardcode ``temperature=0.3`` for historical
|
|
reasons. Several provider/model combinations reject ``temperature`` with a
|
|
400:
|
|
|
|
* OpenAI Responses (gpt-5/o-series reasoning models)
|
|
* Copilot Responses (reasoning models)
|
|
* OpenRouter reasoning models (gpt-5.5, some anthropic via OAI-compat)
|
|
* Anthropic Opus 4.7+ via OpenAI-compat endpoints
|
|
* Kimi/Moonshot (server-managed)
|
|
|
|
``_fixed_temperature_for_model`` catches Kimi up front, and
|
|
``build_chat_completion_kwargs`` drops temperature for Anthropic Opus 4.7+,
|
|
but the same backend can accept ``temperature`` for some models and reject
|
|
it for others (for example gpt-5.4 accepts but gpt-5.5 rejects on the same
|
|
endpoint). An allow/deny-list is not maintainable across providers.
|
|
|
|
The universal fix is reactive: when a call returns an
|
|
``Unsupported parameter: temperature`` 400, retry once without temperature.
|
|
These tests lock in that behaviour for both sync and async paths.
|
|
"""

from unittest.mock import patch, MagicMock, AsyncMock

import pytest

from agent.auxiliary_client import (
    call_llm,
    async_call_llm,
    _is_unsupported_temperature_error,
)


class TestIsUnsupportedTemperatureError:
    """The detector must match the phrasings providers actually return."""

    @pytest.mark.parametrize("message", [
        # OpenAI / Codex Responses
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'message': \"Unsupported parameter: 'temperature'\"}}",
        # Copilot / OpenAI error-code form
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        # OpenRouter-style
        "Provider returned error: temperature is not supported for this model",
        "this model does not support temperature",
        # Anthropic-style via OAI-compat
        "temperature: unknown parameter",
        # Some gateways
        "unrecognized request argument supplied: temperature",
    ])
    def test_matches_real_provider_messages(self, message):
        assert _is_unsupported_temperature_error(RuntimeError(message)) is True

    @pytest.mark.parametrize("message", [
        # Unrelated 400s must NOT trigger a silent-retry
        "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'...",
        "max_tokens is too large for this model",
        "Rate limit exceeded",
        "Connection reset by peer",
        # Temperature value error is a different class of problem
        "temperature must be between 0 and 2",
    ])
    def test_does_not_match_unrelated_errors(self, message):
        assert _is_unsupported_temperature_error(RuntimeError(message)) is False


def _dummy_response():
    # The real code calls _validate_llm_response which inspects
    # response.choices[0].message. The tests here patch that out, so
    # any sentinel object is fine.
    return {"ok": True}


class TestCallLlmUnsupportedTemperatureRetry:
    """``call_llm`` retries once without temperature and returns on success."""

    def _setup(self, first_exc):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create.side_effect = [first_exc, _dummy_response()]
        return client

    @pytest.mark.parametrize("error_message", [
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        "Provider error: this model does not support temperature",
    ])
    def test_retries_once_without_temperature(self, error_message):
        client = self._setup(RuntimeError(error_message))

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
        ):
            result = call_llm(
                task="compression",
                messages=[{"role": "user", "content": "remember this"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert client.chat.completions.create.call_count == 2
        first_kwargs = client.chat.completions.create.call_args_list[0].kwargs
        retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        # other kwargs preserved
        assert retry_kwargs["max_tokens"] == 500

    def test_non_temperature_400_does_not_retry_as_temperature(self):
        """Unrelated 400s (e.g. bad tool role) must not silently drop temp."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        non_temp_err = RuntimeError(
            "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'..."
        )
        client.chat.completions.create.side_effect = non_temp_err

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError, match="Invalid value"):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )
        # Should NOT have retried (non-temperature 400 doesn't match)
        assert client.chat.completions.create.call_count == 1

    def test_no_retry_when_temperature_not_in_kwargs(self):
        """If caller didn't send temperature, don't invent a temperature-retry."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        # Provider complains about temperature even though we didn't send it.
        # (Pathological but possible with misleading error text.) The guard
        # ``"temperature" in kwargs`` must prevent an unnecessary retry.
        err = RuntimeError("HTTP 400: Unsupported parameter: temperature")
        client.chat.completions.create.side_effect = err

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=None,  # explicit: no temperature sent
                    max_tokens=500,
                )
        assert client.chat.completions.create.call_count == 1


class TestAsyncCallLlmUnsupportedTemperatureRetry:
    """``async_call_llm`` mirror of the sync retry semantics."""

    @pytest.mark.asyncio
    async def test_async_retries_once_without_temperature(self):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create = AsyncMock(side_effect=[
            RuntimeError("HTTP 400: Unsupported parameter: temperature"),
            _dummy_response(),
        ])

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
        ):
            result = await async_call_llm(
                task="session_search",
                messages=[{"role": "user", "content": "query"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert client.chat.completions.create.await_count == 2
        first_kwargs = client.chat.completions.create.call_args_list[0].kwargs
        retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        assert retry_kwargs["max_tokens"] == 500

    @pytest.mark.asyncio
    async def test_async_non_temperature_400_does_not_retry(self):
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        client.chat.completions.create = AsyncMock(
            side_effect=RuntimeError("HTTP 400: Invalid value: 'tool'"),
        )

        with (
            patch("agent.auxiliary_client._resolve_task_provider_model",
                  return_value=("openai-codex", "gpt-5.5", None, None, None)),
            patch("agent.auxiliary_client._get_cached_client",
                  return_value=(client, "gpt-5.5")),
            patch("agent.auxiliary_client._validate_llm_response",
                  side_effect=lambda resp, _task: resp),
            patch("agent.auxiliary_client._try_payment_fallback",
                  return_value=None),
        ):
            with pytest.raises(RuntimeError, match="Invalid value"):
                await async_call_llm(
                    task="session_search",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )
        assert client.chat.completions.create.await_count == 1