Files
hermes-agent/tests/run_agent/test_compression_feasibility.py
kshitijk4poor 25d9fc8094 fix(flush_memories): always deduct headroom + resolve flush aux model + trim defence
Three fixes for flush_memories / compression context window overflow:

1. ALWAYS deduct headroom before comparing aux_context vs threshold.
   The fix in #15631 only deducted headroom inside the
   'if aux_context < threshold' branch — which never fires in the common
   same-model case (with threshold = context × 0.50, aux_context is
   always above the threshold). Headroom is now computed unconditionally
   and effective_limit = aux_context - headroom is compared against the
   threshold.

2. Also resolve flush_memories auxiliary model in the feasibility check.
   If the user configures separate auxiliary.flush_memories provider,
   the flush model's smaller context was unchecked.

3. Defence-in-depth trimming in flush_memories() for CLI /new and
   gateway resets that bypass preflight compression entirely.
2026-04-25 19:53:54 +05:30

534 lines
21 KiB
Python

"""Tests for _check_compression_model_feasibility() — warns when the
auxiliary compression model's context is smaller than the main model's
compression threshold.
Two-phase design:
1. __init__ → runs the check, prints via _vprint (CLI), stores warning
2. run_conversation (first call) → replays stored warning through
status_callback (gateway platforms)
"""
from unittest.mock import MagicMock, patch
import pytest
from run_agent import AIAgent
from agent.context_compressor import ContextCompressor
def _make_agent(
    *,
    compression_enabled: bool = True,
    threshold_percent: float = 0.50,
    main_context: int = 200_000,
) -> AIAgent:
    """Construct a bare AIAgent (bypassing __init__) wired to a mock compressor."""
    agent = AIAgent.__new__(AIAgent)
    # Minimal session identity + runtime fields the feasibility check touches.
    defaults = {
        "model": "test-main-model",
        "provider": "openrouter",
        "base_url": "https://openrouter.ai/api/v1",
        "api_key": "sk-test",
        "api_mode": "chat_completions",
        "quiet_mode": True,
        "log_prefix": "",
        "compression_enabled": compression_enabled,
        "_print_fn": None,
        "suppress_status_output": False,
        "_stream_consumers": [],
        "_executing_tools": False,
        "_mute_post_response": False,
        "status_callback": None,
        "tool_progress_callback": None,
        "_compression_warning": None,
        "_aux_compression_context_length_config": None,
        # Tools feed into the headroom calculation in
        # _check_compression_model_feasibility. Tests that want to assert
        # specific threshold values can override this.
        "tools": [],
    }
    for attr, value in defaults.items():
        setattr(agent, attr, value)
    mock_compressor = MagicMock(spec=ContextCompressor)
    mock_compressor.context_length = main_context
    mock_compressor.threshold_tokens = int(main_context * threshold_percent)
    agent.context_compressor = mock_compressor
    return agent
# ── Core warning logic ──────────────────────────────────────────────


@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
    """Aux model sits above the 64K floor but below the main threshold →
    the threshold is auto-lowered so compression still works this session."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    # Threshold is 100,000; the patched aux context is 80,000.
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "google/gemini-3-flash-preview")
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()
    assert len(emitted) == 1
    warning = emitted[0]
    assert "Compression model" in warning
    assert "80,000" in warning  # aux context
    assert "100,000" in warning  # old threshold
    assert "Auto-lowered" in warning
    # Actionable persistence guidance must be part of the message.
    for snippet in ("config.yaml", "auxiliary:", "compression:", "threshold:"):
        assert snippet in warning
    # Stored so gateway platforms can replay the warning later.
    assert agent._compression_warning is not None
    # The live compressor threshold was actually lowered, with
    # request-overhead headroom deducted (empty tools → ~12K headroom only).
    assert agent.context_compressor.threshold_tokens == 68_000
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
    """Aux context under the MINIMUM_CONTEXT_LENGTH (64K) hard floor →
    the session refuses to start (ValueError), mirroring the main-model
    rejection."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "tiny-aux-model")
    agent._emit_status = lambda msg: None
    with pytest.raises(ValueError) as exc_info:
        agent._check_compression_model_feasibility()
    message = str(exc_info.value)
    for fragment in ("tiny-aux-model", "32,768", "64,000", "below the minimum"):
        assert fragment in message
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_no_warning_when_aux_context_sufficient(mock_get_client, mock_ctx_len):
    """Aux model context at or above the main threshold → silence."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    # Threshold is 100,000; aux reports 200,000 — plenty of room.
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "google/gemini-2.5-flash")
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()
    assert emitted == []
    assert agent._compression_warning is None
def test_feasibility_check_passes_live_main_runtime():
    """The feasibility probe must describe the live session runtime."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    agent.model = "gpt-5.4"
    agent.provider = "openai-codex"
    agent.base_url = "https://chatgpt.com/backend-api/codex"
    agent.api_key = "codex-token"
    agent.api_mode = "codex_responses"
    aux_client = MagicMock()
    aux_client.base_url = "https://chatgpt.com/backend-api/codex"
    aux_client.api_key = "codex-token"
    with patch(
        "agent.auxiliary_client.get_text_auxiliary_client",
        return_value=(aux_client, "gpt-5.4"),
    ) as mock_get_client, patch(
        "agent.model_metadata.get_model_context_length", return_value=200_000
    ):
        agent._emit_status = lambda msg: None
        agent._check_compression_model_feasibility()
    # The client is resolved for both compression and flush_memories;
    # the compression call must carry the live runtime.
    expected_call = (("compression",), {"main_runtime": {
        "model": "gpt-5.4", "provider": "openai-codex",
        "base_url": "https://chatgpt.com/backend-api/codex",
        "api_key": "codex-token", "api_mode": "codex_responses",
    }})
    assert expected_call in mock_get_client.call_args_list
@patch("agent.model_metadata.get_model_context_length", return_value=1_000_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_feasibility_check_passes_config_context_length(mock_get_client, mock_ctx_len):
    """auxiliary.compression.context_length from config must reach
    get_model_context_length, so custom endpoints that lack /models still
    report the correct context window (fixes #8499)."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.85)
    agent._aux_compression_context_length_config = 1_000_000
    aux_client = MagicMock()
    aux_client.base_url = "http://custom-endpoint:8080/v1"
    aux_client.api_key = "sk-custom"
    mock_get_client.return_value = (aux_client, "custom/big-model")
    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()
    # The compression model is looked up first.
    first_call = mock_ctx_len.call_args_list[0]
    assert first_call == (
        ("custom/big-model",),
        {
            "base_url": "http://custom-endpoint:8080/v1",
            "api_key": "sk-custom",
            "config_context_length": 1_000_000,
            "provider": "openrouter",
        },
    )
@patch("agent.model_metadata.get_model_context_length", return_value=128_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_feasibility_check_ignores_invalid_context_length(mock_get_client, mock_ctx_len):
    """Non-integer context_length in config is silently ignored: the
    cached override here is None, and it is forwarded unchanged."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    agent._aux_compression_context_length_config = None
    aux_client = MagicMock()
    aux_client.base_url = "http://custom:8080/v1"
    aux_client.api_key = "sk-test"
    mock_get_client.return_value = (aux_client, "custom/model")
    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()
    first_call = mock_ctx_len.call_args_list[0]
    assert first_call == (
        ("custom/model",),
        {
            "base_url": "http://custom:8080/v1",
            "api_key": "sk-test",
            "config_context_length": None,
            "provider": "openrouter",
        },
    )
def test_init_feasibility_check_uses_aux_context_override_from_config():
    """Real AIAgent init should cache and forward auxiliary.compression.context_length."""

    # Lightweight ContextCompressor stand-in so the genuine AIAgent.__init__
    # can run without the real compressor's dependencies.
    class _StubCompressor:
        def __init__(self, *args, **kwargs):
            # Mirrors a 200K-context main model at a 50% threshold.
            self.context_length = 200_000
            self.threshold_tokens = 100_000
            self.threshold_percent = 0.50

        def get_tool_schemas(self):
            return []

        def on_session_start(self, *args, **kwargs):
            return None

    # User config declaring an explicit aux-compression context override.
    cfg = {
        "auxiliary": {
            "compression": {
                "context_length": 1_000_000,
            },
        },
    }
    mock_client = MagicMock()
    mock_client.base_url = "http://custom-endpoint:8080/v1"
    mock_client.api_key = "sk-custom"
    # Patch out config loading, tool discovery, the OpenAI client, the real
    # compressor, and the aux-client/context-length lookups the feasibility
    # check performs during __init__.
    with (
        patch("hermes_cli.config.load_config", return_value=cfg),
        patch("run_agent.get_tool_definitions", return_value=[]),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
        patch("run_agent.ContextCompressor", new=_StubCompressor),
        patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_client, "custom/big-model")),
        patch("agent.model_metadata.get_model_context_length", return_value=1_000_000) as mock_ctx_len,
    ):
        agent = AIAgent(
            api_key="test-key-1234567890",
            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
        )
    # __init__ cached the override from config…
    assert agent._aux_compression_context_length_config == 1_000_000
    # …and the first feasibility lookup (compression model) forwarded it.
    c0 = mock_ctx_len.call_args_list[0]
    assert c0.args == ("custom/big-model",)
    assert c0.kwargs["base_url"] == "http://custom-endpoint:8080/v1"
    assert c0.kwargs["config_context_length"] == 1_000_000
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warns_when_no_auxiliary_provider(mock_get_client):
    """No auxiliary provider configured → a warning is emitted and stored."""
    agent = _make_agent()
    mock_get_client.return_value = (None, None)
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()
    assert len(emitted) == 1
    assert "No auxiliary LLM provider" in emitted[0]
    assert agent._compression_warning is not None
def test_skips_check_when_compression_disabled():
    """Compression off → the feasibility check is a complete no-op."""
    agent = _make_agent(compression_enabled=False)
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()
    assert emitted == []
    assert agent._compression_warning is None
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_exception_does_not_crash(mock_get_client):
    """Exceptions inside the check are swallowed — startup never blocks."""
    agent = _make_agent()
    mock_get_client.side_effect = RuntimeError("boom")
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()  # must not raise
    # The failure is debug-logged only; the user sees nothing.
    assert emitted == []
@patch("agent.model_metadata.get_model_context_length", return_value=100_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_exact_threshold_boundary_triggers_headroom_correction(mock_get_client, mock_ctx_len):
    """Aux context exactly equal to the threshold still auto-corrects:
    flush_memories adds a system prompt + tool schema on top of the
    conversation messages, so the threshold has to drop below the raw
    aux context."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "test-model")
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()
    # 100K - headroom < 100K threshold → correction fires.
    assert len(emitted) == 1
    assert "Auto-lowered" in emitted[0]
    assert agent.context_compressor.threshold_tokens < 100_000
@patch("agent.model_metadata.get_model_context_length", return_value=99_999)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
    """One token under the threshold (but above the 64K hard floor) is
    enough to trigger the auto-correction."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "small-model")
    emitted = []
    agent._emit_status = emitted.append
    agent._check_compression_model_feasibility()
    assert len(emitted) == 1
    assert "small-model" in emitted[0]
    assert "Auto-lowered" in emitted[0]
    # 99,999 aux minus the ~12K empty-toolset headroom.
    assert agent.context_compressor.threshold_tokens == 87_999
# ── Headroom for system prompt + tool schemas ────────────────────────


@patch("agent.model_metadata.get_model_context_length", return_value=128_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_auto_lowered_threshold_reserves_headroom_for_tools_and_system(mock_get_client, mock_ctx_len):
    """When the aux context binds the threshold, the corrected value must
    leave room for the system prompt and tool schemas that auxiliary
    callers (compression summariser, flush_memories) prepend to the
    message list. Without headroom, a full-budget message window plus
    ~25K of system/tool overhead overflows the aux model with HTTP 400.
    Regression guard for the flush_memories-on-busy-toolset overflow path.
    """
    # Main context 200K at 70% → threshold 140K. Aux pins at 128K (below
    # threshold → triggers auto-correct).
    agent = _make_agent(main_context=200_000, threshold_percent=0.70)
    # Build a realistic, heavy tool-schema payload.
    heavy_tools = []
    for i in range(50):
        heavy_tools.append({
            "type": "function",
            "function": {
                "name": f"tool_{i}",
                "description": "x" * 200,
                "parameters": {"type": "object", "properties": {"arg": {"type": "string", "description": "y" * 120}}},
            },
        })
    agent.tools = heavy_tools
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "model-with-128k")
    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()
    new_threshold = agent.context_compressor.threshold_tokens
    # Strictly below the aux context: headroom was actually reserved.
    assert new_threshold < 128_000, (
        f"threshold {new_threshold} did not reserve headroom below aux=128,000 "
        f"— system prompt + tools would overflow the aux model"
    )
    # And never below the 64K hard floor.
    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    assert new_threshold >= MINIMUM_CONTEXT_LENGTH
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_headroom_floors_at_minimum_context(mock_get_client, mock_ctx_len):
    """A headroom subtraction that would land under the 64K floor clamps
    to 64K instead of refusing the session — the aux is still workable
    for a smaller message window.
    """
    # Aux at 80K with enough tools to push headroom past 16K: a naive
    # subtract would go below 64K; max(..., MINIMUM_CONTEXT_LENGTH) must
    # keep the session running.
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    bloated_tools = []
    for i in range(30):
        bloated_tools.append({
            "type": "function",
            "function": {
                "name": f"tool_{i}",
                "description": "z" * 2_000,  # fat descriptions
                "parameters": {},
            },
        })
    agent.tools = bloated_tools
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "small-aux-model")
    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()
    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    assert agent.context_compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH
# ── Two-phase: __init__ + run_conversation replay ───────────────────


@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
    """Phase 1 (__init__) stores the warning; phase 2 (_replay) pushes it
    through status_callback for gateway platforms."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "google/gemini-3-flash-preview")
    # Phase 1: __init__ path — _emit_status prints for CLI, callback absent.
    cli_messages = []
    agent._emit_status = cli_messages.append
    agent._check_compression_model_feasibility()
    assert len(cli_messages) == 1  # CLI got it
    assert agent._compression_warning is not None  # kept for replay
    # Phase 2: gateway wires callback post-init, then run_conversation replays.
    events = []
    agent.status_callback = lambda ev, msg: events.append((ev, msg))
    agent._replay_compression_warning()
    assert any(ev == "lifecycle" and "Auto-lowered" in msg for ev, msg in events)
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_no_replay_when_no_warning(mock_get_client, mock_ctx_len):
    """With nothing stored, _replay_compression_warning emits nothing."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "big-model")
    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()
    assert agent._compression_warning is None
    events = []
    agent.status_callback = lambda ev, msg: events.append((ev, msg))
    agent._replay_compression_warning()
    assert events == []
def test_replay_without_callback_is_noop():
    """A stored warning with status_callback=None must not raise."""
    agent = _make_agent()
    agent._compression_warning = "some warning"
    agent.status_callback = None
    agent._replay_compression_warning()  # should not raise
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
    """run_conversation replays the warning once, then clears
    _compression_warning so later turns stay quiet."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "small-model")
    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()
    assert agent._compression_warning is not None
    events = []
    agent.status_callback = lambda ev, msg: events.append((ev, msg))

    def _simulated_turn():
        # Mirrors the replay-then-clear sequence in run_conversation.
        if agent._compression_warning:
            agent._replay_compression_warning()
            agent._compression_warning = None

    _simulated_turn()
    assert len(events) == 1
    # Second turn — nothing left to replay.
    events.clear()
    _simulated_turn()
    assert events == []