mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(compression): reserve system+tools headroom when aux binds threshold (#15631)
When the auxiliary compression model's context is smaller than the main
model's compression threshold, _check_compression_model_feasibility
auto-lowers the session threshold. Previously it set:
new_threshold = aux_context
This let the raw message list grow to exactly aux_context tokens. But
compression and flush_memories actually send system_prompt + tool_schemas
+ messages to the aux model. With 50+ tools that overhead is 25-30K
tokens, so the full request overflowed aux with HTTP 400.
Subtract a headroom estimate from aux_context before setting the new
threshold: the actual tool-schema token count (from
estimate_request_tokens_rough) plus a 12K allowance for the system
prompt (not yet built at __init__ time) and flush-instruction overhead.
Clamp to MINIMUM_CONTEXT_LENGTH so the session still starts even with
an unusually heavy tool schema.
This fixes the 'flush_memories overflow on busy toolsets' path that
Teknium flagged — where main and aux can be nominally the same model
but still 400 because the threshold left no room for the request
overhead. Same fix also protects the normal compression summarisation
request on the same binding aux.
Tests: two new regression tests cover the headroom reservation and the
MINIMUM_CONTEXT_LENGTH floor. Two existing tests updated for the new
(lower) threshold values, since even an empty tools list now produces a
12K static headroom deduction.
This commit is contained in:
@@ -41,6 +41,9 @@ def _make_agent(
|
||||
agent.tool_progress_callback = None
|
||||
agent._compression_warning = None
|
||||
agent._aux_compression_context_length_config = None
|
||||
# Tools feed into the headroom calculation in _check_compression_model_feasibility.
|
||||
# Tests that want to assert specific threshold values can override this.
|
||||
agent.tools = []
|
||||
|
||||
compressor = MagicMock(spec=ContextCompressor)
|
||||
compressor.context_length = main_context
|
||||
@@ -82,8 +85,9 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
|
||||
assert "threshold:" in messages[0]
|
||||
# Warning stored for gateway replay
|
||||
assert agent._compression_warning is not None
|
||||
# Threshold on the live compressor was actually lowered
|
||||
assert agent.context_compressor.threshold_tokens == 80_000
|
||||
# Threshold on the live compressor was actually lowered, accounting for
|
||||
# the request-overhead headroom (empty tools list → ~12K headroom only).
|
||||
assert agent.context_compressor.threshold_tokens == 68_000
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||
@@ -339,7 +343,93 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
|
||||
assert len(messages) == 1
|
||||
assert "small-model" in messages[0]
|
||||
assert "Auto-lowered" in messages[0]
|
||||
assert agent.context_compressor.threshold_tokens == 99_999
|
||||
assert agent.context_compressor.threshold_tokens == 87_999
|
||||
|
||||
|
||||
# ── Headroom for system prompt + tool schemas ────────────────────────
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=128_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_auto_lowered_threshold_reserves_headroom_for_tools_and_system(mock_get_client, mock_ctx_len):
    """When aux context binds the threshold, new_threshold must leave room
    for the system prompt and tool schemas that auxiliary callers
    (compression summariser, flush_memories) prepend to the message list.

    Without headroom, a full-budget message window + ~25K system/tool
    overhead overflows the aux model with HTTP 400. Regression guard for
    the flush_memories-on-busy-toolset overflow path.
    """
    # Aux (128K) sits below the main model's 70%-of-200K = 140K threshold,
    # which is exactly the condition that triggers auto-correction.
    agent = _make_agent(main_context=200_000, threshold_percent=0.70)

    # Fifty moderately fat tool schemas approximate a busy production toolset.
    tool_schemas = []
    for i in range(50):
        tool_schemas.append(
            {
                "type": "function",
                "function": {
                    "name": f"tool_{i}",
                    "description": "x" * 200,
                    "parameters": {"type": "object", "properties": {"arg": {"type": "string", "description": "y" * 120}}},
                },
            }
        )
    agent.tools = tool_schemas

    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "model-with-128k")

    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()

    threshold = agent.context_compressor.threshold_tokens

    # Headroom must be strictly reserved below the aux context window.
    assert threshold < 128_000, (
        f"threshold {threshold} did not reserve headroom below aux=128,000 "
        f"— system prompt + tools would overflow the aux model"
    )
    # ...and it must never undercut the 64K hard floor.
    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    assert threshold >= MINIMUM_CONTEXT_LENGTH
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_headroom_floors_at_minimum_context(mock_get_client, mock_ctx_len):
    """If headroom subtraction would push below 64K floor, clamp to 64K
    rather than refusing the session — the aux is still workable for a
    smaller message window.
    """
    # With aux pinned at 80K and a heavy tool schema (headroom > 16K), a
    # naive subtraction would land below 64K. The
    # max(..., MINIMUM_CONTEXT_LENGTH) clamp must keep the session running.
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)

    def make_fat_tool(index):
        # Deliberately oversized descriptions inflate the schema token count.
        return {
            "type": "function",
            "function": {
                "name": f"tool_{index}",
                "description": "z" * 2_000,
                "parameters": {},
            },
        }

    agent.tools = [make_fat_tool(i) for i in range(30)]

    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "small-aux-model")

    agent._emit_status = lambda msg: None
    agent._check_compression_model_feasibility()

    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    assert agent.context_compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH
|
||||
|
||||
|
||||
# ── Two-phase: __init__ + run_conversation replay ───────────────────
|
||||
|
||||
Reference in New Issue
Block a user