mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 23:11:37 +08:00
Compare commits
3 Commits
codex-port
...
feat/cache
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1a90f3f10 | ||
|
|
55729670be | ||
|
|
119bad65fc |
@@ -7,7 +7,7 @@ protecting head and tail context.
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from agent.auxiliary_client import call_llm
|
from agent.auxiliary_client import call_llm
|
||||||
from agent.model_metadata import (
|
from agent.model_metadata import (
|
||||||
@@ -17,6 +17,24 @@ from agent.model_metadata import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Tool outputs that are always exempt from pruning (checked via _is_protected_tool).
# NOTE(review): presumably these outputs stay load-bearing for the rest of the
# session (memories, clarifications, file contents) — confirm against callers.
NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}
|
||||||
|
|
||||||
|
|
||||||
|
def _adaptive_prune_protect(context_length: int) -> int:
|
||||||
|
"""Scale the recent-tool-output protection window to the model context size."""
|
||||||
|
if context_length >= 500_000:
|
||||||
|
return 100_000
|
||||||
|
if context_length >= 128_000:
|
||||||
|
return 40_000
|
||||||
|
if context_length >= 64_000:
|
||||||
|
return 20_000
|
||||||
|
return 10_000
|
||||||
|
|
||||||
|
|
||||||
|
def _adaptive_prune_minimum(context_length: int) -> int:
|
||||||
|
"""Only prune when it reclaims a meaningful amount of prompt budget."""
|
||||||
|
return max(5_000, context_length // 20)
|
||||||
|
|
||||||
|
|
||||||
class ContextCompressor:
|
class ContextCompressor:
|
||||||
"""Compresses conversation context when approaching the model's context limit.
|
"""Compresses conversation context when approaching the model's context limit.
|
||||||
@@ -54,6 +72,10 @@ class ContextCompressor:
|
|||||||
self.last_total_tokens = 0
|
self.last_total_tokens = 0
|
||||||
|
|
||||||
self.summary_model = summary_model_override or ""
|
self.summary_model = summary_model_override or ""
|
||||||
|
self._prune_protect_tokens = _adaptive_prune_protect(self.context_length)
|
||||||
|
self._prune_minimum_tokens = _adaptive_prune_minimum(self.context_length)
|
||||||
|
self._prune_runway_tokens = max(self._prune_minimum_tokens, int(self.threshold_tokens * 0.15))
|
||||||
|
self._prune_target_tokens = max(0, self.threshold_tokens - self._prune_runway_tokens)
|
||||||
|
|
||||||
def update_from_response(self, usage: Dict[str, Any]):
|
def update_from_response(self, usage: Dict[str, Any]):
|
||||||
"""Update tracked token usage from API response."""
|
"""Update tracked token usage from API response."""
|
||||||
@@ -81,6 +103,58 @@ class ContextCompressor:
|
|||||||
"compression_count": self.compression_count,
|
"compression_count": self.compression_count,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _is_protected_tool(self, message: Dict[str, Any]) -> bool:
    """Return True when a tool output should never be pruned."""
    # Treat a missing/None tool name as the empty string, which is never protected.
    tool_name = message.get("name") or ""
    return tool_name in NEVER_PRUNE_TOOLS
|
||||||
|
|
||||||
|
def _prune_tool_outputs(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
    """Replace older middle tool outputs with compact placeholders.

    Only prunes tool outputs from the same middle region that would be eligible
    for summarization. The head/tail protected windows are left untouched.

    Returns:
        (messages_after_prune, chars_saved)
    """
    n_messages = len(messages)
    compress_start = self.protect_first_n
    compress_end = n_messages - self.protect_last_n
    if compress_start >= compress_end:
        # No middle region between the protected head and tail — nothing to do.
        return messages, 0

    # Re-align the region boundaries (same helpers the summary path uses),
    # then re-check in case alignment collapsed the region.
    compress_start = self._align_boundary_forward(messages, compress_start)
    compress_end = self._align_boundary_backward(messages, compress_end)
    if compress_start >= compress_end:
        return messages, 0

    # Shallow-copy each message so the caller's list is never mutated in place.
    pruned = [msg.copy() for msg in messages]
    chars_saved = 0
    recent_tool_tokens = 0

    # Walk newest → oldest so the most recent tool outputs consume the
    # protection budget first; only outputs beyond that budget get pruned.
    for i in range(compress_end - 1, compress_start - 1, -1):
        msg = pruned[i]
        if msg.get("role") != "tool" or self._is_protected_tool(msg):
            continue

        content = msg.get("content")
        # Non-string content (e.g. None) is stringified before measuring.
        content_text = content if isinstance(content, str) else str(content or "")
        # Rough heuristic: ~4 characters per token.
        token_estimate = max(1, len(content_text) // 4)

        if recent_tool_tokens < self._prune_protect_tokens:
            # Still inside the recent-output protection window; keep as-is.
            recent_tool_tokens += token_estimate
            continue

        original_len = len(content_text)
        placeholder = f"[Tool output pruned — was {original_len:,} chars]"
        pruned[i]["content"] = placeholder
        # max(0, ...) guards against a placeholder longer than tiny content.
        chars_saved += max(0, original_len - len(placeholder))

    tokens_saved = chars_saved // 4
    if tokens_saved < self._prune_minimum_tokens:
        # Not worth a transcript mutation for so little reclaimed budget;
        # return the original list untouched.
        return messages, 0

    return pruned, chars_saved
|
||||||
|
|
||||||
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
|
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
|
||||||
"""Generate a concise summary of conversation turns.
|
"""Generate a concise summary of conversation turns.
|
||||||
|
|
||||||
@@ -267,13 +341,49 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
|
|||||||
if compress_start >= compress_end:
|
if compress_start >= compress_end:
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
turns_to_summarize = messages[compress_start:compress_end]
|
display_tokens = current_tokens if current_tokens is not None else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
|
||||||
display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
|
|
||||||
|
|
||||||
if not self.quiet_mode:
|
if not self.quiet_mode:
|
||||||
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
|
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
|
||||||
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
|
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
|
||||||
|
|
||||||
|
pruned_messages, chars_saved = self._prune_tool_outputs(messages)
|
||||||
|
if chars_saved > 0:
|
||||||
|
pruned_tokens = estimate_messages_tokens_rough(pruned_messages)
|
||||||
|
tokens_saved_phase1 = max(0, display_tokens - pruned_tokens)
|
||||||
|
if not self.quiet_mode:
|
||||||
|
print(
|
||||||
|
f" ✂️ Phase 1 (prune): removed {chars_saved:,} chars of old tool outputs "
|
||||||
|
f"(~{tokens_saved_phase1:,} tokens saved)"
|
||||||
|
)
|
||||||
|
if pruned_tokens <= self._prune_target_tokens:
|
||||||
|
self.compression_count += 1
|
||||||
|
pruned_messages = self._sanitize_tool_pairs(pruned_messages)
|
||||||
|
if not self.quiet_mode:
|
||||||
|
print(
|
||||||
|
f" ✅ Phase 1 sufficient: {n_messages} → {len(pruned_messages)} messages, "
|
||||||
|
f"now {pruned_tokens:,} tokens"
|
||||||
|
)
|
||||||
|
print(f" 💡 Compression #{self.compression_count} complete (prune only — no LLM call needed)")
|
||||||
|
return pruned_messages
|
||||||
|
if not self.quiet_mode and pruned_tokens < self.threshold_tokens:
|
||||||
|
print(
|
||||||
|
f" ↪️ Phase 1 recovered tokens but not enough runway "
|
||||||
|
f"({pruned_tokens:,} > target {self._prune_target_tokens:,}); continuing to compaction"
|
||||||
|
)
|
||||||
|
messages = pruned_messages
|
||||||
|
n_messages = len(messages)
|
||||||
|
compress_start = self.protect_first_n
|
||||||
|
compress_end = n_messages - self.protect_last_n
|
||||||
|
if compress_start >= compress_end:
|
||||||
|
return messages
|
||||||
|
compress_start = self._align_boundary_forward(messages, compress_start)
|
||||||
|
compress_end = self._align_boundary_backward(messages, compress_end)
|
||||||
|
if compress_start >= compress_end:
|
||||||
|
return messages
|
||||||
|
|
||||||
|
turns_to_summarize = messages[compress_start:compress_end]
|
||||||
|
|
||||||
if not self.quiet_mode:
|
if not self.quiet_mode:
|
||||||
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")
|
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")
|
||||||
|
|
||||||
|
|||||||
192
docs/plans/2026-03-14-cache-aware-context-compaction.md
Normal file
192
docs/plans/2026-03-14-cache-aware-context-compaction.md
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# Cache-Aware Context Compaction Design Note
|
||||||
|
|
||||||
|
> For Hermes: this note is a design/implementation sketch for revisiting prune-first compaction without optimizing token spend at the expense of prompt-cache stability.
|
||||||
|
|
||||||
|
Goal: reduce compression cost while keeping cache-break frequency as low as possible.
|
||||||
|
|
||||||
|
Architecture: keep Hermes' current invariant that conversation history is only mutated during context compression, then make prune-first compaction conservative enough that it only short-circuits when it buys meaningful runway. If pruning only gets us barely below threshold, fall through to the existing summary compaction immediately.
|
||||||
|
|
||||||
|
Tech Stack: `agent/context_compressor.py`, existing `call_llm()`-based summary path, pytest coverage in `tests/agent/test_context_compressor.py`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Baseline behavior on current main
|
||||||
|
|
||||||
|
Today Hermes behaves like this:
|
||||||
|
|
||||||
|
1. Prompt crosses the compression threshold.
|
||||||
|
2. We mutate transcript history once by summarizing the middle region with an LLM.
|
||||||
|
3. We preserve role alternation and tool-call/tool-result integrity.
|
||||||
|
4. We continue the conversation from the compressed transcript.
|
||||||
|
|
||||||
|
This is expensive in two ways:
|
||||||
|
- an auxiliary summary call is often required
|
||||||
|
- the entire compressed middle region is rewritten even when the real problem was just a few huge old tool outputs
|
||||||
|
|
||||||
|
But it has one strong cache property:
|
||||||
|
- it tends to reclaim a lot of headroom per compression event, so the next compression is usually farther away
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Why naive prune-first compaction is not enough
|
||||||
|
|
||||||
|
A naive prune-first policy says:
|
||||||
|
- prune old tool outputs
|
||||||
|
- if prompt is now below threshold, stop
|
||||||
|
|
||||||
|
This improves per-event token cost, but it can hurt cache economics:
|
||||||
|
- prune-only may reclaim less headroom than full compaction
|
||||||
|
- smaller headroom means the next compression may happen sooner
|
||||||
|
- each compression event is still a cache-breaking transcript mutation
|
||||||
|
|
||||||
|
So there is a real failure mode:
|
||||||
|
- fewer tokens per compression
|
||||||
|
- more compression events overall
|
||||||
|
- worse cache break cadence
|
||||||
|
|
||||||
|
That is exactly the tradeoff we want to avoid.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Cache-aware principle
|
||||||
|
|
||||||
|
Prune-first compaction should only short-circuit when it buys real runway, not when it merely dips under threshold.
|
||||||
|
|
||||||
|
Rule of thumb:
|
||||||
|
- compression frequency matters as much as compression size
|
||||||
|
- a smaller mutation is not automatically cheaper if it causes another mutation a few turns later
|
||||||
|
|
||||||
|
So the design target is:
|
||||||
|
- fewer auxiliary summary calls
|
||||||
|
- without materially increasing compression frequency
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Conservative prototype policy
|
||||||
|
|
||||||
|
The conservative prototype keeps all existing compression invariants and only changes the acceptance rule for prune-only compaction.
|
||||||
|
|
||||||
|
### Phase 1: prune old middle tool outputs
|
||||||
|
|
||||||
|
Only prune tool outputs that are:
|
||||||
|
- in the compressible middle region
|
||||||
|
- not in protected head/tail windows
|
||||||
|
- not from protected tools (`read_file`, `memory`, `clarify`, `skill_view`, `todo`)
|
||||||
|
|
||||||
|
### Phase 2: require a low-water mark
|
||||||
|
|
||||||
|
Do not accept prune-only just because it lands below threshold.
|
||||||
|
|
||||||
|
Instead require:
|
||||||
|
- `post_prune_tokens <= prune_target_tokens`
|
||||||
|
|
||||||
|
Where:
|
||||||
|
- `prune_runway_tokens = max(prune_minimum_tokens, 15% of threshold_tokens)`
|
||||||
|
- `prune_target_tokens = threshold_tokens - prune_runway_tokens`
|
||||||
|
|
||||||
|
Interpretation:
|
||||||
|
- pruning must get us comfortably below threshold
|
||||||
|
- otherwise we immediately fall through to normal LLM summary compaction
|
||||||
|
|
||||||
|
Why this helps:
|
||||||
|
- protects cache by avoiding "micro-compactions" that would be followed by another compression shortly after
|
||||||
|
- still avoids the summary call when pruning truly buys useful runway
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. What the prototype currently does
|
||||||
|
|
||||||
|
The prototype branch currently:
|
||||||
|
- keeps prune-first compaction
|
||||||
|
- adds the low-water / runway requirement above
|
||||||
|
- preserves current main behavior for summary role alternation
|
||||||
|
- preserves the centralized `call_llm()` summary path
|
||||||
|
- keeps head/tail and tool-call/result integrity handling unchanged
|
||||||
|
|
||||||
|
This means the branch is no longer optimizing only for token reduction per event; it is explicitly biased toward fewer compression events.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Metrics we should evaluate before merging any future version
|
||||||
|
|
||||||
|
A serious cache-aware review should measure all of these, not just token savings:
|
||||||
|
|
||||||
|
1. Compression events per 100 conversation turns
|
||||||
|
2. Average turns between compressions
|
||||||
|
3. Auxiliary summary calls per session
|
||||||
|
4. Average tokens reclaimed per compression event
|
||||||
|
5. Total prompt+auxiliary tokens spent over a long session
|
||||||
|
6. Earliest changed message index during compression
|
||||||
|
7. Ratio of prune-only compressions to full summary compressions
|
||||||
|
|
||||||
|
The most important comparison is:
|
||||||
|
- baseline main vs conservative prune-first
|
||||||
|
|
||||||
|
Success is not:
|
||||||
|
- "fewer tokens in one compression"
|
||||||
|
|
||||||
|
Success is:
|
||||||
|
- "equal or better total session cost without increasing compression/cache-break cadence in a meaningful way"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Better long-term directions
|
||||||
|
|
||||||
|
If we want a stronger cache story than conservative prune-first, these are the real next-step options:
|
||||||
|
|
||||||
|
### A. Insertion-time trimming
|
||||||
|
|
||||||
|
Best cache-preserving option.
|
||||||
|
|
||||||
|
Idea:
|
||||||
|
- trim or summarize giant tool outputs before they become durable transcript history
|
||||||
|
- keep a compact representation from the start instead of mutating history later
|
||||||
|
|
||||||
|
Pros:
|
||||||
|
- avoids later cache-breaking rewrites for those blobs
|
||||||
|
- makes transcript size stable earlier
|
||||||
|
|
||||||
|
Cons:
|
||||||
|
- more invasive design change
|
||||||
|
- requires careful UX and provenance handling
|
||||||
|
|
||||||
|
### B. Provider/backend-aware compaction policy
|
||||||
|
|
||||||
|
Different providers may reward:
|
||||||
|
- preserving a longer stable prefix
|
||||||
|
- or simply reducing total prompt size
|
||||||
|
|
||||||
|
We may eventually want backend-specific heuristics for:
|
||||||
|
- prune runway targets
|
||||||
|
- compression thresholds
|
||||||
|
- when to prefer summary vs pruning
|
||||||
|
|
||||||
|
### C. Explicit compression telemetry
|
||||||
|
|
||||||
|
If compression remains a core feature, `ContextCompressor` should expose enough telemetry to understand real-world cadence:
|
||||||
|
- prune-only count
|
||||||
|
- full summary count
|
||||||
|
- average recovered tokens
|
||||||
|
- last compression mode
|
||||||
|
|
||||||
|
This is not required for the conservative prototype, but it would make future tuning much easier.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Recommended next steps
|
||||||
|
|
||||||
|
1. Keep the conservative prototype local for review.
|
||||||
|
2. Run targeted tests plus long-session manual trials.
|
||||||
|
3. If it looks promising, add telemetry before opening another PR.
|
||||||
|
4. If cache stability remains the top priority, pursue insertion-time trimming instead of further read-time pruning tweaks.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Review question for Teknium
|
||||||
|
|
||||||
|
The key product question is:
|
||||||
|
|
||||||
|
"Should Hermes optimize compression primarily for per-event token cost, or for minimizing the number of transcript mutations over the lifetime of a session?"
|
||||||
|
|
||||||
|
This prototype assumes the answer is:
|
||||||
|
- prioritize fewer transcript mutations unless pruning buys substantial runway.
|
||||||
@@ -314,3 +314,143 @@ class TestCompressWithClient:
|
|||||||
for msg in result:
|
for msg in result:
|
||||||
if msg.get("role") == "tool" and msg.get("tool_call_id"):
|
if msg.get("role") == "tool" and msg.get("tool_call_id"):
|
||||||
assert msg["tool_call_id"] in called_ids
|
assert msg["tool_call_id"] in called_ids
|
||||||
|
|
||||||
|
|
||||||
|
class TestPruneToolOutputs:
    """Tests for ContextCompressor._prune_tool_outputs and the prune-only path."""

    def _make_compressor(self, *, context_length=128000, protect_first_n=2, protect_last_n=2):
        # Pin the model context length so threshold/protection math is deterministic.
        with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
            return ContextCompressor(
                model="test/model",
                threshold_percent=0.50,
                protect_first_n=protect_first_n,
                protect_last_n=protect_last_n,
                quiet_mode=True,
            )

    def test_prune_replaces_old_middle_tool_outputs(self):
        """Older middle tool outputs are pruned; the most recent one survives."""
        c = self._make_compressor(protect_last_n=1)
        # One output alone fills the recent-output protection budget
        # (~4 chars per token), so only the newer of the two is kept intact.
        big_content = "x" * (c._prune_protect_tokens * 4)
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": big_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": big_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        pruned, chars_saved = c._prune_tool_outputs(messages)

        assert chars_saved > 0
        # Older tool output replaced by a placeholder; newer one untouched.
        assert pruned[3]["content"].startswith("[Tool output pruned")
        assert pruned[5]["content"] == big_content

    def test_protected_tools_are_never_pruned(self):
        """Outputs from protected tools survive even when oversized."""
        c = self._make_compressor()
        big_content = "x" * (c._prune_protect_tokens * 8)
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": big_content, "name": "read_file"},
            {"role": "assistant", "content": "middle"},
            {"role": "tool", "content": big_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        pruned, _ = c._prune_tool_outputs(messages)
        # read_file is in NEVER_PRUNE_TOOLS, so its content is preserved.
        read_file_msg = next(msg for msg in pruned if msg.get("name") == "read_file")
        assert read_file_msg["content"] == big_content

    def test_prune_only_path_skips_summary_call_when_sufficient(self):
        """compress() may finish on pruning alone, without an LLM summary call."""
        c = self._make_compressor(protect_first_n=2, protect_last_n=1)
        huge_content = "x" * 180000
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        # Any summary attempt raises, proving the prune-only short-circuit fired.
        with patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
            result = c.compress(messages, current_tokens=200000)

        assert result[3]["content"].startswith("[Tool output pruned")
        assert result[5]["content"] == huge_content
        # A prune-only compaction still counts as a compression event.
        assert c.compression_count == 1

    def test_prune_does_not_touch_protected_tail_messages(self):
        """Messages inside the protected tail window keep their content."""
        c = self._make_compressor(context_length=128000, protect_first_n=2, protect_last_n=3)
        huge_content = "x" * (c._prune_protect_tokens * 8)
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail assistant"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "latest"},
        ]

        pruned, _ = c._prune_tool_outputs(messages)

        # The last protect_last_n=3 messages are outside the prunable region.
        assert pruned[-2]["content"] == huge_content
        assert pruned[-1]["content"] == "latest"
|
||||||
|
|
||||||
|
|
||||||
|
class TestPruneAcceptancePolicy:
    """Tests for the low-water-mark rule that gates prune-only compaction."""

    def _make_compressor(self, *, context_length=128000):
        # Pin the context-length lookup so the prune target is deterministic.
        with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
            return ContextCompressor(
                model="test/model",
                threshold_percent=0.50,
                protect_first_n=2,
                protect_last_n=1,
                quiet_mode=True,
            )

    def test_prune_near_threshold_still_falls_back_to_summary(self):
        """Landing just under threshold is not enough; summary compaction runs."""
        c = self._make_compressor()
        huge_content = "x" * 180000
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compacted"

        # Post-prune estimate (62k) is below the 64k threshold but above the
        # prune target, so compaction falls through to the LLM summary path.
        with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=62000), \
             patch("agent.context_compressor.call_llm", return_value=mock_response):
            result = c.compress(messages, current_tokens=68000)

        assert any("CONTEXT SUMMARY" in (msg.get("content") or "") for msg in result)

    def test_prune_only_is_allowed_when_it_buys_real_runway(self):
        """Pruning well below the target accepts prune-only, skipping summary."""
        c = self._make_compressor()
        huge_content = "x" * 180000
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        # Post-prune estimate (48k) is comfortably below the prune target, so
        # any summary attempt would raise — proving the short-circuit.
        with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=48000), \
             patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
            result = c.compress(messages, current_tokens=68000)

        assert result[3]["content"].startswith("[Tool output pruned")
        assert result[5]["content"] == huge_content
|
||||||
|
|||||||
Reference in New Issue
Block a user