Compare commits

...

3 Commits

Author SHA1 Message Date
teknium1
a1a90f3f10 feat: require runway before prune-only compaction
Make prune-first compression cache-aware by only accepting prune-only
compaction when it gets comfortably below threshold. If pruning merely
dips under threshold, fall through to the existing summary compaction
so we avoid frequent near-threshold recompressions.

Tests cover both the conservative fallback and the prune-only fast path.
2026-03-13 21:46:09 -07:00
teknium1
55729670be docs: add cache-aware compaction design note 2026-03-13 21:46:03 -07:00
teyrebaz33
119bad65fc feat: prune old tool outputs before context compaction
Port the useful part of PR #588 onto current main without regressing
summary role alternation or the centralized call_llm-based summary path.

This adds a prune-first compression pass that:
- protects recent tool outputs with adaptive thresholds
- never prunes key tool outputs like read_file/memory/clarify
- skips the LLM summary call entirely when pruning alone is enough
- keeps head/tail protected windows untouched

Tests cover prune-only compaction, protected tools, and tail protection.

Co-authored-by: teyrebaz33 <hakanerten02@hotmail.com>
2026-03-13 21:21:28 -07:00
3 changed files with 445 additions and 3 deletions

agent/context_compressor.py

@@ -7,7 +7,7 @@ protecting head and tail context.
import logging
import os
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
from agent.auxiliary_client import call_llm
from agent.model_metadata import (
@@ -17,6 +17,24 @@ from agent.model_metadata import (
logger = logging.getLogger(__name__)
NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}
def _adaptive_prune_protect(context_length: int) -> int:
"""Scale the recent-tool-output protection window to the model context size."""
if context_length >= 500_000:
return 100_000
if context_length >= 128_000:
return 40_000
if context_length >= 64_000:
return 20_000
return 10_000
def _adaptive_prune_minimum(context_length: int) -> int:
"""Only prune when it reclaims a meaningful amount of prompt budget."""
return max(5_000, context_length // 20)
class ContextCompressor:
"""Compresses conversation context when approaching the model's context limit.
@@ -54,6 +72,10 @@ class ContextCompressor:
self.last_total_tokens = 0
self.summary_model = summary_model_override or ""
self._prune_protect_tokens = _adaptive_prune_protect(self.context_length)
self._prune_minimum_tokens = _adaptive_prune_minimum(self.context_length)
self._prune_runway_tokens = max(self._prune_minimum_tokens, int(self.threshold_tokens * 0.15))
self._prune_target_tokens = max(0, self.threshold_tokens - self._prune_runway_tokens)
def update_from_response(self, usage: Dict[str, Any]):
"""Update tracked token usage from API response."""
@@ -81,6 +103,58 @@ class ContextCompressor:
"compression_count": self.compression_count,
}
def _is_protected_tool(self, message: Dict[str, Any]) -> bool:
"""Return True when a tool output should never be pruned."""
return (message.get("name") or "") in NEVER_PRUNE_TOOLS
def _prune_tool_outputs(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
"""Replace older middle tool outputs with compact placeholders.
Only prunes tool outputs from the same middle region that would be eligible
for summarization. The head/tail protected windows are left untouched.
Returns:
(messages_after_prune, chars_saved)
"""
n_messages = len(messages)
compress_start = self.protect_first_n
compress_end = n_messages - self.protect_last_n
if compress_start >= compress_end:
return messages, 0
compress_start = self._align_boundary_forward(messages, compress_start)
compress_end = self._align_boundary_backward(messages, compress_end)
if compress_start >= compress_end:
return messages, 0
pruned = [msg.copy() for msg in messages]
chars_saved = 0
recent_tool_tokens = 0
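# Walk the middle region newest-to-oldest so the most recent tool outputs
# fill the protection budget before anything older becomes prunable.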
for i in range(compress_end - 1, compress_start - 1, -1):
msg = pruned[i]
if msg.get("role") != "tool" or self._is_protected_tool(msg):
continue
content = msg.get("content")
content_text = content if isinstance(content, str) else str(content or "")
token_estimate = max(1, len(content_text) // 4)
if recent_tool_tokens < self._prune_protect_tokens:
recent_tool_tokens += token_estimate
continue
original_len = len(content_text)
placeholder = f"[Tool output pruned — was {original_len:,} chars]"
pruned[i]["content"] = placeholder
chars_saved += max(0, original_len - len(placeholder))
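# Skip the whole prune if it would not reclaim a meaningful slice of the prompt budget.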
tokens_saved = chars_saved // 4
if tokens_saved < self._prune_minimum_tokens:
return messages, 0
return pruned, chars_saved
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
"""Generate a concise summary of conversation turns.
@@ -267,13 +341,49 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
if compress_start >= compress_end:
return messages
turns_to_summarize = messages[compress_start:compress_end]
-display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
+display_tokens = current_tokens if current_tokens is not None else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
if not self.quiet_mode:
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
pruned_messages, chars_saved = self._prune_tool_outputs(messages)
if chars_saved > 0:
pruned_tokens = estimate_messages_tokens_rough(pruned_messages)
tokens_saved_phase1 = max(0, display_tokens - pruned_tokens)
if not self.quiet_mode:
print(
f" ✂️ Phase 1 (prune): removed {chars_saved:,} chars of old tool outputs "
f"(~{tokens_saved_phase1:,} tokens saved)"
)
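# Low-water mark: accept prune-only compaction only when it lands comfortably
# below threshold, not merely under it.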
if pruned_tokens <= self._prune_target_tokens:
self.compression_count += 1
pruned_messages = self._sanitize_tool_pairs(pruned_messages)
if not self.quiet_mode:
print(
f" ✅ Phase 1 sufficient: {n_messages}{len(pruned_messages)} messages, "
f"now {pruned_tokens:,} tokens"
)
print(f" 💡 Compression #{self.compression_count} complete (prune only — no LLM call needed)")
return pruned_messages
if not self.quiet_mode and pruned_tokens < self.threshold_tokens:
print(
f" ↪️ Phase 1 recovered tokens but not enough runway "
f"({pruned_tokens:,} > target {self._prune_target_tokens:,}); continuing to compaction"
)
messages = pruned_messages
n_messages = len(messages)
compress_start = self.protect_first_n
compress_end = n_messages - self.protect_last_n
if compress_start >= compress_end:
return messages
compress_start = self._align_boundary_forward(messages, compress_start)
compress_end = self._align_boundary_backward(messages, compress_end)
if compress_start >= compress_end:
return messages
turns_to_summarize = messages[compress_start:compress_end]
if not self.quiet_mode:
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")

View File

@@ -0,0 +1,192 @@
# Cache-Aware Context Compaction Design Note
> For Hermes: this note is a design/implementation sketch for revisiting prune-first compaction without optimizing token spend at the expense of prompt-cache stability.
Goal: reduce compression cost while keeping cache-break frequency as low as possible.
Architecture: keep Hermes' current invariant that conversation history is only mutated during context compression, then make prune-first compaction conservative enough that it only short-circuits when it buys meaningful runway. If pruning only gets us barely below threshold, fall through to the existing summary compaction immediately.
Tech Stack: `agent/context_compressor.py`, existing `call_llm()`-based summary path, pytest coverage in `tests/agent/test_context_compressor.py`.
---
## 1. Baseline behavior on current main
Today Hermes behaves like this:
1. Prompt crosses the compression threshold.
2. We mutate transcript history once by summarizing the middle region with an LLM.
3. We preserve role alternation and tool-call/tool-result integrity.
4. We continue the conversation from the compressed transcript.
This is expensive in two ways:
- an auxiliary summary call is often required
- the entire compressed middle region is rewritten even when the real problem was just a few huge old tool outputs
But it has one strong cache property:
- it tends to reclaim a lot of headroom per compression event, so the next compression is usually farther away
---
## 2. Why naive prune-first compaction is not enough
A naive prune-first policy says:
- prune old tool outputs
- if prompt is now below threshold, stop
This improves per-event token cost, but it can hurt cache economics:
- prune-only may reclaim less headroom than full compaction
- smaller headroom means the next compression may happen sooner
- each compression event is still a cache-breaking transcript mutation
So there is a real failure mode:
- fewer tokens per compression
- more compression events overall
- worse cache-break cadence
That is exactly the tradeoff we want to avoid.
---
## 3. Cache-aware principle
Prune-first compaction should only short-circuit when it buys real runway, not when it merely dips under threshold.
Rule of thumb:
- compression frequency matters as much as compression size
- a smaller mutation is not automatically cheaper if it causes another mutation a few turns later
So the design target is:
- fewer auxiliary summary calls
- without materially increasing compression frequency
---
## 4. Conservative prototype policy
The conservative prototype keeps all existing compression invariants and only changes the acceptance rule for prune-only compaction.
### Phase 1: prune old middle tool outputs
Only prune tool outputs that are:
- in the compressible middle region
- not in protected head/tail windows
- not from protected tools (`read_file`, `memory`, `clarify`, `skill_view`, `todo`)
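
A minimal restatement of this eligibility test as a predicate (the helper name here is illustrative; on the branch the logic lives inside `_prune_tool_outputs()` and `_is_protected_tool()`):

```python
# Illustrative sketch, not the branch's exact code.
NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}


def eligible_for_prune(msg: dict, index: int, compress_start: int, compress_end: int) -> bool:
    if not (compress_start <= index < compress_end):
        return False  # head/tail protected windows are never touched
    if msg.get("role") != "tool":
        return False  # only tool outputs are pruned, never user/assistant turns
    return (msg.get("name") or "") not in NEVER_PRUNE_TOOLS
```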
### Phase 2: require a low-water mark
Do not accept prune-only just because it lands below threshold.
Instead require:
- `post_prune_tokens <= prune_target_tokens`
Where:
- `prune_runway_tokens = max(prune_minimum_tokens, 0.15 * threshold_tokens)`
- `prune_target_tokens = threshold_tokens - prune_runway_tokens`
Interpretation:
- pruning must get us comfortably below threshold
- otherwise we immediately fall through to normal LLM summary compaction
Why this helps:
- protects cache by avoiding "micro-compactions" that would be followed by another compression shortly after
- still avoids the summary call when pruning truly buys useful runway
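
Worked through with the branch's adaptive constants (128k-context model, 50% threshold), the rule plays out like this:

```python
# Worked example of the acceptance rule, using the constants the branch derives.
context_length = 128_000
threshold_tokens = int(context_length * 0.50)                                  # 64,000
prune_minimum_tokens = max(5_000, context_length // 20)                        # 6,400
prune_runway_tokens = max(prune_minimum_tokens, int(threshold_tokens * 0.15))  # 9,600
prune_target_tokens = threshold_tokens - prune_runway_tokens                   # 54,400


def accept_prune_only(post_prune_tokens: int) -> bool:
    # Prune-only wins only when it lands comfortably below threshold.
    return post_prune_tokens <= prune_target_tokens


assert not accept_prune_only(62_000)  # barely under threshold -> fall through to summary
assert accept_prune_only(48_000)      # real runway -> skip the LLM summary call
```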
---
## 5. What the prototype currently does
The prototype branch currently:
- keeps prune-first compaction
- adds the low-water / runway requirement above
- preserves current main behavior for summary role alternation
- preserves the centralized `call_llm()` summary path
- keeps head/tail and tool-call/result integrity handling unchanged
This means the branch is no longer optimizing only for token reduction per event; it is explicitly biased toward fewer compression events.
---
## 6. Metrics we should evaluate before merging any future version
A serious cache-aware review should measure all of these, not just token savings:
1. Compression events per 100 conversation turns
2. Average turns between compressions
3. Auxiliary summary calls per session
4. Average tokens reclaimed per compression event
5. Total prompt+auxiliary tokens spent over a long session
6. Earliest changed message index during compression
7. Ratio of prune-only compressions to full summary compressions
The most important comparison is:
- baseline main vs conservative prune-first
Success is not:
- "fewer tokens in one compression"
Success is:
- "equal or better total session cost without increasing compression/cache-break cadence in a meaningful way"
---
## 7. Better long-term directions
If we want a stronger cache story than conservative prune-first, these are the real next-step options:
### A. Insertion-time trimming
Best cache-preserving option.
Idea:
- trim or summarize giant tool outputs before they become durable transcript history
- keep a compact representation from the start instead of mutating history later
Pros:
- avoids later cache-breaking rewrites for those blobs
- makes transcript size stable earlier
Cons:
- more invasive design change
- requires careful UX and provenance handling
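
A rough sketch of the idea (hypothetical helper and budget; nothing like this exists on the branch):

```python
MAX_DURABLE_TOOL_CHARS = 20_000  # hypothetical budget; would need tuning per tool


def trim_tool_output_at_insert(content: str) -> str:
    """Hypothetical: cap a tool output before it becomes durable transcript history."""
    if len(content) <= MAX_DURABLE_TOOL_CHARS:
        return content
    kept = MAX_DURABLE_TOOL_CHARS // 2
    omitted = len(content) - 2 * kept
    # Keep head and tail so provenance survives; note exactly what was dropped.
    return f"{content[:kept]}\n[... {omitted:,} chars trimmed at insertion ...]\n{content[-kept:]}"
```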
### B. Provider/backend-aware compaction policy
Different providers may reward:
- preserving a longer stable prefix
- or simply reducing total prompt size
We may eventually want backend-specific heuristics for:
- prune runway targets
- compression thresholds
- when to prefer summary vs pruning
### C. Explicit compression telemetry
If compression remains a core feature, `ContextCompressor` should expose enough telemetry to understand real-world cadence:
- prune-only count
- full summary count
- average recovered tokens
- last compression mode
This is not required for the conservative prototype, but it would make future tuning much easier.
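
One possible shape for that telemetry, sketched here with illustrative field names:

```python
from dataclasses import dataclass


@dataclass
class CompressionTelemetry:
    # Hypothetical counters ContextCompressor could expose; names are illustrative.
    prune_only_count: int = 0
    full_summary_count: int = 0
    total_tokens_recovered: int = 0
    last_compression_mode: str = ""

    def record(self, mode: str, tokens_recovered: int) -> None:
        # mode is "prune_only" or "summary"; counters drive the cadence metrics in section 6.
        if mode == "prune_only":
            self.prune_only_count += 1
        else:
            self.full_summary_count += 1
        self.total_tokens_recovered += tokens_recovered
        self.last_compression_mode = mode

    @property
    def avg_tokens_recovered(self) -> float:
        events = self.prune_only_count + self.full_summary_count
        return self.total_tokens_recovered / max(1, events)
```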
---
## 8. Recommended next steps
1. Keep the conservative prototype local for review.
2. Run targeted tests plus long-session manual trials.
3. If it looks promising, add telemetry before opening another PR.
4. If cache stability remains the top priority, pursue insertion-time trimming instead of further read-time pruning tweaks.
---
## 9. Review question for Teknium
The key product question is:
"Should Hermes optimize compression primarily for per-event token cost, or for minimizing the number of transcript mutations over the lifetime of a session?"
This prototype assumes the answer is:
- prioritize fewer transcript mutations unless pruning buys substantial runway.

tests/agent/test_context_compressor.py

@@ -314,3 +314,143 @@ class TestCompressWithClient:
for msg in result:
if msg.get("role") == "tool" and msg.get("tool_call_id"):
assert msg["tool_call_id"] in called_ids
class TestPruneToolOutputs:
def _make_compressor(self, *, context_length=128000, protect_first_n=2, protect_last_n=2):
with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
return ContextCompressor(
model="test/model",
threshold_percent=0.50,
protect_first_n=protect_first_n,
protect_last_n=protect_last_n,
quiet_mode=True,
)
def test_prune_replaces_old_middle_tool_outputs(self):
c = self._make_compressor(protect_last_n=1)
big_content = "x" * (c._prune_protect_tokens * 4)
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": big_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": big_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
pruned, chars_saved = c._prune_tool_outputs(messages)
assert chars_saved > 0
assert pruned[3]["content"].startswith("[Tool output pruned")
assert pruned[5]["content"] == big_content
def test_protected_tools_are_never_pruned(self):
c = self._make_compressor()
big_content = "x" * (c._prune_protect_tokens * 8)
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": big_content, "name": "read_file"},
{"role": "assistant", "content": "middle"},
{"role": "tool", "content": big_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
pruned, _ = c._prune_tool_outputs(messages)
read_file_msg = next(msg for msg in pruned if msg.get("name") == "read_file")
assert read_file_msg["content"] == big_content
def test_prune_only_path_skips_summary_call_when_sufficient(self):
c = self._make_compressor(protect_first_n=2, protect_last_n=1)
huge_content = "x" * 180000
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
with patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
result = c.compress(messages, current_tokens=200000)
assert result[3]["content"].startswith("[Tool output pruned")
assert result[5]["content"] == huge_content
assert c.compression_count == 1
def test_prune_does_not_touch_protected_tail_messages(self):
c = self._make_compressor(context_length=128000, protect_first_n=2, protect_last_n=3)
huge_content = "x" * (c._prune_protect_tokens * 8)
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail assistant"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "latest"},
]
pruned, _ = c._prune_tool_outputs(messages)
assert pruned[-2]["content"] == huge_content
assert pruned[-1]["content"] == "latest"
class TestPruneAcceptancePolicy:
def _make_compressor(self, *, context_length=128000):
with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
return ContextCompressor(
model="test/model",
threshold_percent=0.50,
protect_first_n=2,
protect_last_n=1,
quiet_mode=True,
)
def test_prune_near_threshold_still_falls_back_to_summary(self):
c = self._make_compressor()
huge_content = "x" * 180000
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compacted"
with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=62000), \
patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(messages, current_tokens=68000)
assert any("CONTEXT SUMMARY" in (msg.get("content") or "") for msg in result)
def test_prune_only_is_allowed_when_it_buys_real_runway(self):
c = self._make_compressor()
huge_content = "x" * 180000
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=48000), \
patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
result = c.compress(messages, current_tokens=68000)
assert result[3]["content"].startswith("[Tool output pruned")
assert result[5]["content"] == huge_content