Compare commits

...

3 Commits

Author SHA1 Message Date
teknium1
a1a90f3f10 feat: require runway before prune-only compaction
Make prune-first compression cache-aware by only accepting prune-only
compaction when it gets comfortably below threshold. If pruning merely
dips under threshold, fall through to the existing summary compaction
so we avoid frequent near-threshold recompressions.

Tests cover both the conservative fallback and the prune-only fast path.
2026-03-13 21:46:09 -07:00
teknium1
55729670be docs: add cache-aware compaction design note 2026-03-13 21:46:03 -07:00
teyrebaz33
119bad65fc feat: prune old tool outputs before context compaction
Port the useful part of PR #588 onto current main without regressing
summary role alternation or the centralized call_llm-based summary path.

This adds a prune-first compression pass that:
- protects recent tool outputs with adaptive thresholds
- never prunes key tool outputs like read_file/memory/clarify
- skips the LLM summary call entirely when pruning alone is enough
- keeps head/tail protected windows untouched

Tests cover prune-only compaction, protected tools, and tail protection.

Co-authored-by: teyrebaz33 <hakanerten02@hotmail.com>
2026-03-13 21:21:28 -07:00
3 changed files with 445 additions and 3 deletions

agent/context_compressor.py

@@ -7,7 +7,7 @@ protecting head and tail context.
import logging
import os
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
from agent.auxiliary_client import call_llm
from agent.model_metadata import (
@@ -17,6 +17,24 @@ from agent.model_metadata import (
logger = logging.getLogger(__name__)
NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}
def _adaptive_prune_protect(context_length: int) -> int:
"""Scale the recent-tool-output protection window to the model context size."""
if context_length >= 500_000:
return 100_000
if context_length >= 128_000:
return 40_000
if context_length >= 64_000:
return 20_000
return 10_000
def _adaptive_prune_minimum(context_length: int) -> int:
"""Only prune when it reclaims a meaningful amount of prompt budget."""
return max(5_000, context_length // 20)
class ContextCompressor:
"""Compresses conversation context when approaching the model's context limit.
@@ -54,6 +72,10 @@ class ContextCompressor:
self.last_total_tokens = 0
self.summary_model = summary_model_override or ""
self._prune_protect_tokens = _adaptive_prune_protect(self.context_length)
self._prune_minimum_tokens = _adaptive_prune_minimum(self.context_length)
self._prune_runway_tokens = max(self._prune_minimum_tokens, int(self.threshold_tokens * 0.15))
self._prune_target_tokens = max(0, self.threshold_tokens - self._prune_runway_tokens)
def update_from_response(self, usage: Dict[str, Any]):
"""Update tracked token usage from API response."""
@@ -81,6 +103,58 @@ class ContextCompressor:
"compression_count": self.compression_count,
}
def _is_protected_tool(self, message: Dict[str, Any]) -> bool:
"""Return True when a tool output should never be pruned."""
return (message.get("name") or "") in NEVER_PRUNE_TOOLS
def _prune_tool_outputs(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
"""Replace older middle tool outputs with compact placeholders.
Only prunes tool outputs from the same middle region that would be eligible
for summarization. The head/tail protected windows are left untouched.
Returns:
(messages_after_prune, chars_saved)
"""
n_messages = len(messages)
compress_start = self.protect_first_n
compress_end = n_messages - self.protect_last_n
if compress_start >= compress_end:
return messages, 0
compress_start = self._align_boundary_forward(messages, compress_start)
compress_end = self._align_boundary_backward(messages, compress_end)
if compress_start >= compress_end:
return messages, 0
pruned = [msg.copy() for msg in messages]
chars_saved = 0
recent_tool_tokens = 0
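# Walk the middle region newest-to-oldest so the most recent tool outputs
# fill the protection budget before anything older becomes prunable.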
for i in range(compress_end - 1, compress_start - 1, -1):
msg = pruned[i]
if msg.get("role") != "tool" or self._is_protected_tool(msg):
continue
content = msg.get("content")
content_text = content if isinstance(content, str) else str(content or "")
token_estimate = max(1, len(content_text) // 4)
if recent_tool_tokens < self._prune_protect_tokens:
recent_tool_tokens += token_estimate
continue
original_len = len(content_text)
placeholder = f"[Tool output pruned — was {original_len:,} chars]"
pruned[i]["content"] = placeholder
chars_saved += max(0, original_len - len(placeholder))
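# Skip the whole prune if it would not reclaim a meaningful slice of the prompt budget.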
tokens_saved = chars_saved // 4
if tokens_saved < self._prune_minimum_tokens:
return messages, 0
return pruned, chars_saved
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
"""Generate a concise summary of conversation turns.
@@ -267,13 +341,49 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
if compress_start >= compress_end:
return messages
turns_to_summarize = messages[compress_start:compress_end]
-display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
+display_tokens = current_tokens if current_tokens is not None else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
if not self.quiet_mode:
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
pruned_messages, chars_saved = self._prune_tool_outputs(messages)
if chars_saved > 0:
pruned_tokens = estimate_messages_tokens_rough(pruned_messages)
tokens_saved_phase1 = max(0, display_tokens - pruned_tokens)
if not self.quiet_mode:
print(
f" ✂️ Phase 1 (prune): removed {chars_saved:,} chars of old tool outputs "
f"(~{tokens_saved_phase1:,} tokens saved)"
)
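# Low-water mark: accept prune-only compaction only when it lands comfortably
# below threshold, not merely under it.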
if pruned_tokens <= self._prune_target_tokens:
self.compression_count += 1
pruned_messages = self._sanitize_tool_pairs(pruned_messages)
if not self.quiet_mode:
print(
f" ✅ Phase 1 sufficient: {n_messages}{len(pruned_messages)} messages, "
f"now {pruned_tokens:,} tokens"
)
print(f" 💡 Compression #{self.compression_count} complete (prune only — no LLM call needed)")
return pruned_messages
if not self.quiet_mode and pruned_tokens < self.threshold_tokens:
print(
f" ↪️ Phase 1 recovered tokens but not enough runway "
f"({pruned_tokens:,} > target {self._prune_target_tokens:,}); continuing to compaction"
)
messages = pruned_messages
n_messages = len(messages)
compress_start = self.protect_first_n
compress_end = n_messages - self.protect_last_n
if compress_start >= compress_end:
return messages
compress_start = self._align_boundary_forward(messages, compress_start)
compress_end = self._align_boundary_backward(messages, compress_end)
if compress_start >= compress_end:
return messages
turns_to_summarize = messages[compress_start:compress_end]
if not self.quiet_mode:
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")

View File

@@ -0,0 +1,192 @@
# Cache-Aware Context Compaction Design Note
> For Hermes: this note is a design/implementation sketch for revisiting prune-first compaction without optimizing token spend at the expense of prompt-cache stability.
Goal: reduce compression cost while keeping cache-break frequency as low as possible.
Architecture: keep Hermes' current invariant that conversation history is only mutated during context compression, then make prune-first compaction conservative enough that it only short-circuits when it buys meaningful runway. If pruning only gets us barely below threshold, fall through to the existing summary compaction immediately.
Tech Stack: `agent/context_compressor.py`, existing `call_llm()`-based summary path, pytest coverage in `tests/agent/test_context_compressor.py`.
---
## 1. Baseline behavior on current main
Today Hermes behaves like this:
1. Prompt crosses the compression threshold.
2. We mutate transcript history once by summarizing the middle region with an LLM.
3. We preserve role alternation and tool-call/tool-result integrity.
4. We continue the conversation from the compressed transcript.
This is expensive in two ways:
- an auxiliary summary call is often required
- the entire compressed middle region is rewritten even when the real problem was just a few huge old tool outputs
But it has one strong cache property:
- it tends to reclaim a lot of headroom per compression event, so the next compression is usually farther away
---
## 2. Why naive prune-first compaction is not enough
A naive prune-first policy says:
- prune old tool outputs
- if prompt is now below threshold, stop
This improves per-event token cost, but it can hurt cache economics:
- prune-only may reclaim less headroom than full compaction
- smaller headroom means the next compression may happen sooner
- each compression event is still a cache-breaking transcript mutation
So there is a real failure mode:
- fewer tokens per compression
- more compression events overall
- worse cache-break cadence
That is exactly the tradeoff we want to avoid.
---
## 3. Cache-aware principle
Prune-first compaction should only short-circuit when it buys real runway, not when it merely dips under threshold.
Rule of thumb:
- compression frequency matters as much as compression size
- a smaller mutation is not automatically cheaper if it causes another mutation a few turns later
So the design target is:
- fewer auxiliary summary calls
- without materially increasing compression frequency
---
## 4. Conservative prototype policy
The conservative prototype keeps all existing compression invariants and only changes the acceptance rule for prune-only compaction.
### Phase 1: prune old middle tool outputs
Only prune tool outputs that are:
- in the compressible middle region
- not in protected head/tail windows
- not from protected tools (`read_file`, `memory`, `clarify`, `skill_view`, `todo`)
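
A minimal restatement of this eligibility test as a predicate (the helper name here is illustrative; on the branch the logic lives inside `_prune_tool_outputs()` and `_is_protected_tool()`):

```python
# Illustrative sketch, not the branch's exact code.
NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}


def eligible_for_prune(msg: dict, index: int, compress_start: int, compress_end: int) -> bool:
    if not (compress_start <= index < compress_end):
        return False  # head/tail protected windows are never touched
    if msg.get("role") != "tool":
        return False  # only tool outputs are pruned, never user/assistant turns
    return (msg.get("name") or "") not in NEVER_PRUNE_TOOLS
```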
### Phase 2: require a low-water mark
Do not accept prune-only just because it lands below threshold.
Instead require:
- `post_prune_tokens <= prune_target_tokens`
Where:
- `prune_runway_tokens = max(prune_minimum_tokens, 0.15 * threshold_tokens)`
- `prune_target_tokens = threshold_tokens - prune_runway_tokens`
Interpretation:
- pruning must get us comfortably below threshold
- otherwise we immediately fall through to normal LLM summary compaction
Why this helps:
- protects cache by avoiding "micro-compactions" that would be followed by another compression shortly after
- still avoids the summary call when pruning truly buys useful runway
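
Worked through with the branch's adaptive constants (128k-context model, 50% threshold), the rule plays out like this:

```python
# Worked example of the acceptance rule, using the constants the branch derives.
context_length = 128_000
threshold_tokens = int(context_length * 0.50)                                  # 64,000
prune_minimum_tokens = max(5_000, context_length // 20)                        # 6,400
prune_runway_tokens = max(prune_minimum_tokens, int(threshold_tokens * 0.15))  # 9,600
prune_target_tokens = threshold_tokens - prune_runway_tokens                   # 54,400


def accept_prune_only(post_prune_tokens: int) -> bool:
    # Prune-only wins only when it lands comfortably below threshold.
    return post_prune_tokens <= prune_target_tokens


assert not accept_prune_only(62_000)  # barely under threshold -> fall through to summary
assert accept_prune_only(48_000)      # real runway -> skip the LLM summary call
```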
---
## 5. What the prototype currently does
The prototype branch currently:
- keeps prune-first compaction
- adds the low-water / runway requirement above
- preserves current main behavior for summary role alternation
- preserves the centralized `call_llm()` summary path
- keeps head/tail and tool-call/result integrity handling unchanged
This means the branch is no longer optimizing only for token reduction per event; it is explicitly biased toward fewer compression events.
---
## 6. Metrics we should evaluate before merging any future version
A serious cache-aware review should measure all of these, not just token savings:
1. Compression events per 100 conversation turns
2. Average turns between compressions
3. Auxiliary summary calls per session
4. Average tokens reclaimed per compression event
5. Total prompt+auxiliary tokens spent over a long session
6. Earliest changed message index during compression
7. Ratio of prune-only compressions to full summary compressions
The most important comparison is:
- baseline main vs conservative prune-first
Success is not:
- "fewer tokens in one compression"
Success is:
- "equal or better total session cost without increasing compression/cache-break cadence in a meaningful way"
---
## 7. Better long-term directions
If we want a stronger cache story than conservative prune-first, these are the real next-step options:
### A. Insertion-time trimming
Best cache-preserving option.
Idea:
- trim or summarize giant tool outputs before they become durable transcript history
- keep a compact representation from the start instead of mutating history later
Pros:
- avoids later cache-breaking rewrites for those blobs
- makes transcript size stable earlier
Cons:
- more invasive design change
- requires careful UX and provenance handling
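
A rough sketch of the idea (hypothetical helper and budget; nothing like this exists on the branch):

```python
MAX_DURABLE_TOOL_CHARS = 20_000  # hypothetical budget; would need tuning per tool


def trim_tool_output_at_insert(content: str) -> str:
    """Hypothetical: cap a tool output before it becomes durable transcript history."""
    if len(content) <= MAX_DURABLE_TOOL_CHARS:
        return content
    kept = MAX_DURABLE_TOOL_CHARS // 2
    omitted = len(content) - 2 * kept
    # Keep head and tail so provenance survives; note exactly what was dropped.
    return f"{content[:kept]}\n[... {omitted:,} chars trimmed at insertion ...]\n{content[-kept:]}"
```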
### B. Provider/backend-aware compaction policy
Different providers may reward:
- preserving a longer stable prefix
- or simply reducing total prompt size
We may eventually want backend-specific heuristics for:
- prune runway targets
- compression thresholds
- when to prefer summary vs pruning
### C. Explicit compression telemetry
If compression remains a core feature, `ContextCompressor` should expose enough telemetry to understand real-world cadence:
- prune-only count
- full summary count
- average recovered tokens
- last compression mode
This is not required for the conservative prototype, but it would make future tuning much easier.
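
One possible shape for that telemetry, sketched here with illustrative field names:

```python
from dataclasses import dataclass


@dataclass
class CompressionTelemetry:
    # Hypothetical counters ContextCompressor could expose; names are illustrative.
    prune_only_count: int = 0
    full_summary_count: int = 0
    total_tokens_recovered: int = 0
    last_compression_mode: str = ""

    def record(self, mode: str, tokens_recovered: int) -> None:
        # mode is "prune_only" or "summary"; counters drive the cadence metrics in section 6.
        if mode == "prune_only":
            self.prune_only_count += 1
        else:
            self.full_summary_count += 1
        self.total_tokens_recovered += tokens_recovered
        self.last_compression_mode = mode

    @property
    def avg_tokens_recovered(self) -> float:
        events = self.prune_only_count + self.full_summary_count
        return self.total_tokens_recovered / max(1, events)
```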
---
## 8. Recommended next steps
1. Keep the conservative prototype local for review.
2. Run targeted tests plus long-session manual trials.
3. If it looks promising, add telemetry before opening another PR.
4. If cache stability remains the top priority, pursue insertion-time trimming instead of further read-time pruning tweaks.
---
## 9. Review question for Teknium
The key product question is:
"Should Hermes optimize compression primarily for per-event token cost, or for minimizing the number of transcript mutations over the lifetime of a session?"
This prototype assumes the answer is:
- prioritize fewer transcript mutations unless pruning buys substantial runway.

tests/agent/test_context_compressor.py

@@ -314,3 +314,143 @@ class TestCompressWithClient:
for msg in result:
if msg.get("role") == "tool" and msg.get("tool_call_id"):
assert msg["tool_call_id"] in called_ids
class TestPruneToolOutputs:
def _make_compressor(self, *, context_length=128000, protect_first_n=2, protect_last_n=2):
with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
return ContextCompressor(
model="test/model",
threshold_percent=0.50,
protect_first_n=protect_first_n,
protect_last_n=protect_last_n,
quiet_mode=True,
)
def test_prune_replaces_old_middle_tool_outputs(self):
c = self._make_compressor(protect_last_n=1)
big_content = "x" * (c._prune_protect_tokens * 4)
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": big_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": big_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
pruned, chars_saved = c._prune_tool_outputs(messages)
assert chars_saved > 0
assert pruned[3]["content"].startswith("[Tool output pruned")
assert pruned[5]["content"] == big_content
def test_protected_tools_are_never_pruned(self):
c = self._make_compressor()
big_content = "x" * (c._prune_protect_tokens * 8)
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": big_content, "name": "read_file"},
{"role": "assistant", "content": "middle"},
{"role": "tool", "content": big_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
pruned, _ = c._prune_tool_outputs(messages)
read_file_msg = next(msg for msg in pruned if msg.get("name") == "read_file")
assert read_file_msg["content"] == big_content
def test_prune_only_path_skips_summary_call_when_sufficient(self):
c = self._make_compressor(protect_first_n=2, protect_last_n=1)
huge_content = "x" * 180000
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
with patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
result = c.compress(messages, current_tokens=200000)
assert result[3]["content"].startswith("[Tool output pruned")
assert result[5]["content"] == huge_content
assert c.compression_count == 1
def test_prune_does_not_touch_protected_tail_messages(self):
c = self._make_compressor(context_length=128000, protect_first_n=2, protect_last_n=3)
huge_content = "x" * (c._prune_protect_tokens * 8)
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail assistant"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "latest"},
]
pruned, _ = c._prune_tool_outputs(messages)
assert pruned[-2]["content"] == huge_content
assert pruned[-1]["content"] == "latest"
class TestPruneAcceptancePolicy:
def _make_compressor(self, *, context_length=128000):
with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
return ContextCompressor(
model="test/model",
threshold_percent=0.50,
protect_first_n=2,
protect_last_n=1,
quiet_mode=True,
)
def test_prune_near_threshold_still_falls_back_to_summary(self):
c = self._make_compressor()
huge_content = "x" * 180000
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compacted"
with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=62000), \
patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(messages, current_tokens=68000)
assert any("CONTEXT SUMMARY" in (msg.get("content") or "") for msg in result)
def test_prune_only_is_allowed_when_it_buys_real_runway(self):
c = self._make_compressor()
huge_content = "x" * 180000
messages = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "task"},
{"role": "assistant", "content": "older"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "newer"},
{"role": "tool", "content": huge_content, "name": "terminal"},
{"role": "assistant", "content": "tail"},
]
with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=48000), \
patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
result = c.compress(messages, current_tokens=68000)
assert result[3]["content"].startswith("[Tool output pruned")
assert result[5]["content"] == huge_content