Mirror of https://github.com/NousResearch/hermes-agent.git
Synced 2026-04-28 23:11:37 +08:00

Compare commits: skill/gith...feat/cache
3 commits: a1a90f3f10, 55729670be, 119bad65fc
agent/context_compressor.py

@@ -7,7 +7,7 @@ protecting head and tail context.
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple

 from agent.auxiliary_client import call_llm
 from agent.model_metadata import (
@@ -17,6 +17,24 @@ from agent.model_metadata import (

 logger = logging.getLogger(__name__)

+NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}
+
+
+def _adaptive_prune_protect(context_length: int) -> int:
+    """Scale the recent-tool-output protection window to the model context size."""
+    if context_length >= 500_000:
+        return 100_000
+    if context_length >= 128_000:
+        return 40_000
+    if context_length >= 64_000:
+        return 20_000
+    return 10_000
+
+
+def _adaptive_prune_minimum(context_length: int) -> int:
+    """Only prune when it reclaims a meaningful amount of prompt budget."""
+    return max(5_000, context_length // 20)
+
+
 class ContextCompressor:
     """Compresses conversation context when approaching the model's context limit.
@@ -54,6 +72,10 @@ class ContextCompressor:
         self.last_total_tokens = 0

         self.summary_model = summary_model_override or ""
+        self._prune_protect_tokens = _adaptive_prune_protect(self.context_length)
+        self._prune_minimum_tokens = _adaptive_prune_minimum(self.context_length)
+        self._prune_runway_tokens = max(self._prune_minimum_tokens, int(self.threshold_tokens * 0.15))
+        self._prune_target_tokens = max(0, self.threshold_tokens - self._prune_runway_tokens)

     def update_from_response(self, usage: Dict[str, Any]):
         """Update tracked token usage from API response."""
@@ -81,6 +103,58 @@
             "compression_count": self.compression_count,
         }

+    def _is_protected_tool(self, message: Dict[str, Any]) -> bool:
+        """Return True when a tool output should never be pruned."""
+        return (message.get("name") or "") in NEVER_PRUNE_TOOLS
+
+    def _prune_tool_outputs(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
+        """Replace older middle tool outputs with compact placeholders.
+
+        Only prunes tool outputs from the same middle region that would be eligible
+        for summarization. The head/tail protected windows are left untouched.
+
+        Returns:
+            (messages_after_prune, chars_saved)
+        """
+        n_messages = len(messages)
+        compress_start = self.protect_first_n
+        compress_end = n_messages - self.protect_last_n
+        if compress_start >= compress_end:
+            return messages, 0
+
+        compress_start = self._align_boundary_forward(messages, compress_start)
+        compress_end = self._align_boundary_backward(messages, compress_end)
+        if compress_start >= compress_end:
+            return messages, 0
+
+        pruned = [msg.copy() for msg in messages]
+        chars_saved = 0
+        recent_tool_tokens = 0
+
+        for i in range(compress_end - 1, compress_start - 1, -1):
+            msg = pruned[i]
+            if msg.get("role") != "tool" or self._is_protected_tool(msg):
+                continue
+
+            content = msg.get("content")
+            content_text = content if isinstance(content, str) else str(content or "")
+            token_estimate = max(1, len(content_text) // 4)
+
+            if recent_tool_tokens < self._prune_protect_tokens:
+                recent_tool_tokens += token_estimate
+                continue
+
+            original_len = len(content_text)
+            placeholder = f"[Tool output pruned — was {original_len:,} chars]"
+            pruned[i]["content"] = placeholder
+            chars_saved += max(0, original_len - len(placeholder))
+
+        tokens_saved = chars_saved // 4
+        if tokens_saved < self._prune_minimum_tokens:
+            return messages, 0
+
+        return pruned, chars_saved
+
     def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
         """Generate a concise summary of conversation turns.
@@ -267,13 +341,49 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
         if compress_start >= compress_end:
             return messages

-        turns_to_summarize = messages[compress_start:compress_end]
-        display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
+        display_tokens = current_tokens if current_tokens is not None else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)

         if not self.quiet_mode:
             print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
             print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")

+        pruned_messages, chars_saved = self._prune_tool_outputs(messages)
+        if chars_saved > 0:
+            pruned_tokens = estimate_messages_tokens_rough(pruned_messages)
+            tokens_saved_phase1 = max(0, display_tokens - pruned_tokens)
+            if not self.quiet_mode:
+                print(
+                    f" ✂️ Phase 1 (prune): removed {chars_saved:,} chars of old tool outputs "
+                    f"(~{tokens_saved_phase1:,} tokens saved)"
+                )
+            if pruned_tokens <= self._prune_target_tokens:
+                self.compression_count += 1
+                pruned_messages = self._sanitize_tool_pairs(pruned_messages)
+                if not self.quiet_mode:
+                    print(
+                        f" ✅ Phase 1 sufficient: {n_messages} → {len(pruned_messages)} messages, "
+                        f"now {pruned_tokens:,} tokens"
+                    )
+                    print(f" 💡 Compression #{self.compression_count} complete (prune only — no LLM call needed)")
+                return pruned_messages
+            if not self.quiet_mode and pruned_tokens < self.threshold_tokens:
+                print(
+                    f" ↪️ Phase 1 recovered tokens but not enough runway "
+                    f"({pruned_tokens:,} > target {self._prune_target_tokens:,}); continuing to compaction"
+                )
+            messages = pruned_messages
+            n_messages = len(messages)
+            compress_start = self.protect_first_n
+            compress_end = n_messages - self.protect_last_n
+            if compress_start >= compress_end:
+                return messages
+            compress_start = self._align_boundary_forward(messages, compress_start)
+            compress_end = self._align_boundary_backward(messages, compress_end)
+            if compress_start >= compress_end:
+                return messages
+
+        turns_to_summarize = messages[compress_start:compress_end]

         if not self.quiet_mode:
             print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")
docs/plans/2026-03-14-cache-aware-context-compaction.md (new file, 192 lines)

@@ -0,0 +1,192 @@
# Cache-Aware Context Compaction Design Note

> For Hermes: this note is a design/implementation sketch for revisiting prune-first compaction without optimizing token spend at the expense of prompt-cache stability.

Goal: reduce compression cost while keeping cache-break frequency as low as possible.

Architecture: keep Hermes' current invariant that conversation history is only mutated during context compression, then make prune-first compaction conservative enough that it only short-circuits when it buys meaningful runway. If pruning only gets us barely below threshold, fall through to the existing summary compaction immediately.

Tech Stack: `agent/context_compressor.py`, existing `call_llm()`-based summary path, pytest coverage in `tests/agent/test_context_compressor.py`.

---
## 1. Baseline behavior on current main

Today Hermes behaves like this (sketched in code below):

1. Prompt crosses the compression threshold.
2. We mutate transcript history once by summarizing the middle region with an LLM.
3. We preserve role alternation and tool-call/tool-result integrity.
4. We continue the conversation from the compressed transcript.

This is expensive in two ways:
- an auxiliary summary call is often required
- the entire compressed middle region is rewritten even when the real problem was just a few huge old tool outputs

But it has one strong cache property:
- it tends to reclaim a lot of headroom per compression event, so the next compression is usually farther away
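
A minimal sketch of that baseline flow, with a stub standing in for the real `call_llm()` summary path; the helper and parameter names here are illustrative, not the actual `ContextCompressor` API:

```python
from typing import Any, Dict, List

def summarize_middle(turns: List[Dict[str, Any]]) -> str:
    """Stand-in for the auxiliary call_llm() summary call."""
    return f"[CONTEXT SUMMARY]: condensed {len(turns)} turns"

def compress_baseline(messages: List[Dict[str, Any]], current_tokens: int,
                      threshold_tokens: int, protect_first_n: int = 2,
                      protect_last_n: int = 2) -> List[Dict[str, Any]]:
    # Step 1: only act once the prompt crosses the compression threshold.
    if current_tokens < threshold_tokens:
        return messages
    # Step 2: summarize the whole middle region; head/tail stay verbatim.
    # (Role-alternation and tool-call/result pairing fixes are omitted here;
    # the summary message role is an assumption for this sketch.)
    head = messages[:protect_first_n]
    tail = messages[len(messages) - protect_last_n:]
    middle = messages[protect_first_n:len(messages) - protect_last_n]
    summary = {"role": "user", "content": summarize_middle(middle)}
    # Steps 3-4: continue the conversation from the compressed transcript.
    return head + [summary] + tail
```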
---

## 2. Why naive prune-first compaction is not enough

A naive prune-first policy says:
- prune old tool outputs
- if the prompt is now below threshold, stop

This improves per-event token cost, but it can hurt cache economics:
- prune-only may reclaim less headroom than full compaction
- smaller headroom means the next compression may happen sooner
- each compression event is still a cache-breaking transcript mutation

So there is a real failure mode:
- fewer tokens per compression
- more compression events overall
- worse cache-break cadence

That is exactly the tradeoff we want to avoid; the toy numbers below make it concrete.
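
A back-of-the-envelope illustration (all numbers are assumptions for the example, not measurements):

```python
# Assumed numbers: 64k compression threshold, prompt grows ~1k tokens/turn.
threshold_tokens = 64_000
growth_per_turn = 1_000

prune_only_reclaim = 8_000    # prune-only buys little headroom...
summary_reclaim = 32_000      # ...full summary compaction buys a lot

def turns_until_next_compression(reclaimed: int) -> int:
    return reclaimed // growth_per_turn

print(turns_until_next_compression(prune_only_reclaim))  # 8 turns
print(turns_until_next_compression(summary_reclaim))     # 32 turns
# Prune-only spends fewer tokens per event but breaks the cache 4x as often.
```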
---

## 3. Cache-aware principle

Prune-first compaction should only short-circuit when it buys real runway, not when it merely dips under threshold.

Rule of thumb:
- compression frequency matters as much as compression size
- a smaller mutation is not automatically cheaper if it causes another mutation a few turns later

So the design target is:
- fewer auxiliary summary calls
- without materially increasing compression frequency

---
## 4. Conservative prototype policy

The conservative prototype keeps all existing compression invariants and only changes the acceptance rule for prune-only compaction.

### Phase 1: prune old middle tool outputs

Only prune tool outputs that are:
- in the compressible middle region
- not in protected head/tail windows
- not from protected tools (`read_file`, `memory`, `clarify`, `skill_view`, `todo`)

### Phase 2: require a low-water mark

Do not accept prune-only just because it lands below threshold.

Instead require:
- `post_prune_tokens <= prune_target_tokens`

Where:
- `prune_runway_tokens = max(prune_minimum_tokens, 15% of threshold_tokens)`
- `prune_target_tokens = threshold_tokens - prune_runway_tokens`

Interpretation:
- pruning must get us comfortably below threshold
- otherwise we immediately fall through to normal LLM summary compaction

Why this helps:
- protects cache by avoiding "micro-compactions" that would be followed by another compression shortly after
- still avoids the summary call when pruning truly buys useful runway

The acceptance rule is small enough to state directly in code, as in the sketch below.
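
A minimal sketch of the acceptance rule (field names mirror the prototype's `_prune_*` attributes; the standalone-function form is for illustration only):

```python
def accept_prune_only(post_prune_tokens: int,
                      threshold_tokens: int,
                      prune_minimum_tokens: int) -> bool:
    """Accept prune-only compaction only when it lands comfortably below threshold."""
    prune_runway_tokens = max(prune_minimum_tokens, int(threshold_tokens * 0.15))
    prune_target_tokens = threshold_tokens - prune_runway_tokens
    return post_prune_tokens <= prune_target_tokens

# With a 64k threshold and a 5k minimum, runway is 9,600 and the target is 54,400:
assert accept_prune_only(50_000, 64_000, 5_000)      # real runway: prune-only is enough
assert not accept_prune_only(60_000, 64_000, 5_000)  # barely under: fall through to summary
```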
---

## 5. What the prototype currently does

The prototype branch currently:
- keeps prune-first compaction
- adds the low-water / runway requirement above
- preserves current main behavior for summary role alternation
- preserves the centralized `call_llm()` summary path
- keeps head/tail and tool-call/result integrity handling unchanged

This means the branch is no longer optimizing only for token reduction per event; it is explicitly biased toward fewer compression events.

---
## 6. Metrics we should evaluate before merging any future version

A serious cache-aware review should measure all of these, not just token savings:

1. Compression events per 100 conversation turns
2. Average turns between compressions
3. Auxiliary summary calls per session
4. Average tokens reclaimed per compression event
5. Total prompt + auxiliary tokens spent over a long session
6. Earliest changed message index during compression
7. Ratio of prune-only compressions to full summary compressions

The most important comparison is:
- baseline main vs conservative prune-first

Success is not:
- "fewer tokens in one compression"

Success is:
- "equal or better total session cost without meaningfully increasing compression/cache-break cadence"

Most of these metrics fall out of a simple per-session event log, as sketched below.
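
A sketch of computing the cadence metrics from such a log (the `CompressionEvent` shape is hypothetical, not an existing type in the repo):

```python
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class CompressionEvent:
    turn_index: int        # conversation turn at which compression fired
    mode: str              # "prune_only" or "summary"
    tokens_reclaimed: int

def cadence_metrics(events: List[CompressionEvent], total_turns: int) -> Dict[str, float]:
    gaps = [b.turn_index - a.turn_index for a, b in zip(events, events[1:])]
    prune_only = sum(1 for e in events if e.mode == "prune_only")
    return {
        "events_per_100_turns": 100 * len(events) / max(1, total_turns),
        "avg_turns_between": sum(gaps) / len(gaps) if gaps else float("inf"),
        "avg_tokens_reclaimed": sum(e.tokens_reclaimed for e in events) / max(1, len(events)),
        "prune_only_ratio": prune_only / max(1, len(events)),
    }
```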
---

## 7. Better long-term directions

If we want a stronger cache story than conservative prune-first, these are the real next-step options:

### A. Insertion-time trimming

Best cache-preserving option.

Idea:
- trim or summarize giant tool outputs before they become durable transcript history
- keep a compact representation from the start instead of mutating history later

Pros:
- avoids later cache-breaking rewrites for those blobs
- makes transcript size stable earlier

Cons:
- more invasive design change
- requires careful UX and provenance handling

A minimal sketch of the idea follows.
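
This sketch assumes a simple head-plus-tail trimming policy applied before a tool result is appended to history; the `max_chars` budget and placeholder format are illustrative only:

```python
def trim_at_insertion(tool_output: str, max_chars: int = 20_000) -> str:
    """Keep a tool output compact before it becomes durable transcript history.

    Keeps the head and tail of the output and drops the middle. A real
    design would also need the provenance/UX handling noted above.
    """
    if len(tool_output) <= max_chars:
        return tool_output
    half = max_chars // 2
    dropped = len(tool_output) - max_chars
    return (tool_output[:half]
            + f"\n[... {dropped:,} chars trimmed at insertion ...]\n"
            + tool_output[-half:])
```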
### B. Provider/backend-aware compaction policy

Different providers may reward:
- preserving a longer stable prefix
- or simply reducing total prompt size

We may eventually want backend-specific heuristics for:
- prune runway targets
- compression thresholds
- when to prefer summary vs pruning

### C. Explicit compression telemetry

If compression remains a core feature, `ContextCompressor` should expose enough telemetry to understand real-world cadence:
- prune-only count
- full summary count
- average recovered tokens
- last compression mode

This is not required for the conservative prototype, but it would make future tuning much easier. One possible shape is sketched below.
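
A possible shape for that telemetry (the class and field names are illustrative, not an existing API):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class CompressionTelemetry:
    prune_only_count: int = 0
    full_summary_count: int = 0
    tokens_recovered: List[int] = field(default_factory=list)
    last_mode: str = "none"

    def record(self, mode: str, tokens: int) -> None:
        # mode is "prune_only" or "summary", matching the two compaction paths.
        if mode == "prune_only":
            self.prune_only_count += 1
        else:
            self.full_summary_count += 1
        self.tokens_recovered.append(tokens)
        self.last_mode = mode

    @property
    def avg_recovered_tokens(self) -> float:
        return sum(self.tokens_recovered) / max(1, len(self.tokens_recovered))
```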
---

## 8. Recommended next steps

1. Keep the conservative prototype local for review.
2. Run targeted tests plus long-session manual trials.
3. If it looks promising, add telemetry before opening another PR.
4. If cache stability remains the top priority, pursue insertion-time trimming instead of further read-time pruning tweaks.

---

## 9. Review question for Teknium

The key product question is:

"Should Hermes optimize compression primarily for per-event token cost, or for minimizing the number of transcript mutations over the lifetime of a session?"

This prototype assumes the answer is:
- prioritize fewer transcript mutations unless pruning buys substantial runway.
tests/agent/test_context_compressor.py

@@ -314,3 +314,143 @@ class TestCompressWithClient:
         for msg in result:
             if msg.get("role") == "tool" and msg.get("tool_call_id"):
                 assert msg["tool_call_id"] in called_ids
+
+
+class TestPruneToolOutputs:
+    def _make_compressor(self, *, context_length=128000, protect_first_n=2, protect_last_n=2):
+        with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
+            return ContextCompressor(
+                model="test/model",
+                threshold_percent=0.50,
+                protect_first_n=protect_first_n,
+                protect_last_n=protect_last_n,
+                quiet_mode=True,
+            )
+
+    def test_prune_replaces_old_middle_tool_outputs(self):
+        c = self._make_compressor(protect_last_n=1)
+        big_content = "x" * (c._prune_protect_tokens * 4)
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "task"},
+            {"role": "assistant", "content": "older"},
+            {"role": "tool", "content": big_content, "name": "terminal"},
+            {"role": "assistant", "content": "newer"},
+            {"role": "tool", "content": big_content, "name": "terminal"},
+            {"role": "assistant", "content": "tail"},
+        ]
+
+        pruned, chars_saved = c._prune_tool_outputs(messages)
+
+        assert chars_saved > 0
+        assert pruned[3]["content"].startswith("[Tool output pruned")
+        assert pruned[5]["content"] == big_content
+
+    def test_protected_tools_are_never_pruned(self):
+        c = self._make_compressor()
+        big_content = "x" * (c._prune_protect_tokens * 8)
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "task"},
+            {"role": "assistant", "content": "older"},
+            {"role": "tool", "content": big_content, "name": "read_file"},
+            {"role": "assistant", "content": "middle"},
+            {"role": "tool", "content": big_content, "name": "terminal"},
+            {"role": "assistant", "content": "tail"},
+        ]
+
+        pruned, _ = c._prune_tool_outputs(messages)
+        read_file_msg = next(msg for msg in pruned if msg.get("name") == "read_file")
+        assert read_file_msg["content"] == big_content
+
+    def test_prune_only_path_skips_summary_call_when_sufficient(self):
+        c = self._make_compressor(protect_first_n=2, protect_last_n=1)
+        huge_content = "x" * 180000
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "task"},
+            {"role": "assistant", "content": "older"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "newer"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "tail"},
+        ]
+
+        with patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
+            result = c.compress(messages, current_tokens=200000)
+
+        assert result[3]["content"].startswith("[Tool output pruned")
+        assert result[5]["content"] == huge_content
+        assert c.compression_count == 1
+
+    def test_prune_does_not_touch_protected_tail_messages(self):
+        c = self._make_compressor(context_length=128000, protect_first_n=2, protect_last_n=3)
+        huge_content = "x" * (c._prune_protect_tokens * 8)
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "task"},
+            {"role": "assistant", "content": "older"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "tail assistant"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "latest"},
+        ]
+
+        pruned, _ = c._prune_tool_outputs(messages)
+
+        assert pruned[-2]["content"] == huge_content
+        assert pruned[-1]["content"] == "latest"
+
+
+class TestPruneAcceptancePolicy:
+    def _make_compressor(self, *, context_length=128000):
+        with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
+            return ContextCompressor(
+                model="test/model",
+                threshold_percent=0.50,
+                protect_first_n=2,
+                protect_last_n=1,
+                quiet_mode=True,
+            )
+
+    def test_prune_near_threshold_still_falls_back_to_summary(self):
+        c = self._make_compressor()
+        huge_content = "x" * 180000
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "task"},
+            {"role": "assistant", "content": "older"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "newer"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "tail"},
+        ]
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compacted"
+
+        with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=62000), \
+             patch("agent.context_compressor.call_llm", return_value=mock_response):
+            result = c.compress(messages, current_tokens=68000)
+
+        assert any("CONTEXT SUMMARY" in (msg.get("content") or "") for msg in result)
+
+    def test_prune_only_is_allowed_when_it_buys_real_runway(self):
+        c = self._make_compressor()
+        huge_content = "x" * 180000
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "task"},
+            {"role": "assistant", "content": "older"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "newer"},
+            {"role": "tool", "content": huge_content, "name": "terminal"},
+            {"role": "assistant", "content": "tail"},
+        ]
+
+        with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=48000), \
+             patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
+            result = c.compress(messages, current_tokens=68000)
+
+        assert result[3]["content"].startswith("[Tool output pruned")
+        assert result[5]["content"] == huge_content