mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 23:11:37 +08:00
Compare commits
3 Commits
codex-port
...
feat/cache
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1a90f3f10 | ||
|
|
55729670be | ||
|
|
119bad65fc |
@@ -7,7 +7,7 @@ protecting head and tail context.
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from agent.auxiliary_client import call_llm
|
from agent.auxiliary_client import call_llm
|
||||||
from agent.model_metadata import (
|
from agent.model_metadata import (
|
||||||
@@ -17,6 +17,24 @@ from agent.model_metadata import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Tool outputs that are always exempt from pruning (checked via _is_protected_tool).
# NOTE(review): presumably these outputs stay load-bearing for the rest of the
# session (memories, clarifications, file contents) — confirm against callers.
NEVER_PRUNE_TOOLS = {"clarify", "memory", "skill_view", "todo", "read_file"}
|
||||||
|
|
||||||
|
|
||||||
|
def _adaptive_prune_protect(context_length: int) -> int:
|
||||||
|
"""Scale the recent-tool-output protection window to the model context size."""
|
||||||
|
if context_length >= 500_000:
|
||||||
|
return 100_000
|
||||||
|
if context_length >= 128_000:
|
||||||
|
return 40_000
|
||||||
|
if context_length >= 64_000:
|
||||||
|
return 20_000
|
||||||
|
return 10_000
|
||||||
|
|
||||||
|
|
||||||
|
def _adaptive_prune_minimum(context_length: int) -> int:
|
||||||
|
"""Only prune when it reclaims a meaningful amount of prompt budget."""
|
||||||
|
return max(5_000, context_length // 20)
|
||||||
|
|
||||||
|
|
||||||
class ContextCompressor:
|
class ContextCompressor:
|
||||||
"""Compresses conversation context when approaching the model's context limit.
|
"""Compresses conversation context when approaching the model's context limit.
|
||||||
@@ -54,6 +72,10 @@ class ContextCompressor:
|
|||||||
self.last_total_tokens = 0
|
self.last_total_tokens = 0
|
||||||
|
|
||||||
self.summary_model = summary_model_override or ""
|
self.summary_model = summary_model_override or ""
|
||||||
|
self._prune_protect_tokens = _adaptive_prune_protect(self.context_length)
|
||||||
|
self._prune_minimum_tokens = _adaptive_prune_minimum(self.context_length)
|
||||||
|
self._prune_runway_tokens = max(self._prune_minimum_tokens, int(self.threshold_tokens * 0.15))
|
||||||
|
self._prune_target_tokens = max(0, self.threshold_tokens - self._prune_runway_tokens)
|
||||||
|
|
||||||
def update_from_response(self, usage: Dict[str, Any]):
|
def update_from_response(self, usage: Dict[str, Any]):
|
||||||
"""Update tracked token usage from API response."""
|
"""Update tracked token usage from API response."""
|
||||||
@@ -81,6 +103,58 @@ class ContextCompressor:
|
|||||||
"compression_count": self.compression_count,
|
"compression_count": self.compression_count,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _is_protected_tool(self, message: Dict[str, Any]) -> bool:
    """Return True when a tool output should never be pruned."""
    # Treat a missing/None tool name as the empty string, which is never protected.
    tool_name = message.get("name") or ""
    return tool_name in NEVER_PRUNE_TOOLS
|
||||||
|
|
||||||
|
def _prune_tool_outputs(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
    """Replace older middle tool outputs with compact placeholders.

    Only prunes tool outputs from the same middle region that would be eligible
    for summarization. The head/tail protected windows are left untouched.

    Returns:
        (messages_after_prune, chars_saved)
    """
    n_messages = len(messages)
    compress_start = self.protect_first_n
    compress_end = n_messages - self.protect_last_n
    if compress_start >= compress_end:
        # No middle region between the protected head and tail — nothing to do.
        return messages, 0

    # Re-align the region boundaries (same helpers the summary path uses),
    # then re-check in case alignment collapsed the region.
    compress_start = self._align_boundary_forward(messages, compress_start)
    compress_end = self._align_boundary_backward(messages, compress_end)
    if compress_start >= compress_end:
        return messages, 0

    # Shallow-copy each message so the caller's list is never mutated in place.
    pruned = [msg.copy() for msg in messages]
    chars_saved = 0
    recent_tool_tokens = 0

    # Walk newest → oldest so the most recent tool outputs consume the
    # protection budget first; only outputs beyond that budget get pruned.
    for i in range(compress_end - 1, compress_start - 1, -1):
        msg = pruned[i]
        if msg.get("role") != "tool" or self._is_protected_tool(msg):
            continue

        content = msg.get("content")
        # Non-string content (e.g. None) is stringified before measuring.
        content_text = content if isinstance(content, str) else str(content or "")
        # Rough heuristic: ~4 characters per token.
        token_estimate = max(1, len(content_text) // 4)

        if recent_tool_tokens < self._prune_protect_tokens:
            # Still inside the recent-output protection window; keep as-is.
            recent_tool_tokens += token_estimate
            continue

        original_len = len(content_text)
        placeholder = f"[Tool output pruned — was {original_len:,} chars]"
        pruned[i]["content"] = placeholder
        # max(0, ...) guards against a placeholder longer than tiny content.
        chars_saved += max(0, original_len - len(placeholder))

    tokens_saved = chars_saved // 4
    if tokens_saved < self._prune_minimum_tokens:
        # Not worth a transcript mutation for so little reclaimed budget;
        # return the original list untouched.
        return messages, 0

    return pruned, chars_saved
|
||||||
|
|
||||||
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
|
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
|
||||||
"""Generate a concise summary of conversation turns.
|
"""Generate a concise summary of conversation turns.
|
||||||
|
|
||||||
@@ -267,13 +341,49 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
|
|||||||
if compress_start >= compress_end:
|
if compress_start >= compress_end:
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
turns_to_summarize = messages[compress_start:compress_end]
|
display_tokens = current_tokens if current_tokens is not None else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
|
||||||
display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages)
|
|
||||||
|
|
||||||
if not self.quiet_mode:
|
if not self.quiet_mode:
|
||||||
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
|
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
|
||||||
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
|
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
|
||||||
|
|
||||||
|
pruned_messages, chars_saved = self._prune_tool_outputs(messages)
|
||||||
|
if chars_saved > 0:
|
||||||
|
pruned_tokens = estimate_messages_tokens_rough(pruned_messages)
|
||||||
|
tokens_saved_phase1 = max(0, display_tokens - pruned_tokens)
|
||||||
|
if not self.quiet_mode:
|
||||||
|
print(
|
||||||
|
f" ✂️ Phase 1 (prune): removed {chars_saved:,} chars of old tool outputs "
|
||||||
|
f"(~{tokens_saved_phase1:,} tokens saved)"
|
||||||
|
)
|
||||||
|
if pruned_tokens <= self._prune_target_tokens:
|
||||||
|
self.compression_count += 1
|
||||||
|
pruned_messages = self._sanitize_tool_pairs(pruned_messages)
|
||||||
|
if not self.quiet_mode:
|
||||||
|
print(
|
||||||
|
f" ✅ Phase 1 sufficient: {n_messages} → {len(pruned_messages)} messages, "
|
||||||
|
f"now {pruned_tokens:,} tokens"
|
||||||
|
)
|
||||||
|
print(f" 💡 Compression #{self.compression_count} complete (prune only — no LLM call needed)")
|
||||||
|
return pruned_messages
|
||||||
|
if not self.quiet_mode and pruned_tokens < self.threshold_tokens:
|
||||||
|
print(
|
||||||
|
f" ↪️ Phase 1 recovered tokens but not enough runway "
|
||||||
|
f"({pruned_tokens:,} > target {self._prune_target_tokens:,}); continuing to compaction"
|
||||||
|
)
|
||||||
|
messages = pruned_messages
|
||||||
|
n_messages = len(messages)
|
||||||
|
compress_start = self.protect_first_n
|
||||||
|
compress_end = n_messages - self.protect_last_n
|
||||||
|
if compress_start >= compress_end:
|
||||||
|
return messages
|
||||||
|
compress_start = self._align_boundary_forward(messages, compress_start)
|
||||||
|
compress_end = self._align_boundary_backward(messages, compress_end)
|
||||||
|
if compress_start >= compress_end:
|
||||||
|
return messages
|
||||||
|
|
||||||
|
turns_to_summarize = messages[compress_start:compress_end]
|
||||||
|
|
||||||
if not self.quiet_mode:
|
if not self.quiet_mode:
|
||||||
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")
|
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")
|
||||||
|
|
||||||
|
|||||||
192
docs/plans/2026-03-14-cache-aware-context-compaction.md
Normal file
192
docs/plans/2026-03-14-cache-aware-context-compaction.md
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# Cache-Aware Context Compaction Design Note
|
||||||
|
|
||||||
|
> For Hermes: this note is a design/implementation sketch for revisiting prune-first compaction without optimizing token spend at the expense of prompt-cache stability.
|
||||||
|
|
||||||
|
Goal: reduce compression cost while keeping cache-break frequency as low as possible.
|
||||||
|
|
||||||
|
Architecture: keep Hermes' current invariant that conversation history is only mutated during context compression, then make prune-first compaction conservative enough that it only short-circuits when it buys meaningful runway. If pruning only gets us barely below threshold, fall through to the existing summary compaction immediately.
|
||||||
|
|
||||||
|
Tech Stack: `agent/context_compressor.py`, existing `call_llm()`-based summary path, pytest coverage in `tests/agent/test_context_compressor.py`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Baseline behavior on current main
|
||||||
|
|
||||||
|
Today Hermes behaves like this:
|
||||||
|
|
||||||
|
1. Prompt crosses the compression threshold.
|
||||||
|
2. We mutate transcript history once by summarizing the middle region with an LLM.
|
||||||
|
3. We preserve role alternation and tool-call/tool-result integrity.
|
||||||
|
4. We continue the conversation from the compressed transcript.
|
||||||
|
|
||||||
|
This is expensive in two ways:
|
||||||
|
- an auxiliary summary call is often required
|
||||||
|
- the entire compressed middle region is rewritten even when the real problem was just a few huge old tool outputs
|
||||||
|
|
||||||
|
But it has one strong cache property:
|
||||||
|
- it tends to reclaim a lot of headroom per compression event, so the next compression is usually farther away
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Why naive prune-first compaction is not enough
|
||||||
|
|
||||||
|
A naive prune-first policy says:
|
||||||
|
- prune old tool outputs
|
||||||
|
- if prompt is now below threshold, stop
|
||||||
|
|
||||||
|
This improves per-event token cost, but it can hurt cache economics:
|
||||||
|
- prune-only may reclaim less headroom than full compaction
|
||||||
|
- smaller headroom means the next compression may happen sooner
|
||||||
|
- each compression event is still a cache-breaking transcript mutation
|
||||||
|
|
||||||
|
So there is a real failure mode:
|
||||||
|
- fewer tokens per compression
|
||||||
|
- more compression events overall
|
||||||
|
- worse cache break cadence
|
||||||
|
|
||||||
|
That is exactly the tradeoff we want to avoid.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Cache-aware principle
|
||||||
|
|
||||||
|
Prune-first compaction should only short-circuit when it buys real runway, not when it merely dips under threshold.
|
||||||
|
|
||||||
|
Rule of thumb:
|
||||||
|
- compression frequency matters as much as compression size
|
||||||
|
- a smaller mutation is not automatically cheaper if it causes another mutation a few turns later
|
||||||
|
|
||||||
|
So the design target is:
|
||||||
|
- fewer auxiliary summary calls
|
||||||
|
- without materially increasing compression frequency
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Conservative prototype policy
|
||||||
|
|
||||||
|
The conservative prototype keeps all existing compression invariants and only changes the acceptance rule for prune-only compaction.
|
||||||
|
|
||||||
|
### Phase 1: prune old middle tool outputs
|
||||||
|
|
||||||
|
Only prune tool outputs that are:
|
||||||
|
- in the compressible middle region
|
||||||
|
- not in protected head/tail windows
|
||||||
|
- not from protected tools (`read_file`, `memory`, `clarify`, `skill_view`, `todo`)
|
||||||
|
|
||||||
|
### Phase 2: require a low-water mark
|
||||||
|
|
||||||
|
Do not accept prune-only just because it lands below threshold.
|
||||||
|
|
||||||
|
Instead require:
|
||||||
|
- `post_prune_tokens <= prune_target_tokens`
|
||||||
|
|
||||||
|
Where:
|
||||||
|
- `prune_runway_tokens = max(prune_minimum_tokens, 15% of threshold_tokens)`
|
||||||
|
- `prune_target_tokens = threshold_tokens - prune_runway_tokens`
|
||||||
|
|
||||||
|
Interpretation:
|
||||||
|
- pruning must get us comfortably below threshold
|
||||||
|
- otherwise we immediately fall through to normal LLM summary compaction
|
||||||
|
|
||||||
|
Why this helps:
|
||||||
|
- protects cache by avoiding "micro-compactions" that would be followed by another compression shortly after
|
||||||
|
- still avoids the summary call when pruning truly buys useful runway
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. What the prototype currently does
|
||||||
|
|
||||||
|
The prototype branch currently:
|
||||||
|
- keeps prune-first compaction
|
||||||
|
- adds the low-water / runway requirement above
|
||||||
|
- preserves current main behavior for summary role alternation
|
||||||
|
- preserves the centralized `call_llm()` summary path
|
||||||
|
- keeps head/tail and tool-call/result integrity handling unchanged
|
||||||
|
|
||||||
|
This means the branch is no longer optimizing only for token reduction per event; it is explicitly biased toward fewer compression events.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Metrics we should evaluate before merging any future version
|
||||||
|
|
||||||
|
A serious cache-aware review should measure all of these, not just token savings:
|
||||||
|
|
||||||
|
1. Compression events per 100 conversation turns
|
||||||
|
2. Average turns between compressions
|
||||||
|
3. Auxiliary summary calls per session
|
||||||
|
4. Average tokens reclaimed per compression event
|
||||||
|
5. Total prompt+auxiliary tokens spent over a long session
|
||||||
|
6. Earliest changed message index during compression
|
||||||
|
7. Ratio of prune-only compressions to full summary compressions
|
||||||
|
|
||||||
|
The most important comparison is:
|
||||||
|
- baseline main vs conservative prune-first
|
||||||
|
|
||||||
|
Success is not:
|
||||||
|
- "fewer tokens in one compression"
|
||||||
|
|
||||||
|
Success is:
|
||||||
|
- "equal or better total session cost without increasing compression/cache-break cadence in a meaningful way"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Better long-term directions
|
||||||
|
|
||||||
|
If we want a stronger cache story than conservative prune-first, these are the real next-step options:
|
||||||
|
|
||||||
|
### A. Insertion-time trimming
|
||||||
|
|
||||||
|
Best cache-preserving option.
|
||||||
|
|
||||||
|
Idea:
|
||||||
|
- trim or summarize giant tool outputs before they become durable transcript history
|
||||||
|
- keep a compact representation from the start instead of mutating history later
|
||||||
|
|
||||||
|
Pros:
|
||||||
|
- avoids later cache-breaking rewrites for those blobs
|
||||||
|
- makes transcript size stable earlier
|
||||||
|
|
||||||
|
Cons:
|
||||||
|
- more invasive design change
|
||||||
|
- requires careful UX and provenance handling
|
||||||
|
|
||||||
|
### B. Provider/backend-aware compaction policy
|
||||||
|
|
||||||
|
Different providers may reward:
|
||||||
|
- preserving a longer stable prefix
|
||||||
|
- or simply reducing total prompt size
|
||||||
|
|
||||||
|
We may eventually want backend-specific heuristics for:
|
||||||
|
- prune runway targets
|
||||||
|
- compression thresholds
|
||||||
|
- when to prefer summary vs pruning
|
||||||
|
|
||||||
|
### C. Explicit compression telemetry
|
||||||
|
|
||||||
|
If compression remains a core feature, `ContextCompressor` should expose enough telemetry to understand real-world cadence:
|
||||||
|
- prune-only count
|
||||||
|
- full summary count
|
||||||
|
- average recovered tokens
|
||||||
|
- last compression mode
|
||||||
|
|
||||||
|
This is not required for the conservative prototype, but it would make future tuning much easier.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Recommended next steps
|
||||||
|
|
||||||
|
1. Keep the conservative prototype local for review.
|
||||||
|
2. Run targeted tests plus long-session manual trials.
|
||||||
|
3. If it looks promising, add telemetry before opening another PR.
|
||||||
|
4. If cache stability remains the top priority, pursue insertion-time trimming instead of further read-time pruning tweaks.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Review question for Teknium
|
||||||
|
|
||||||
|
The key product question is:
|
||||||
|
|
||||||
|
"Should Hermes optimize compression primarily for per-event token cost, or for minimizing the number of transcript mutations over the lifetime of a session?"
|
||||||
|
|
||||||
|
This prototype assumes the answer is:
|
||||||
|
- prioritize fewer transcript mutations unless pruning buys substantial runway.
|
||||||
@@ -314,3 +314,143 @@ class TestCompressWithClient:
|
|||||||
for msg in result:
|
for msg in result:
|
||||||
if msg.get("role") == "tool" and msg.get("tool_call_id"):
|
if msg.get("role") == "tool" and msg.get("tool_call_id"):
|
||||||
assert msg["tool_call_id"] in called_ids
|
assert msg["tool_call_id"] in called_ids
|
||||||
|
|
||||||
|
|
||||||
|
class TestPruneToolOutputs:
    """Tests for ContextCompressor._prune_tool_outputs and the prune-only path."""

    def _make_compressor(self, *, context_length=128000, protect_first_n=2, protect_last_n=2):
        # Pin the model context length so threshold/protection math is deterministic.
        with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
            return ContextCompressor(
                model="test/model",
                threshold_percent=0.50,
                protect_first_n=protect_first_n,
                protect_last_n=protect_last_n,
                quiet_mode=True,
            )

    def test_prune_replaces_old_middle_tool_outputs(self):
        """Older middle tool outputs are pruned; the most recent one survives."""
        c = self._make_compressor(protect_last_n=1)
        # One output alone fills the recent-output protection budget
        # (~4 chars per token), so only the newer of the two is kept intact.
        big_content = "x" * (c._prune_protect_tokens * 4)
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": big_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": big_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        pruned, chars_saved = c._prune_tool_outputs(messages)

        assert chars_saved > 0
        # Older tool output replaced by a placeholder; newer one untouched.
        assert pruned[3]["content"].startswith("[Tool output pruned")
        assert pruned[5]["content"] == big_content

    def test_protected_tools_are_never_pruned(self):
        """Outputs from protected tools survive even when oversized."""
        c = self._make_compressor()
        big_content = "x" * (c._prune_protect_tokens * 8)
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": big_content, "name": "read_file"},
            {"role": "assistant", "content": "middle"},
            {"role": "tool", "content": big_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        pruned, _ = c._prune_tool_outputs(messages)
        # read_file is in NEVER_PRUNE_TOOLS, so its content is preserved.
        read_file_msg = next(msg for msg in pruned if msg.get("name") == "read_file")
        assert read_file_msg["content"] == big_content

    def test_prune_only_path_skips_summary_call_when_sufficient(self):
        """compress() may finish on pruning alone, without an LLM summary call."""
        c = self._make_compressor(protect_first_n=2, protect_last_n=1)
        huge_content = "x" * 180000
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        # Any summary attempt raises, proving the prune-only short-circuit fired.
        with patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
            result = c.compress(messages, current_tokens=200000)

        assert result[3]["content"].startswith("[Tool output pruned")
        assert result[5]["content"] == huge_content
        # A prune-only compaction still counts as a compression event.
        assert c.compression_count == 1

    def test_prune_does_not_touch_protected_tail_messages(self):
        """Messages inside the protected tail window keep their content."""
        c = self._make_compressor(context_length=128000, protect_first_n=2, protect_last_n=3)
        huge_content = "x" * (c._prune_protect_tokens * 8)
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail assistant"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "latest"},
        ]

        pruned, _ = c._prune_tool_outputs(messages)

        # The last protect_last_n=3 messages are outside the prunable region.
        assert pruned[-2]["content"] == huge_content
        assert pruned[-1]["content"] == "latest"
|
||||||
|
|
||||||
|
|
||||||
|
class TestPruneAcceptancePolicy:
    """Tests for the low-water-mark rule that gates prune-only compaction."""

    def _make_compressor(self, *, context_length=128000):
        # Pin the context-length lookup so the prune target is deterministic.
        with patch("agent.context_compressor.get_model_context_length", return_value=context_length):
            return ContextCompressor(
                model="test/model",
                threshold_percent=0.50,
                protect_first_n=2,
                protect_last_n=1,
                quiet_mode=True,
            )

    def test_prune_near_threshold_still_falls_back_to_summary(self):
        """Landing just under threshold is not enough; summary compaction runs."""
        c = self._make_compressor()
        huge_content = "x" * 180000
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compacted"

        # Post-prune estimate (62k) is below the 64k threshold but above the
        # prune target, so compaction falls through to the LLM summary path.
        with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=62000), \
             patch("agent.context_compressor.call_llm", return_value=mock_response):
            result = c.compress(messages, current_tokens=68000)

        assert any("CONTEXT SUMMARY" in (msg.get("content") or "") for msg in result)

    def test_prune_only_is_allowed_when_it_buys_real_runway(self):
        """Pruning well below the target accepts prune-only, skipping summary."""
        c = self._make_compressor()
        huge_content = "x" * 180000
        messages = [
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "task"},
            {"role": "assistant", "content": "older"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "newer"},
            {"role": "tool", "content": huge_content, "name": "terminal"},
            {"role": "assistant", "content": "tail"},
        ]

        # Post-prune estimate (48k) is comfortably below the prune target, so
        # any summary attempt would raise — proving the short-circuit.
        with patch("agent.context_compressor.estimate_messages_tokens_rough", return_value=48000), \
             patch.object(ContextCompressor, "_generate_summary", side_effect=AssertionError("summary should not be called")):
            result = c.compress(messages, current_tokens=68000)

        assert result[3]["content"].startswith("[Tool output pruned")
        assert result[5]["content"] == huge_content
|
||||||
|
|||||||
Reference in New Issue
Block a user