mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-29 15:31:38 +08:00
Compare commits
1 Commits
fix/plugin
...
hermes/her
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
885c5dc5e6 |
@@ -65,12 +65,41 @@ class ContextCompressor:
|
||||
|
||||
self.summary_model = summary_model_override or ""
|
||||
|
||||
# Context usage warning thresholds (fire once each per session)
|
||||
self._warned_80 = False
|
||||
self._warned_95 = False
|
||||
|
||||
def update_from_response(self, usage: Dict[str, Any]):
    """Update tracked token usage from an API response.

    Reads ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``
    from *usage* and stores them on ``last_prompt_tokens``,
    ``last_completion_tokens`` and ``last_total_tokens`` respectively.

    Args:
        usage: Mapping of usage counters taken from the API response.
            Keys that are missing, or present with a ``None`` value,
            are recorded as ``0``.
    """
    # ``dict.get(key, 0)`` only falls back to 0 when the key is absent; a key
    # that is present with an explicit ``None`` would otherwise leak ``None``
    # into the counters. ``or 0`` keeps the counters numeric in both cases.
    self.last_prompt_tokens = usage.get("prompt_tokens") or 0
    self.last_completion_tokens = usage.get("completion_tokens") or 0
    self.last_total_tokens = usage.get("total_tokens") or 0
|
||||
|
||||
def check_context_warning(self) -> str | None:
|
||||
"""Return a warning string if context usage crossed a threshold, else None.
|
||||
|
||||
Each threshold fires at most once per session to avoid spam.
|
||||
"""
|
||||
if not self.context_length or not self.last_prompt_tokens:
|
||||
return None
|
||||
pct = self.last_prompt_tokens / self.context_length
|
||||
if pct >= 0.95 and not self._warned_95:
|
||||
self._warned_95 = True
|
||||
used = f"{self.last_prompt_tokens:,}"
|
||||
total = f"{self.context_length:,}"
|
||||
return (
|
||||
f"⚠ Context nearly exhausted ({used}/{total} tokens, {pct:.0%}). "
|
||||
f"Risk of errors or truncation. Use /new to start fresh."
|
||||
)
|
||||
if pct >= 0.80 and not self._warned_80:
|
||||
self._warned_80 = True
|
||||
used = f"{self.last_prompt_tokens:,}"
|
||||
total = f"{self.context_length:,}"
|
||||
return (
|
||||
f"⚠ Context window {pct:.0%} full ({used}/{total} tokens). "
|
||||
f"Consider /compress or /new if responses degrade."
|
||||
)
|
||||
|
||||
def should_compress(self, prompt_tokens: int = None) -> bool:
|
||||
"""Check if context exceeds the compression threshold."""
|
||||
tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
|
||||
|
||||
13
run_agent.py
13
run_agent.py
@@ -5102,6 +5102,10 @@ class AIAgent:
|
||||
if hasattr(response, 'usage') and response.usage:
|
||||
if self.api_mode in ("codex_responses", "anthropic_messages"):
|
||||
prompt_tokens = getattr(response.usage, 'input_tokens', 0) or 0
|
||||
# Include cached input tokens for accurate context tracking
|
||||
# (Anthropic reports non-cached, cache-read, and cache-creation separately)
|
||||
prompt_tokens += getattr(response.usage, 'cache_read_input_tokens', 0) or 0
|
||||
prompt_tokens += getattr(response.usage, 'cache_creation_input_tokens', 0) or 0
|
||||
completion_tokens = getattr(response.usage, 'output_tokens', 0) or 0
|
||||
total_tokens = (
|
||||
getattr(response.usage, 'total_tokens', None)
|
||||
@@ -5118,6 +5122,15 @@ class AIAgent:
|
||||
}
|
||||
self.context_compressor.update_from_response(usage_dict)
|
||||
|
||||
# Emit one-time warnings when context crosses 80% or 95%.
|
||||
# Always show these (even in quiet_mode) — they're critical
|
||||
# user-facing alerts, not debug noise. Only suppress for
|
||||
# subagents (delegate_depth > 0) where the user can't act.
|
||||
if getattr(self, '_delegate_depth', 0) == 0:
|
||||
_ctx_warning = self.context_compressor.check_context_warning()
|
||||
if _ctx_warning:
|
||||
print(f"\n{_ctx_warning}\n")
|
||||
|
||||
# Cache discovered context length after successful call
|
||||
if self.context_compressor._context_probed:
|
||||
ctx = self.context_compressor.context_length
|
||||
|
||||
70
tests/test_context_warning.py
Normal file
70
tests/test_context_warning.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""Tests for context window usage warnings."""
|
||||
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
|
||||
class TestContextWarning:
    """Exercise ``ContextCompressor.check_context_warning`` threshold behavior."""

    def _make_compressor(self, context_length=200_000):
        """Build a compressor with a fixed context length and 50% threshold."""
        compressor = ContextCompressor(model="test/model", threshold_percent=0.50)
        compressor.context_length = context_length
        compressor.threshold_tokens = int(context_length * 0.50)
        return compressor

    def test_no_warning_below_80_percent(self):
        """Usage at 50% of the window produces no warning."""
        compressor = self._make_compressor()
        compressor.update_from_response({"prompt_tokens": 100_000})
        assert compressor.check_context_warning() is None

    def test_warning_at_80_percent(self):
        """Usage at exactly 80% yields the advisory message."""
        compressor = self._make_compressor()
        compressor.update_from_response({"prompt_tokens": 160_000})
        message = compressor.check_context_warning()
        assert message is not None
        assert "80%" in message
        assert "/compress" in message

    def test_warning_at_95_percent(self):
        """Usage at exactly 95% yields the critical message."""
        compressor = self._make_compressor()
        compressor.update_from_response({"prompt_tokens": 190_000})
        message = compressor.check_context_warning()
        assert message is not None
        assert "95%" in message
        assert "/new" in message

    def test_warning_fires_only_once_per_threshold(self):
        """A second check while still above 80% stays silent."""
        compressor = self._make_compressor()

        # 85% -> first crossing of the 80% threshold.
        compressor.update_from_response({"prompt_tokens": 170_000})
        first = compressor.check_context_warning()
        assert first is not None

        # 87.5% -> same threshold, already reported.
        compressor.update_from_response({"prompt_tokens": 175_000})
        second = compressor.check_context_warning()
        assert second is None

    def test_95_fires_after_80_already_warned(self):
        """The 95% warning still fires after the 80% one was issued."""
        compressor = self._make_compressor()

        # 82.5% -> advisory warning.
        compressor.update_from_response({"prompt_tokens": 165_000})
        advisory = compressor.check_context_warning()
        assert advisory is not None
        assert any(marker in advisory for marker in ("82%", "Context window"))

        # 97.5% -> escalated warning.
        compressor.update_from_response({"prompt_tokens": 195_000})
        critical = compressor.check_context_warning()
        assert critical is not None
        assert "nearly exhausted" in critical

    def test_no_warning_when_context_length_zero(self):
        """A zero context length disables warnings."""
        compressor = self._make_compressor(context_length=0)
        compressor.update_from_response({"prompt_tokens": 100_000})
        assert compressor.check_context_warning() is None

    def test_no_warning_when_no_tokens(self):
        """No usage recorded yet means no warning."""
        compressor = self._make_compressor()
        assert compressor.check_context_warning() is None

    def test_warning_includes_token_counts(self):
        """The message embeds thousands-separated used and total token counts."""
        compressor = self._make_compressor(context_length=100_000)
        compressor.update_from_response({"prompt_tokens": 85_000})
        message = compressor.check_context_warning()
        assert "85,000" in message
        assert "100,000" in message
|
||||
Reference in New Issue
Block a user