diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index aa05a8daa0..0c6cb215dd 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -65,12 +65,42 @@ class ContextCompressor:
         self.summary_model = summary_model_override or ""
 
+        # Context usage warning thresholds (fire once each per session)
+        self._warned_80 = False
+        self._warned_95 = False
+
     def update_from_response(self, usage: Dict[str, Any]):
         """Update tracked token usage from API response."""
         self.last_prompt_tokens = usage.get("prompt_tokens", 0)
         self.last_completion_tokens = usage.get("completion_tokens", 0)
         self.last_total_tokens = usage.get("total_tokens", 0)
 
+    def check_context_warning(self) -> str | None:
+        """Return a warning string if context usage crossed a threshold, else None.
+
+        Each threshold fires at most once per session to avoid spam.
+        """
+        if not self.context_length or not self.last_prompt_tokens:
+            return None
+        pct = self.last_prompt_tokens / self.context_length
+        if pct >= 0.95 and not self._warned_95:
+            self._warned_95 = True
+            self._warned_80 = True  # suppress the weaker warning if usage later dips below 95%
+            used = f"{self.last_prompt_tokens:,}"
+            total = f"{self.context_length:,}"
+            return (
+                f"⚠ Context nearly exhausted ({used}/{total} tokens, {pct:.0%}). "
+                f"Risk of errors or truncation. Use /new to start fresh."
+            )
+        if pct >= 0.80 and not self._warned_80:
+            self._warned_80 = True
+            used = f"{self.last_prompt_tokens:,}"
+            total = f"{self.context_length:,}"
+            return (
+                f"⚠ Context window {pct:.0%} full ({used}/{total} tokens). "
+                f"Consider /compress or /new if responses degrade."
+            )
+
     def should_compress(self, prompt_tokens: int = None) -> bool:
         """Check if context exceeds the compression threshold."""
         tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
diff --git a/run_agent.py b/run_agent.py
index 8a4147a8b0..792b90b6bd 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -5102,6 +5102,10 @@ class AIAgent:
         if hasattr(response, 'usage') and response.usage:
             if self.api_mode in ("codex_responses", "anthropic_messages"):
                 prompt_tokens = getattr(response.usage, 'input_tokens', 0) or 0
+                # Include cached input tokens for accurate context tracking
+                # (Anthropic reports non-cached, cache-read, and cache-creation separately)
+                prompt_tokens += getattr(response.usage, 'cache_read_input_tokens', 0) or 0
+                prompt_tokens += getattr(response.usage, 'cache_creation_input_tokens', 0) or 0
                 completion_tokens = getattr(response.usage, 'output_tokens', 0) or 0
                 total_tokens = (
                     getattr(response.usage, 'total_tokens', None)
@@ -5118,6 +5118,15 @@ class AIAgent:
                 }
                 self.context_compressor.update_from_response(usage_dict)
 
+                # Emit one-time warnings when context crosses 80% or 95%.
+                # Always show these (even in quiet_mode) — they're critical
+                # user-facing alerts, not debug noise. Only suppress for
+                # subagents (delegate_depth > 0) where the user can't act.
+                if getattr(self, '_delegate_depth', 0) == 0:
+                    _ctx_warning = self.context_compressor.check_context_warning()
+                    if _ctx_warning:
+                        print(f"\n{_ctx_warning}\n")
+
                 # Cache discovered context length after successful call
                 if self.context_compressor._context_probed:
                     ctx = self.context_compressor.context_length
diff --git a/tests/test_context_warning.py b/tests/test_context_warning.py
new file mode 100644
index 0000000000..2186d30d3a
--- /dev/null
+++ b/tests/test_context_warning.py
@@ -0,0 +1,70 @@
+"""Tests for context window usage warnings."""
+
+from agent.context_compressor import ContextCompressor
+
+
+class TestContextWarning:
+    def _make_compressor(self, context_length=200_000):
+        c = ContextCompressor(model="test/model", threshold_percent=0.50)
+        c.context_length = context_length
+        c.threshold_tokens = int(context_length * 0.50)
+        return c
+
+    def test_no_warning_below_80_percent(self):
+        c = self._make_compressor()
+        c.update_from_response({"prompt_tokens": 100_000})  # 50%
+        assert c.check_context_warning() is None
+
+    def test_warning_at_80_percent(self):
+        c = self._make_compressor()
+        c.update_from_response({"prompt_tokens": 160_000})  # 80%
+        warning = c.check_context_warning()
+        assert warning is not None
+        assert "80%" in warning
+        assert "/compress" in warning
+
+    def test_warning_at_95_percent(self):
+        c = self._make_compressor()
+        c.update_from_response({"prompt_tokens": 190_000})  # 95%
+        warning = c.check_context_warning()
+        assert warning is not None
+        assert "95%" in warning
+        assert "/new" in warning
+
+    def test_warning_fires_only_once_per_threshold(self):
+        c = self._make_compressor()
+        c.update_from_response({"prompt_tokens": 170_000})  # 85%
+        w1 = c.check_context_warning()
+        assert w1 is not None  # First time at 80%
+
+        c.update_from_response({"prompt_tokens": 175_000})  # Still above 80%
+        w2 = c.check_context_warning()
+        assert w2 is None  # Already warned
+
+    def test_95_fires_after_80_already_warned(self):
+        c = self._make_compressor()
+        c.update_from_response({"prompt_tokens": 165_000})  # 82.5%
+        w1 = c.check_context_warning()
+        assert w1 is not None
+        assert "82%" in w1 or "Context window" in w1
+
+        c.update_from_response({"prompt_tokens": 195_000})  # 97.5%
+        w2 = c.check_context_warning()
+        assert w2 is not None
+        assert "nearly exhausted" in w2  # Escalated warning
+
+    def test_no_warning_when_context_length_zero(self):
+        c = self._make_compressor(context_length=0)
+        c.update_from_response({"prompt_tokens": 100_000})
+        assert c.check_context_warning() is None
+
+    def test_no_warning_when_no_tokens(self):
+        c = self._make_compressor()
+        assert c.check_context_warning() is None
+
+    def test_warning_includes_token_counts(self):
+        c = self._make_compressor(context_length=100_000)
+        c.update_from_response({"prompt_tokens": 85_000})
+        warning = c.check_context_warning()
+        assert "85,000" in warning
+        assert "100,000" in warning