Compare commits

...

2 Commits

Author SHA1 Message Date
Teknium
bd0c3eadd1 fix: prevent context pressure warning spam (agent loop + gateway rate-limit)
Two complementary fixes for repeated context pressure warnings spamming
gateway users (Telegram, Discord, etc.):

1. Agent-level loop fix (run_agent.py):
   After compression, only reset _context_pressure_warned if the
   post-compression estimate is actually below the 85% warning level.
   Previously the flag was unconditionally reset, causing the warning
   to re-fire every loop iteration when compression couldn't reduce
   below 85% of the threshold (e.g. very low threshold like 15%,
   or system prompt alone exceeds the warning level).

2. Gateway-level rate-limit (gateway/run.py, salvaged from PR #3786):
   Per-chat_id cooldown of 1 hour on compression warning messages.
   Both warning paths ('still large after compression' and 'compression
   failed') are gated. Defense-in-depth — even if the agent-level fix
   has edge cases, users won't see more than one warning per hour.

Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com>
2026-03-30 13:02:57 -07:00
Teknium
9a61265824 feat: add /yolo slash command to toggle dangerous command approvals
Adds a /yolo command that toggles HERMES_YOLO_MODE at runtime, skipping
all dangerous command approval prompts for the current session. Works in
both CLI and gateway (Telegram, Discord, etc.).

- /yolo -> ON: all commands auto-approved, no confirmation prompts
- /yolo -> OFF: normal approval flow restored

The --yolo CLI flag already existed for launch-time opt-in. This adds
the ability to toggle mid-session without restarting.

Process-scoped — the setting is stored in the existing HERMES_YOLO_MODE
env var (which check_all_command_guards() already respects), so it
resets when the process ends.
2026-03-30 09:18:04 -07:00
5 changed files with 100 additions and 7 deletions

13
cli.py
View File

@@ -3836,6 +3836,8 @@ class HermesCLI:
self.console.print(f" Status bar {state}")
elif canonical == "verbose":
self._toggle_verbose()
elif canonical == "yolo":
self._toggle_yolo()
elif canonical == "reasoning":
self._handle_reasoning_command(cmd_original)
elif canonical == "compress":
@@ -4434,6 +4436,17 @@ class HermesCLI:
}
_cprint(labels.get(self.tool_progress_mode, ""))
def _toggle_yolo(self):
    """Toggle YOLO mode — skip all dangerous command approval prompts.

    Flips the ``HERMES_YOLO_MODE`` environment variable of the running
    process and prints the resulting state to ``self.console``.

    Any non-empty value of the variable counts as "on". Turning the mode
    on stores ``"1"``; turning it off removes the variable entirely.
    """
    import os

    if os.environ.get("HERMES_YOLO_MODE"):
        # Currently enabled -> disable by removing the variable.
        os.environ.pop("HERMES_YOLO_MODE", None)
        self.console.print(" ⚠ YOLO mode [bold red]OFF[/] — dangerous commands will require approval.")
    else:
        # Unset or empty -> enable.
        os.environ["HERMES_YOLO_MODE"] = "1"
        self.console.print(" ⚡ YOLO mode [bold green]ON[/] — all commands auto-approved. Use with caution.")
def _handle_reasoning_command(self, cmd: str):
"""Handle /reasoning — manage effort level and display toggle.

View File

@@ -432,6 +432,13 @@ class GatewayRunner:
self._honcho_managers: Dict[str, Any] = {}
self._honcho_configs: Dict[str, Any] = {}
# Rate-limit compression warning messages sent to users.
# Keyed by chat_id — value is the timestamp of the last warning sent.
# Prevents the warning from firing on every message when a session
# remains above the threshold after compression.
self._compression_warn_sent: Dict[str, float] = {}
self._compression_warn_cooldown: int = 3600 # seconds (1 hour)
# Ensure tirith security scanner is available (downloads if needed)
try:
from tools.tirith_security import ensure_installed
@@ -1830,6 +1837,9 @@ class GatewayRunner:
if canonical == "verbose":
return await self._handle_verbose_command(event)
if canonical == "yolo":
return await self._handle_yolo_command(event)
if canonical == "provider":
return await self._handle_provider_command(event)
@@ -2344,13 +2354,18 @@ class GatewayRunner:
pass
# Still too large after compression — warn user
# Rate-limited to once per cooldown period per
# chat to avoid spamming on every message.
if _new_tokens >= _warn_token_threshold:
logger.warning(
"Session hygiene: still ~%s tokens after "
"compression — suggesting /reset",
f"{_new_tokens:,}",
)
if _hyg_adapter:
_now = time.time()
_last_warn = self._compression_warn_sent.get(source.chat_id, 0)
if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
self._compression_warn_sent[source.chat_id] = _now
try:
await _hyg_adapter.send(
source.chat_id,
@@ -2372,7 +2387,10 @@ class GatewayRunner:
if _approx_tokens >= _warn_token_threshold:
_hyg_adapter = self.adapters.get(source.platform)
_hyg_meta = {"thread_id": source.thread_id} if source.thread_id else None
if _hyg_adapter:
_now = time.time()
_last_warn = self._compression_warn_sent.get(source.chat_id, 0)
if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
self._compression_warn_sent[source.chat_id] = _now
try:
await _hyg_adapter.send(
source.chat_id,
@@ -3999,6 +4017,16 @@ class GatewayRunner:
else:
return f"🧠 ✓ Reasoning effort set to `{effort}` (this session only)"
async def _handle_yolo_command(self, event: MessageEvent) -> str:
    """Handle /yolo — toggle dangerous command approval bypass.

    Flips the ``HERMES_YOLO_MODE`` environment variable and returns the
    confirmation text for the new state. ``event`` is not read; the
    parameter matches the signature of the other command handlers.

    NOTE(review): ``os.environ`` is process-wide, so the toggle is not
    limited to the chat that issued the command — confirm this is the
    intended scope for a multi-chat gateway.
    """
    if os.environ.get("HERMES_YOLO_MODE"):
        # Currently enabled -> disable by removing the variable.
        os.environ.pop("HERMES_YOLO_MODE", None)
        return "⚠️ YOLO mode **OFF** — dangerous commands will require approval."

    # Unset or empty -> enable.
    os.environ["HERMES_YOLO_MODE"] = "1"
    return "⚡ YOLO mode **ON** — all commands auto-approved. Use with caution."
async def _handle_verbose_command(self, event: MessageEvent) -> str:
"""Handle /verbose command — cycle tool progress display mode.

View File

@@ -90,6 +90,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
CommandDef("verbose", "Cycle tool progress display: off -> new -> all -> verbose",
"Configuration", cli_only=True,
gateway_config_gate="display.tool_progress_command"),
CommandDef("yolo", "Toggle YOLO mode (skip all dangerous command approvals)",
"Configuration"),
CommandDef("reasoning", "Manage reasoning effort and display", "Configuration",
args_hint="[level|show|hide]",
subcommands=("none", "low", "minimal", "medium", "high", "xhigh", "show", "hide", "on", "off")),

View File

@@ -5204,11 +5204,8 @@ class AIAgent:
except Exception as e:
logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
# Reset context pressure warning and token estimate — usage drops
# after compaction. Without this, the stale last_prompt_tokens from
# the previous API call causes the pressure calculation to stay at
# >1000% and spam warnings / re-trigger compression in a loop.
self._context_pressure_warned = False
# Update token estimate after compaction so pressure calculations
# use the post-compression count, not the stale pre-compression one.
_compressed_est = (
estimate_tokens_rough(new_system_prompt)
+ estimate_messages_tokens_rough(compressed)
@@ -5216,6 +5213,16 @@ class AIAgent:
self.context_compressor.last_prompt_tokens = _compressed_est
self.context_compressor.last_completion_tokens = 0
# Only reset the pressure warning if compression actually brought
# us below the warning level (85% of threshold). When compression
# can't reduce enough (e.g. threshold is very low, or system prompt
# alone exceeds the warning level), keep the flag set to prevent
# spamming the user with repeated warnings every loop iteration.
if self.context_compressor.threshold_tokens > 0:
_post_progress = _compressed_est / self.context_compressor.threshold_tokens
if _post_progress < 0.85:
self._context_pressure_warned = False
return compressed, new_system_prompt
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:

View File

@@ -212,6 +212,49 @@ class TestSessionHygieneWarnThreshold:
assert post_compress_tokens < warn_threshold
class TestCompressionWarnRateLimit:
    """Compression warning messages must be rate-limited per chat_id.

    These tests check the cooldown predicate
    (``now - last_sent >= cooldown``) against the runner's bookkeeping
    attributes; they do not drive the gateway's message-handling path.
    """

    def _make_runner(self):
        """Build a bare GatewayRunner carrying only the rate-limit state.

        ``__new__`` is used so ``__init__`` does not run; the two
        attributes the predicate reads are assigned by hand.
        """
        from unittest.mock import patch

        with patch("gateway.run.load_gateway_config"), \
             patch("gateway.run.SessionStore"), \
             patch("gateway.run.DeliveryRouter"):
            from gateway.run import GatewayRunner

            runner = GatewayRunner.__new__(GatewayRunner)
            runner._compression_warn_sent = {}
            runner._compression_warn_cooldown = 3600
            return runner

    def test_first_warn_is_sent(self):
        """With no prior entry the default timestamp 0 lets the warning through."""
        runner = self._make_runner()
        now = 1_000_000.0
        last = runner._compression_warn_sent.get("chat:1", 0)
        assert now - last >= runner._compression_warn_cooldown

    def test_second_warn_suppressed_within_cooldown(self):
        """A warning sent one minute ago is still inside the cooldown window."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 60  # 1 minute ago
        last = runner._compression_warn_sent.get("chat:1", 0)
        assert now - last < runner._compression_warn_cooldown

    def test_warn_allowed_after_cooldown(self):
        """Once the cooldown has elapsed the warning may be sent again."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 3601  # just past cooldown
        last = runner._compression_warn_sent.get("chat:1", 0)
        assert now - last >= runner._compression_warn_cooldown

    def test_rate_limit_is_per_chat(self):
        """Rate-limiting one chat must not suppress warnings for another."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 60  # suppressed
        last_other = runner._compression_warn_sent.get("chat:2", 0)
        assert now - last_other >= runner._compression_warn_cooldown
class TestEstimatedTokenThreshold:
"""Verify that hygiene thresholds are always below the model's context
limit — for both actual and estimated token counts.