mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 17:27:37 +08:00
The user-visible /compress banner and the post-compression last_prompt_tokens writeback both counted only the raw message transcript (chars/4). With a 15KB system prompt and 30 tool schemas (~26KB), a 4-message transcript that looks like ~45 tokens to the transcript-only estimator is really ~10.5K tokens of request pressure — a 234x gap.

Two user-facing consequences:

- Banner shows 'Compressing … (~45 tokens)…' while compression is actually firing on 10K+ tokens of real pressure, confusing users about why compression triggered (reported by @codecovenant on X; #6217).
- Post-compression last_prompt_tokens writeback omits tool schemas, so the next should_compress() check compares real usage against a stale underestimate — compression triggers late, potentially past the model's context limit on small-context models (#14695).

Swap estimate_messages_tokens_rough() for estimate_request_tokens_rough() at every user-visible banner and at the post-compression writeback. estimate_request_tokens_rough() already existed for exactly this purpose and includes the system prompt + tool schemas; a sketch of the two estimators follows below.

Touched call sites:

- run_agent.py: post-compression last_prompt_tokens writeback, and the post-tool-call should_compress() fallback when provider usage is missing
- cli.py: /compress banner + summary
- gateway/run.py: gateway /compress banner + summary
- tui_gateway/server.py: TUI /compress status + summary
- acp_adapter/server.py: ACP /compact before/after

Left intentionally alone:

- Session-hygiene fallback and the 'no agent' /status path in gateway/run.py — no agent instance is in scope to query for system prompt/tools, and the existing 30-50% overestimate wobble on hygiene is safety-accepted.
- Verbose-mode 'Request size' logging — informational only; it already counts the system prompt via api_messages[0].

Also relabels the feedback line from 'Rough transcript estimate' to 'Approx request size' so the metric label matches what it actually measures.

Credits: diagnoses from @devilardis (#14695) and @Jackten (#6217); user report by @codecovenant on X (2026-04-30).

Closes #14695
Closes #6217
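For context, a minimal sketch of the gap this fix closes. Only the two function names come from the repo; the bodies here are hypothetical illustrations of the chars/4 heuristic described above, not the actual implementations:

import json
from typing import Any, Sequence

def estimate_messages_tokens_rough(messages: Sequence[dict[str, Any]]) -> int:
    # Transcript only: roughly 4 characters per token over message content.
    return sum(len(str(m.get("content", ""))) for m in messages) // 4

def estimate_request_tokens_rough(
    messages: Sequence[dict[str, Any]],
    system_prompt: str,
    tools: Sequence[dict[str, Any]],
) -> int:
    # Full request pressure: transcript plus system prompt plus serialized
    # tool schemas, all under the same ~4 chars/token heuristic.
    static_chars = len(system_prompt) + len(json.dumps(list(tools)))
    return estimate_messages_tokens_rough(messages) + static_chars // 4

Under this heuristic, a 15KB system prompt plus ~26KB of tool schemas contributes ~(15 KB + 26 KB) / 4 ≈ 10.5K tokens of static overhead on every request, which is how a transcript that estimates at ~45 tokens really carries ~10.5K tokens of request pressure.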
50 lines
1.5 KiB
Python
"""User-facing summaries for manual compression commands."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Sequence
|
|
|
|
|
|
def summarize_manual_compression(
|
|
before_messages: Sequence[dict[str, Any]],
|
|
after_messages: Sequence[dict[str, Any]],
|
|
before_tokens: int,
|
|
after_tokens: int,
|
|
) -> dict[str, Any]:
|
|
"""Return consistent user-facing feedback for manual compression."""
|
|
before_count = len(before_messages)
|
|
after_count = len(after_messages)
|
|
noop = list(after_messages) == list(before_messages)
|
|
|
|
if noop:
|
|
headline = f"No changes from compression: {before_count} messages"
|
|
if after_tokens == before_tokens:
|
|
token_line = (
|
|
f"Approx request size: ~{before_tokens:,} tokens (unchanged)"
|
|
)
|
|
else:
|
|
token_line = (
|
|
f"Approx request size: ~{before_tokens:,} → "
|
|
f"~{after_tokens:,} tokens"
|
|
)
|
|
else:
|
|
headline = f"Compressed: {before_count} → {after_count} messages"
|
|
token_line = (
|
|
f"Approx request size: ~{before_tokens:,} → "
|
|
f"~{after_tokens:,} tokens"
|
|
)
|
|
|
|
note = None
|
|
if not noop and after_count < before_count and after_tokens > before_tokens:
|
|
note = (
|
|
"Note: fewer messages can still raise this estimate when "
|
|
"compression rewrites the transcript into denser summaries."
|
|
)
|
|
|
|
return {
|
|
"noop": noop,
|
|
"headline": headline,
|
|
"token_line": token_line,
|
|
"note": note,
|
|
}
|
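For reference, a quick usage sketch of the helper above (the sample messages and token counts are made up):

before = [
    {"role": "user", "content": "Find the bug in run_agent.py"},
    {"role": "assistant", "content": "Looking now..."},
    {"role": "user", "content": "Any luck?"},
    {"role": "assistant", "content": "Yes, patch below."},
]
after = [{"role": "assistant", "content": "Summary: diagnosed and patched run_agent.py."}]

result = summarize_manual_compression(before, after, before_tokens=10_500, after_tokens=3_200)
print(result["headline"])    # Compressed: 4 → 1 messages
print(result["token_line"])  # Approx request size: ~10,500 → ~3,200 tokens
print(result["note"])        # None — token count dropped, so no caveat is attached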