fix(compressor): use text char sum for multimodal token estimation in _find_tail_cut_by_tokens

_find_tail_cut_by_tokens called len(content) to estimate message tokens.
When content is a list of blocks (multimodal: text + image_url), len()
returns block count (e.g. 2) rather than character count, so a message
with 500 chars of text was counted as ~10 tokens instead of ~135.

This caused the backward walk to exhaust all messages before hitting the
budget ceiling; the head_end safeguard then forced cut = n - min_tail,
shrinking the protected tail to the bare minimum and preventing effective
compression of long multimodal conversations.

Fix mirrors the existing pattern in _prune_old_tool_results (line 487):
  sum(len(p.get("text", "")) for p in raw_content)
  if isinstance(raw_content, list) else len(raw_content)

Tests: 3 new cases in TestTokenBudgetTailProtection — a multimodal
regression guard (fails without the fix), a plain-string guard confirming
estimation is unchanged for string content, and an image-only block edge case.

Fixes #16087.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
briandevans
2026-04-26 08:38:16 -07:00
committed by Teknium
parent 3e68809fe0
commit cfc8befe65
2 changed files with 83 additions and 2 deletions

View File

@@ -1082,8 +1082,13 @@ The user has requested that this compaction PRIORITISE preserving all informatio
for i in range(n - 1, head_end - 1, -1): for i in range(n - 1, head_end - 1, -1):
msg = messages[i] msg = messages[i]
content = msg.get("content") or "" raw_content = msg.get("content") or ""
msg_tokens = len(content) // _CHARS_PER_TOKEN + 10 # +10 for role/metadata content_len = (
sum(len(p.get("text", "")) for p in raw_content)
if isinstance(raw_content, list)
else len(raw_content)
)
msg_tokens = content_len // _CHARS_PER_TOKEN + 10 # +10 for role/metadata
# Include tool call arguments in estimate # Include tool call arguments in estimate
for tc in msg.get("tool_calls") or []: for tc in msg.get("tool_calls") or []:
if isinstance(tc, dict): if isinstance(tc, dict):

View File

@@ -846,6 +846,82 @@ class TestTokenBudgetTailProtection:
# so it might or might not be pruned depending on boundary # so it might or might not be pruned depending on boundary
assert isinstance(pruned, int) assert isinstance(pruned, int)
def test_multimodal_message_accumulates_text_chars_not_block_count(self, budget_compressor):
    """Regression guard for #16087: multimodal token estimation.

    _find_tail_cut_by_tokens must accumulate the character count of the
    text blocks inside a multimodal content list rather than taking
    len() of the list itself.

    Layout: 6 messages, budget=80 (soft ceiling 120). Message 1 carries
    500 chars of text in blocks → 135 tokens when counted correctly but
    only 10 under the bug. The fixed walk stops at message 1
    (44 + 135 = 179 > 120), leaving cut=2 and a 4-message tail; the
    buggy walk exhausts to head_end and the safeguard shrinks the tail
    to n - min_tail = 3 messages.
    """
    compressor = budget_compressor
    compressor.tail_token_budget = 80  # soft_ceiling = 120

    # 500 chars → 500//4 + 10 = 135 tokens; len([text, image]) // 4 + 10 = 10 (bug)
    filler = "x" * 500
    blocks = [
        {"type": "text", "text": filler},
        {"type": "image_url", "image_url": {"url": "https://example.com/img.jpg"}},
    ]
    messages = [
        {"role": "user", "content": "head1"},       # 0
        {"role": "user", "content": blocks},        # 1: BIG (index under test)
        {"role": "assistant", "content": "tail1"},  # 2
        {"role": "user", "content": "tail2"},       # 3
        {"role": "assistant", "content": "tail3"},  # 4
        {"role": "user", "content": "tail4"},       # 5
    ]

    cut = compressor._find_tail_cut_by_tokens(messages, 0)

    # Fixed code: cut=2 → 4 tail messages. Buggy code: safeguard forces cut=3.
    assert len(messages) - cut >= 4, (
        f"Expected ≥4 messages in tail (got {len(messages) - cut}, cut={cut}). "
        "The multimodal message was underestimated — len(list) used instead of text chars."
    )
def test_plain_string_content_unchanged(self, budget_compressor):
    """Plain string content must still be estimated correctly after the fix.

    Mirrors the multimodal test layout, but message 1 is a bare
    500-char string. Buggy and fixed code measure plain strings
    identically via len(str): 135 tokens exceed soft_ceiling=120, so
    the walk halts at index 1 and 4 messages remain in the tail.
    """
    compressor = budget_compressor
    compressor.tail_token_budget = 80

    messages = [
        {"role": "user", "content": "head1"},
        {"role": "user", "content": "x" * 500},  # 1: 135 tokens, plain string
    ]
    # tail1..tail4 with alternating assistant/user roles, as in the multimodal test
    for idx, role in enumerate(("assistant", "user", "assistant", "user"), start=1):
        messages.append({"role": role, "content": f"tail{idx}"})

    cut = compressor._find_tail_cut_by_tokens(messages, 0)

    assert len(messages) - cut >= 4, (
        f"Plain string regression: expected ≥4 messages in tail, got {len(messages) - cut}"
    )
def test_image_only_block_contributes_zero_text_chars(self, budget_compressor):
    """Image-only content blocks (no 'text' key) contribute 0 chars + base overhead."""
    compressor = budget_compressor
    compressor.tail_token_budget = 500

    picture_only = [
        {"type": "image_url", "image_url": {"url": "https://example.com/x.jpg"}},
    ]
    messages = [
        {"role": "user", "content": "a" * 4000},
        {"role": "user", "content": picture_only},  # 0 text chars → 10 tokens overhead
        {"role": "assistant", "content": "ok"},
    ]

    cut = compressor._find_tail_cut_by_tokens(messages, 0)

    # The call must not raise and must yield a cut index within bounds.
    assert isinstance(cut, int)
    assert 0 <= cut <= len(messages)
class TestUpdateModelBudgets: class TestUpdateModelBudgets:
"""Regression: update_model() must recalculate token budgets.""" """Regression: update_model() must recalculate token budgets."""