Compare commits

...

1 Commits

Author SHA1 Message Date
teknium1
1b8a1c7d5e fix: handle multimodal content in context compression summarization
The _generate_summary() method assumed message content is always a
string (msg.get('content') or ''). When content is a multimodal list
(e.g. [{type: 'text', text: '...'}, {type: 'image_url', ...}]), this
produced mangled output: len() returned the list length instead of
character count, and slicing produced list items instead of substrings.

Add _content_to_text() helper that safely converts any content format
to plain text:
- str → returned as-is
- None → empty string
- list (multimodal) → text parts joined, images replaced with [image]
- dict/other → JSON serialization with str() fallback

This ensures multimodal conversations compress correctly instead of
producing garbled summaries.

Inspired by PR #776 by @kshitijk4poor.
2026-03-11 05:42:31 -07:00
2 changed files with 101 additions and 1 deletions

View File

@@ -5,6 +5,7 @@ Uses Gemini Flash (cheap/fast) to summarize middle turns while
protecting head and tail context. protecting head and tail context.
""" """
import json
import logging import logging
import os import os
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@@ -82,6 +83,41 @@ class ContextCompressor:
"compression_count": self.compression_count, "compression_count": self.compression_count,
} }
@staticmethod
def _content_to_text(content: Any) -> str:
"""Convert message content to plain text for summarization.
Handles:
- str → returned as-is
- None → empty string
- list (multimodal) → text parts joined, images replaced with [image]
- other → JSON serialization or str() fallback
"""
if isinstance(content, str):
return content
if content is None:
return ""
if isinstance(content, list):
parts = []
for item in content:
if isinstance(item, dict):
item_type = item.get("type")
if item_type == "text":
parts.append(item.get("text", ""))
elif item_type == "image_url":
parts.append("[image]")
elif item_type:
parts.append(f"[{item_type}]")
else:
parts.append(str(item))
else:
parts.append(str(item))
return "\n".join(part for part in parts if part)
try:
return json.dumps(content, ensure_ascii=False, sort_keys=True)
except TypeError:
return str(content)
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]: def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
"""Generate a concise summary of conversation turns. """Generate a concise summary of conversation turns.
@@ -93,7 +129,7 @@ class ContextCompressor:
parts = [] parts = []
for msg in turns_to_summarize: for msg in turns_to_summarize:
role = msg.get("role", "unknown") role = msg.get("role", "unknown")
content = msg.get("content") or "" content = self._content_to_text(msg.get("content"))
if len(content) > 2000: if len(content) > 2000:
content = content[:1000] + "\n...[truncated]...\n" + content[-500:] content = content[:1000] + "\n...[truncated]...\n" + content[-500:]
tool_calls = msg.get("tool_calls", []) tool_calls = msg.get("tool_calls", [])

View File

@@ -115,6 +115,70 @@ class TestCompress:
assert result[-2]["content"] == msgs[-2]["content"] assert result[-2]["content"] == msgs[-2]["content"]
class TestContentToText:
"""Test _content_to_text handles all content types without crashing."""
def test_string_passthrough(self, compressor):
assert compressor._content_to_text("hello") == "hello"
def test_none_returns_empty(self, compressor):
assert compressor._content_to_text(None) == ""
def test_multimodal_text_parts(self, compressor):
content = [
{"type": "text", "text": "describe this image"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
]
result = compressor._content_to_text(content)
assert "describe this image" in result
assert "[image]" in result
def test_multimodal_mixed_types(self, compressor):
content = [
{"type": "text", "text": "first part"},
{"type": "audio", "audio": {"data": "..."}},
{"type": "text", "text": "second part"},
]
result = compressor._content_to_text(content)
assert "first part" in result
assert "[audio]" in result
assert "second part" in result
def test_dict_content_json_serialized(self, compressor):
content = {"key": "value"}
result = compressor._content_to_text(content)
assert "key" in result
assert "value" in result
def test_multimodal_in_generate_summary(self):
"""Multimodal user messages should not crash _generate_summary."""
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: image was discussed"
mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True)
messages = [
{"role": "user", "content": [
{"type": "text", "text": "What is in this image?"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
]},
{"role": "assistant", "content": "I see a cat."},
{"role": "user", "content": "thanks"},
]
summary = c._generate_summary(messages)
assert isinstance(summary, str)
# The prompt sent to the model should contain the text, not raw list
prompt = mock_client.chat.completions.create.call_args.kwargs["messages"][0]["content"]
assert "What is in this image?" in prompt
assert "[image]" in prompt
class TestGenerateSummaryNoneContent: class TestGenerateSummaryNoneContent:
"""Regression: content=None (from tool-call-only assistant messages) must not crash.""" """Regression: content=None (from tool-call-only assistant messages) must not crash."""