Compare commits

...

1 Commits

Author SHA1 Message Date
Teknium
8e0a3c6083 fix: handle Mistral Magistral structured content blocks
Mistral Magistral reasoning models (and mistral-large-2512+) return
message content as a list of typed blocks instead of a plain string:

  [{"type": "thinking", "thinking": [{"type": "text", "text": "..."}]},
   {"type": "text", "text": "final answer"}]

This happens in both streaming deltas and non-streaming responses,
causing TypeError: sequence item 0: expected str instance, list found
when the code tries to join content parts.

Changes:
- Add _normalize_structured_content() helper that extracts text and
  thinking parts from Mistral structured blocks
- Fix streaming path: normalize delta.content before appending to
  content_parts, route thinking to reasoning_parts
- Fix non-streaming normalization: use the helper to also extract
  thinking blocks as reasoning_content (was silently dropping them)
- Fix _build_assistant_message: normalize list content before
  string operations
- Fix length truncation/continuation paths: normalize content
  before string concatenation
- Add 25 tests covering the helper, streaming, non-streaming,
  and _build_assistant_message paths

Fixes the reported CLI/Discord bot crash when using magistral-latest
or magistral-medium-latest via api.mistral.ai.
2026-04-03 01:49:14 -07:00
2 changed files with 509 additions and 39 deletions

View File

@@ -375,6 +375,58 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
return found
def _normalize_structured_content(content) -> tuple:
"""Normalize Mistral-style structured content blocks to (text, reasoning).
Mistral's Magistral models (and mistral-large-2512+) return ``content`` as
a list of typed blocks instead of a plain string::
[{"type": "thinking", "thinking": [{"type": "text", "text": "..."}]},
{"type": "text", "text": "final answer"},
{"type": "reference", ...}]
This also appears in streaming deltas (``delta.content`` is a list).
Returns:
(text_content, thinking_content) — text is always a string (possibly
empty), thinking is a string or None.
"""
if content is None:
return ("", None)
if isinstance(content, str):
return (content, None)
if not isinstance(content, list):
return (str(content), None)
text_parts: list = []
thinking_parts: list = []
for block in content:
if isinstance(block, str):
text_parts.append(block)
continue
if not isinstance(block, dict):
continue
block_type = block.get("type", "")
if block_type == "text":
text_parts.append(block.get("text", ""))
elif block_type == "thinking":
# "thinking" is itself a list of text blocks
thinking = block.get("thinking", [])
if isinstance(thinking, list):
for t in thinking:
if isinstance(t, dict) and t.get("type") == "text":
thinking_parts.append(t.get("text", ""))
elif isinstance(t, str):
thinking_parts.append(t)
elif isinstance(thinking, str):
thinking_parts.append(thinking)
# Other types (reference, image, document, audio, file) are skipped.
text = "\n".join(p for p in text_parts if p)
thinking = "\n\n".join(p for p in thinking_parts if p) or None
return (text, thinking)
def _strip_budget_warnings_from_history(messages: list) -> None:
"""Remove budget pressure warnings from tool-result messages in-place.
@@ -4107,30 +4159,43 @@ class AIAgent:
_fire_first_delta()
self._fire_reasoning_delta(reasoning_text)
# Accumulate text content — fire callback only when no tool calls
# Accumulate text content — fire callback only when no tool calls.
# Mistral Magistral models return delta.content as a list of
# structured blocks instead of a plain string; normalize first.
if delta and delta.content:
content_parts.append(delta.content)
if not tool_calls_acc:
_fire_first_delta()
self._fire_stream_delta(delta.content)
deltas_were_sent["yes"] = True
_raw_delta_content = delta.content
if isinstance(_raw_delta_content, list):
_delta_text, _delta_thinking = _normalize_structured_content(_raw_delta_content)
if _delta_thinking:
reasoning_parts.append(_delta_thinking)
_fire_first_delta()
self._fire_reasoning_delta(_delta_thinking)
else:
# Tool calls suppress regular content streaming (avoids
# displaying chatty "I'll use the tool..." text alongside
# tool calls). But reasoning tags embedded in suppressed
# content should still reach the display — otherwise the
# reasoning box only appears as a post-response fallback,
# rendering it confusingly after the already-streamed
# response. Route suppressed content through the stream
# delta callback so its tag extraction can fire the
# reasoning display. Non-reasoning text is harmlessly
# suppressed by the CLI's _stream_delta when the stream
# box is already closed (tool boundary flush).
if self.stream_delta_callback:
try:
self.stream_delta_callback(delta.content)
except Exception:
pass
_delta_text = _raw_delta_content
if _delta_text:
content_parts.append(_delta_text)
if not tool_calls_acc:
_fire_first_delta()
self._fire_stream_delta(_delta_text)
deltas_were_sent["yes"] = True
else:
# Tool calls suppress regular content streaming (avoids
# displaying chatty "I'll use the tool..." text alongside
# tool calls). But reasoning tags embedded in suppressed
# content should still reach the display — otherwise the
# reasoning box only appears as a post-response fallback,
# rendering it confusingly after the already-streamed
# response. Route suppressed content through the stream
# delta callback so its tag extraction can fire the
# reasoning display. Non-reasoning text is harmlessly
# suppressed by the CLI's _stream_delta when the stream
# box is already closed (tool boundary flush).
if self.stream_delta_callback:
try:
self.stream_delta_callback(_delta_text)
except Exception:
pass
# Accumulate tool call deltas — notify display on first name
if delta and delta.tool_calls:
@@ -5170,18 +5235,32 @@ class AIAgent:
Handles reasoning extraction, reasoning_details, and optional tool_calls
so both the tool-call path and the final-response path share one builder.
"""
# Normalize content early — Mistral Magistral models return content
# as a list of structured blocks instead of a string.
_raw_content = assistant_message.content
_structured_thinking = None
if isinstance(_raw_content, list):
_raw_content, _structured_thinking = _normalize_structured_content(_raw_content)
reasoning_text = self._extract_reasoning(assistant_message)
_from_structured = bool(reasoning_text)
# If the structured content included thinking blocks and
# _extract_reasoning didn't find anything, use the structured thinking.
if not reasoning_text and _structured_thinking:
reasoning_text = _structured_thinking
_from_structured = True
# Fallback: extract inline <think> blocks from content when no structured
# reasoning fields are present (some models/providers embed thinking
# directly in the content rather than returning separate API fields).
if not reasoning_text:
content = assistant_message.content or ""
think_blocks = re.findall(r'<think>(.*?)</think>', content, flags=re.DOTALL)
if think_blocks:
combined = "\n\n".join(b.strip() for b in think_blocks if b.strip())
reasoning_text = combined or None
content = _raw_content or ""
if isinstance(content, str):
think_blocks = re.findall(r'<think>(.*?)</think>', content, flags=re.DOTALL)
if think_blocks:
combined = "\n\n".join(b.strip() for b in think_blocks if b.strip())
reasoning_text = combined or None
if reasoning_text and self.verbose_logging:
logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")
@@ -5203,7 +5282,7 @@ class AIAgent:
msg = {
"role": "assistant",
"content": assistant_message.content or "",
"content": _raw_content or "",
"reasoning": reasoning_text,
"finish_reason": finish_reason,
}
@@ -7022,6 +7101,9 @@ class AIAgent:
if self.api_mode == "chat_completions":
_trunc_msg = response.choices[0].message if (hasattr(response, "choices") and response.choices) else None
_trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None
# Mistral Magistral: content may be a list of blocks
if isinstance(_trunc_content, list):
_trunc_content, _ = _normalize_structured_content(_trunc_content)
elif self.api_mode == "anthropic_messages":
# Anthropic response.content is a list of blocks
_text_parts = []
@@ -7076,7 +7158,10 @@ class AIAgent:
interim_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(interim_msg)
if assistant_message.content:
truncated_response_prefix += assistant_message.content
_cont = assistant_message.content
if isinstance(_cont, list):
_cont, _ = _normalize_structured_content(_cont)
truncated_response_prefix += _cont
if length_continue_retries < 3:
self._vprint(
@@ -7791,21 +7876,22 @@ class AIAgent:
# Normalize content to string — some OpenAI-compatible servers
# (llama-server, etc.) return content as a dict or list instead
# of a plain string, which crashes downstream .strip() calls.
# Mistral Magistral models return a list of structured blocks
# including {type: "thinking"} and {type: "text"}.
if assistant_message.content is not None and not isinstance(assistant_message.content, str):
raw = assistant_message.content
if isinstance(raw, dict):
assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw)
elif isinstance(raw, list):
# Multimodal content list — extract text parts
parts = []
for part in raw:
if isinstance(part, str):
parts.append(part)
elif isinstance(part, dict) and part.get("type") == "text":
parts.append(part.get("text", ""))
elif isinstance(part, dict) and "text" in part:
parts.append(str(part["text"]))
assistant_message.content = "\n".join(parts)
_norm_text, _norm_thinking = _normalize_structured_content(raw)
assistant_message.content = _norm_text
# Preserve extracted thinking as reasoning_content so
# _extract_reasoning / _build_assistant_message picks it up.
if _norm_thinking and not getattr(assistant_message, "reasoning_content", None):
try:
assistant_message.reasoning_content = _norm_thinking
except (AttributeError, TypeError):
pass # frozen/read-only SDK object
else:
assistant_message.content = str(raw)

View File

@@ -0,0 +1,384 @@
"""Tests for Mistral Magistral structured content handling.
Mistral's Magistral reasoning models return ``content`` as a list of typed
blocks instead of a plain string (both in streaming deltas and non-streaming
responses). This test suite verifies that:
1. _normalize_structured_content() correctly extracts text and thinking parts.
2. The streaming path handles list-valued delta.content without crashing.
3. The non-streaming path normalizes list content and extracts thinking.
4. _build_assistant_message handles list content correctly.
"""
import os
import uuid
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import pytest
# ── Ensure HERMES_HOME is set before importing run_agent ──────────────
if not os.environ.get("HERMES_HOME"):
import tempfile
_tmp = tempfile.mkdtemp(prefix="hermes_test_")
os.environ["HERMES_HOME"] = _tmp
from run_agent import AIAgent, _normalize_structured_content
# ── Fixtures ──────────────────────────────────────────────────────────
def _make_tool_defs(*names):
"""Build minimal tool definitions matching get_tool_definitions output."""
return [
{"type": "function", "function": {"name": n, "description": n, "parameters": {}}}
for n in names
]
@pytest.fixture
def agent():
    """Minimal AIAgent for testing _build_assistant_message.

    Patches tool discovery and the OpenAI client constructor so no network
    access or real tool setup happens during AIAgent construction; callbacks
    are disabled so nothing attempts to stream to a display.
    """
    with (
        patch("run_agent.get_tool_definitions", return_value=_make_tool_defs("web_search")),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
    ):
        ag = AIAgent(
            api_key="test-key-1234567890",
            model="mistral/magistral-medium-latest",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
        )
    # NOTE(review): the diff rendering stripped indentation, so it is unclear
    # whether these assignments sit inside or after the `with` block — either
    # placement behaves the same here, since the patches only need to cover
    # AIAgent construction. Confirm against the original file.
    ag.client = MagicMock()
    ag.verbose_logging = False
    ag.reasoning_callback = None
    ag.stream_delta_callback = None
    return ag
# ── Sample data matching Mistral's API format ─────────────────────────

# Full non-streaming Magistral response: a thinking block (whose value is
# itself a list of text blocks) followed by the final text answer.
MAGISTRAL_CONTENT_BLOCKS = [
    {
        "type": "thinking",
        "thinking": [
            {"type": "text", "text": "Let me think about this step by step."},
            {"type": "text", "text": "The capital of France is Paris."},
        ],
    },
    {"type": "text", "text": "The capital of France is Paris."},
]

# Response with text only — no reasoning blocks present.
MAGISTRAL_TEXT_ONLY_BLOCKS = [
    {"type": "text", "text": "Hello, how can I help?"},
]

# Response including a "reference" block, which carries no renderable text
# and must be skipped by the normalizer.
MAGISTRAL_WITH_REFERENCE = [
    {"type": "thinking", "thinking": [{"type": "text", "text": "Checking references."}]},
    {"type": "text", "text": "Here is the answer."},
    {"type": "reference", "url": "https://example.com"},
]

# Streaming delta chunks: one carrying only thinking, one carrying only text.
STREAMING_THINKING_DELTA = [
    {"type": "thinking", "thinking": [{"type": "text", "text": "Okay"}]},
]
STREAMING_TEXT_DELTA = [
    {"type": "text", "text": "Hello"},
]
# ── Tests: _normalize_structured_content ──────────────────────────────
class TestNormalizeStructuredContent:
    """Behavioral tests for the _normalize_structured_content helper."""

    def test_string_passthrough(self):
        assert _normalize_structured_content("Hello world") == ("Hello world", None)

    def test_none_returns_empty_string(self):
        assert _normalize_structured_content(None) == ("", None)

    def test_non_list_non_string_coerced(self):
        assert _normalize_structured_content(42) == ("42", None)

    def test_magistral_full_response(self):
        text, thinking = _normalize_structured_content(MAGISTRAL_CONTENT_BLOCKS)
        assert text == "The capital of France is Paris."
        for fragment in ("step by step", "capital of France is Paris"):
            assert fragment in thinking

    def test_text_only_blocks(self):
        assert _normalize_structured_content(MAGISTRAL_TEXT_ONLY_BLOCKS) == (
            "Hello, how can I help?",
            None,
        )

    def test_with_reference_blocks(self):
        """Reference blocks should be skipped, not cause errors."""
        assert _normalize_structured_content(MAGISTRAL_WITH_REFERENCE) == (
            "Here is the answer.",
            "Checking references.",
        )

    def test_streaming_thinking_delta(self):
        assert _normalize_structured_content(STREAMING_THINKING_DELTA) == ("", "Okay")

    def test_streaming_text_delta(self):
        assert _normalize_structured_content(STREAMING_TEXT_DELTA) == ("Hello", None)

    def test_empty_list(self):
        assert _normalize_structured_content([]) == ("", None)

    def test_mixed_string_and_dict_blocks(self):
        """Some providers might mix raw strings with typed blocks."""
        mixed = ["raw text", {"type": "text", "text": "typed text"}]
        text, _ = _normalize_structured_content(mixed)
        assert "raw text" in text
        assert "typed text" in text

    def test_thinking_as_plain_string(self):
        """Handle edge case where thinking value is a string not a list."""
        blocks = [{"type": "thinking", "thinking": "I'm thinking..."}]
        assert _normalize_structured_content(blocks) == ("", "I'm thinking...")

    def test_multiple_text_blocks_joined(self):
        blocks = [
            {"type": "text", "text": "First paragraph."},
            {"type": "text", "text": "Second paragraph."},
        ]
        text, _ = _normalize_structured_content(blocks)
        assert "First paragraph." in text
        assert "Second paragraph." in text
        assert "\n" in text  # parts are joined with a newline

    def test_empty_thinking_block(self):
        """Thinking block with no text should result in thinking=None."""
        blocks = [
            {"type": "thinking", "thinking": []},
            {"type": "text", "text": "Answer"},
        ]
        assert _normalize_structured_content(blocks) == ("Answer", None)
# ── Tests: _build_assistant_message with structured content ────────────
class TestBuildAssistantMessageStructuredContent:
    """Tests that _build_assistant_message correctly handles Mistral list content."""

    @staticmethod
    def _msg(content, tool_calls=None, **extra):
        # Minimal stand-in for an SDK assistant-message object.
        return SimpleNamespace(content=content, tool_calls=tool_calls, **extra)

    def test_list_content_normalized_to_string(self, agent):
        built = agent._build_assistant_message(self._msg(MAGISTRAL_CONTENT_BLOCKS), "stop")
        assert isinstance(built["content"], str)
        assert "The capital of France is Paris." in built["content"]

    def test_list_content_thinking_extracted(self, agent):
        built = agent._build_assistant_message(self._msg(MAGISTRAL_CONTENT_BLOCKS), "stop")
        assert built["reasoning"] is not None
        assert "step by step" in built["reasoning"]

    def test_string_content_unchanged(self, agent):
        built = agent._build_assistant_message(self._msg("Normal string response"), "stop")
        assert built["content"] == "Normal string response"

    def test_list_content_with_tool_calls(self, agent):
        call = SimpleNamespace(
            id="call_123",
            type="function",
            function=SimpleNamespace(name="web_search", arguments='{"query": "test"}'),
        )
        built = agent._build_assistant_message(
            self._msg(MAGISTRAL_CONTENT_BLOCKS, tool_calls=[call]), "tool_calls"
        )
        assert isinstance(built["content"], str)
        assert "tool_calls" in built

    def test_text_only_blocks_no_reasoning(self, agent):
        built = agent._build_assistant_message(self._msg(MAGISTRAL_TEXT_ONLY_BLOCKS), "stop")
        assert built["content"] == "Hello, how can I help?"
        assert built["reasoning"] is None

    def test_structured_thinking_not_duplicated_with_reasoning_content(self, agent):
        """When reasoning_content is set AND content has thinking blocks,
        don't duplicate the reasoning."""
        built = agent._build_assistant_message(
            self._msg(
                MAGISTRAL_CONTENT_BLOCKS,
                reasoning_content="Already extracted reasoning",
            ),
            "stop",
        )
        # The already-set reasoning_content wins; structured thinking is not
        # appended on top of it.
        assert built["reasoning"] == "Already extracted reasoning"
# ── Tests: Non-streaming content normalization ─────────────────────────
class TestNonStreamingContentNormalization:
    """Tests for the non-streaming content normalization block in the agent loop."""

    def test_list_content_normalized(self, agent):
        """Simulate the normalization block that runs after getting the
        assistant_message from response.choices[0].message."""
        message = SimpleNamespace(content=MAGISTRAL_CONTENT_BLOCKS, tool_calls=None)
        # Mirror the normalization logic from run_agent.py
        payload = message.content
        if payload is not None and not isinstance(payload, str):
            if isinstance(payload, list):
                plain, extracted = _normalize_structured_content(payload)
                message.content = plain
                if extracted and not getattr(message, "reasoning_content", None):
                    message.reasoning_content = extracted
        assert isinstance(message.content, str)
        assert "The capital of France is Paris." in message.content
        assert hasattr(message, "reasoning_content")
        assert "step by step" in message.reasoning_content

    def test_dict_content_handled(self, agent):
        """Dict content (from llama-server etc.) should still work."""
        message = SimpleNamespace(content={"text": "Hello from dict"}, tool_calls=None)
        payload = message.content
        if payload is not None and not isinstance(payload, str):
            if isinstance(payload, dict):
                message.content = (
                    payload.get("text", "")
                    or payload.get("content", "")
                    or str(payload)
                )
        assert message.content == "Hello from dict"
# ── Tests: Streaming delta normalization ───────────────────────────────
class TestStreamingDeltaNormalization:
    """Tests for the streaming delta content normalization."""

    def test_list_delta_content_split(self):
        """When delta.content is a list, text goes to content_parts
        and thinking goes to reasoning_parts."""
        content_parts = []
        reasoning_parts = []
        # Simulate the streaming normalization block
        delta_content = MAGISTRAL_CONTENT_BLOCKS
        if isinstance(delta_content, list):
            text, thinking = _normalize_structured_content(delta_content)
            if thinking:
                reasoning_parts.append(thinking)
        else:
            text = delta_content
        if text:
            content_parts.append(text)
        # Verify text and thinking are separated
        assert len(content_parts) == 1
        assert "The capital of France is Paris." in content_parts[0]
        assert len(reasoning_parts) == 1
        assert "step by step" in reasoning_parts[0]
        # Verify join succeeds (this was the original crash: joining a list
        # of parts that contained a nested list raised TypeError)
        full_content = "".join(content_parts)
        assert isinstance(full_content, str)

    def test_string_delta_passthrough(self):
        """Normal string deltas should work unchanged."""
        content_parts = []
        delta_content = "Hello"
        if isinstance(delta_content, list):
            text, _ = _normalize_structured_content(delta_content)
        else:
            text = delta_content
        if text:
            content_parts.append(text)
        full_content = "".join(content_parts)
        assert full_content == "Hello"

    def test_thinking_only_delta(self):
        """Streaming delta with only thinking and no text."""
        content_parts = []
        reasoning_parts = []
        delta_content = STREAMING_THINKING_DELTA
        if isinstance(delta_content, list):
            text, thinking = _normalize_structured_content(delta_content)
            if thinking:
                reasoning_parts.append(thinking)
        else:
            text = delta_content
        if text:
            content_parts.append(text)
        # No text content, only reasoning
        assert len(content_parts) == 0
        assert len(reasoning_parts) == 1
        assert reasoning_parts[0] == "Okay"
        # Join should succeed (empty list)
        full_content = "".join(content_parts) or None
        assert full_content is None

    def test_multiple_streaming_chunks_joined(self):
        """Multiple streaming chunks with mixed list and string content."""
        content_parts = []
        reasoning_parts = []
        chunks = [
            STREAMING_THINKING_DELTA,  # list: thinking only
            STREAMING_TEXT_DELTA,      # list: text only
            "more text",               # string
        ]
        for delta_content in chunks:
            if isinstance(delta_content, list):
                text, thinking = _normalize_structured_content(delta_content)
                if thinking:
                    reasoning_parts.append(thinking)
            else:
                text = delta_content
            if text:
                content_parts.append(text)
        full_content = "".join(content_parts)
        full_reasoning = "".join(reasoning_parts) or None
        assert full_content == "Hellomore text"
        assert full_reasoning == "Okay"