fix(streaming): prevent <think> in prose from suppressing response output

When the model mentions <think> as literal text in its response (e.g. "(/think not producing <think> tags)"), the streaming display treated it as a reasoning block opener and suppressed everything after it. The response box would close with truncated content and no error — the API response was complete but the display ate it.

Root cause: _stream_delta() matched <think> anywhere in the text stream regardless of position. Real reasoning blocks always start at the beginning of a line; mentions in prose appear mid-sentence.

Fix: track line position across streaming deltas with a _stream_last_was_newline flag. Only enter reasoning suppression when the tag appears at a block boundary (start of stream, after a newline, or after only whitespace on the current line). Add a _flush_stream() safety net that recovers buffered content if no closing tag is found by end-of-stream.

Also fixes three related issues discovered during investigation:
- anthropic_adapter: _get_anthropic_max_output() now normalizes dots to hyphens so 'claude-opus-4.6' matches the 'claude-opus-4-6' table key (was returning 32K instead of 128K)
- run_agent: send explicit max_tokens for Claude models on Nous Portal, same as OpenRouter — both proxy to Anthropic's API which requires it. Without it the backend defaults to a low limit that truncates responses.
- run_agent: reset truncated_tool_call_retries after successful tool execution so a single truncation doesn't poison the entire conversation.
2026-05-05 10:17:17 +08:00 · 2026-04-09 22:12:06 -07:00
4 changed files with 217 additions and 19 deletions
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -74,8 +74,11 @@ def _get_anthropic_max_output(model: str) -> int:
    model IDs (claude-sonnet-4-5-20250929) and variant suffixes (:1m, :fast)
    resolve correctly.  Longest-prefix match wins to avoid e.g. "claude-3-5"
    matching before "claude-3-5-sonnet".
    Normalizes dots to hyphens so that model names like
    ``anthropic/claude-opus-4.6`` match the ``claude-opus-4-6`` table key.
    """
-    m = model.lower()
+    m = model.lower().replace(".", "-")
    best_key = ""
    best_val = _ANTHROPIC_DEFAULT_OUTPUT_LIMIT
    for key, val in _ANTHROPIC_OUTPUT_LIMITS.items():
--- a/cli.py
+++ b/cli.py
@@ -2308,17 +2308,59 @@ class HermesCLI:
        # Append to a pre-filter buffer first
        self._stream_prefilt = getattr(self, "_stream_prefilt", "") + text
-        # Check if we're entering a reasoning block
+        # Check if we're entering a reasoning block.
        # Only match tags that appear at a "block boundary": start of the
        # stream, after a newline (with optional whitespace), or when nothing
        # but whitespace has been emitted on the current line.
        # This prevents false positives when models *mention* tags in prose
        # like "(/think not producing <think> tags)".
        #
        # _stream_last_was_newline tracks whether the last character emitted
        # (or the start of the stream) is a line boundary.  It's True at
        # stream start and set True whenever emitted text ends with '\n'.
        if not hasattr(self, "_stream_last_was_newline"):
            self._stream_last_was_newline = True  # start of stream = boundary
        if not getattr(self, "_in_reasoning_block", False):
            for tag in _OPEN_TAGS:
-                idx = self._stream_prefilt.find(tag)
+                search_start = 0
-                if idx != -1:
+                while True:
-                    # Emit everything before the tag
+                    idx = self._stream_prefilt.find(tag, search_start)
-                    before = self._stream_prefilt[:idx]
+                    if idx == -1:
-                    if before:
+                        break
-                        self._emit_stream_text(before)
+                    # Check if this is a block boundary position
-                    self._in_reasoning_block = True
+                    preceding = self._stream_prefilt[:idx]
-                    self._stream_prefilt = self._stream_prefilt[idx + len(tag):]
+                    if idx == 0:
                        # At buffer start — only a boundary if we're at
                        # a line start (stream start or last emit ended
                        # with newline)
                        is_block_boundary = getattr(self, "_stream_last_was_newline", True)
                    else:
                        # Find last newline in the buffer before the tag
                        last_nl = preceding.rfind("\n")
                        if last_nl == -1:
                            # No newline in buffer — boundary only if
                            # last emit was a newline AND only whitespace
                            # has accumulated before the tag
                            is_block_boundary = (
                                getattr(self, "_stream_last_was_newline", True)
                                and preceding.strip() == ""
                            )
                        else:
                            # Text between last newline and tag must be
                            # whitespace-only
                            is_block_boundary = preceding[last_nl + 1:].strip() == ""
                    if is_block_boundary:
                        # Emit everything before the tag
                        if preceding:
                            self._emit_stream_text(preceding)
                            self._stream_last_was_newline = preceding.endswith("\n")
                        self._in_reasoning_block = True
                        self._stream_prefilt = self._stream_prefilt[idx + len(tag):]
                        break
                    # Not a block boundary — keep searching after this occurrence
                    search_start = idx + 1
                if getattr(self, "_in_reasoning_block", False):
                    break
            # Could also be a partial open tag at the end — hold it back
@@ -2332,6 +2374,7 @@ class HermesCLI:
                            break
                if safe:
                    self._emit_stream_text(safe)
                    self._stream_last_was_newline = safe.endswith("\n")
                    self._stream_prefilt = self._stream_prefilt[len(safe):]
                return
@@ -2421,6 +2464,14 @@ class HermesCLI:
    def _flush_stream(self) -> None:
        """Emit any remaining partial line from the stream buffer and close the box."""
        # If we're still inside a "reasoning block" at end-of-stream, it was
        # a false positive — the model mentioned a tag like <think> in prose
        # but never closed it.  Recover the buffered content as regular text.
        if getattr(self, "_in_reasoning_block", False) and getattr(self, "_stream_prefilt", ""):
            self._in_reasoning_block = False
            self._emit_stream_text(self._stream_prefilt)
            self._stream_prefilt = ""
        # Close reasoning box if still open (in case no content tokens arrived)
        self._close_reasoning_box()
@@ -2443,6 +2494,7 @@ class HermesCLI:
        self._stream_text_ansi = ""
        self._stream_prefilt = ""
        self._in_reasoning_block = False
        self._stream_last_was_newline = True
        self._reasoning_box_opened = False
        self._reasoning_buf = ""
        self._reasoning_preview_buf = ""
--- a/run_agent.py
+++ b/run_agent.py
@@ -5610,20 +5610,20 @@ class AIAgent:
        if self.max_tokens is not None:
            if not self._is_qwen_portal():
                api_kwargs.update(self._max_tokens_param(self.max_tokens))
-        elif self._is_openrouter_url() and "claude" in (self.model or "").lower():
+        elif (self._is_openrouter_url() or "nousresearch" in self._base_url_lower) and "claude" in (self.model or "").lower():
-            # OpenRouter translates requests to Anthropic's Messages API,
+            # OpenRouter and Nous Portal translate requests to Anthropic's
-            # which requires max_tokens as a mandatory field.  When we omit
+            # Messages API, which requires max_tokens as a mandatory field.
-            # it, OpenRouter picks a default that can be too low — the model
+            # When we omit it, the proxy picks a default that can be too
-            # spends its output budget on thinking and has almost nothing
+            # low — the model spends its output budget on thinking and has
-            # left for the actual response (especially large tool calls like
+            # almost nothing left for the actual response (especially large
-            # write_file).  Sending the model's real output limit ensures
+            # tool calls like write_file).  Sending the model's real output
-            # full capacity.  Other providers handle the default fine.
+            # limit ensures full capacity.
            try:
                from agent.anthropic_adapter import _get_anthropic_max_output
                _model_output_limit = _get_anthropic_max_output(self.model)
                api_kwargs["max_tokens"] = _model_output_limit
            except Exception:
-                pass  # fail open — let OpenRouter pick its default
+                pass  # fail open — let the proxy pick its default
        extra_body = {}
@@ -9116,6 +9116,11 @@ class AIAgent:
                    self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)
                    # Reset per-turn retry counters after successful tool
                    # execution so a single truncation doesn't poison the
                    # entire conversation.
                    truncated_tool_call_retries = 0
                    # Signal that a paragraph break is needed before the next
                    # streamed text.  We don't emit it immediately because
                    # multiple consecutive tool iterations would stack up
--- a/tests/cli/test_stream_delta_think_tag.py
+++ b/tests/cli/test_stream_delta_think_tag.py
@@ -0,0 +1,138 @@
 """Tests for _stream_delta's handling of <think> tags in prose vs real reasoning blocks."""
 import sys
 import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
 import pytest
 def _make_cli_stub():
    """Build a bare HermesCLI (``__init__`` bypassed) primed with stream state.

    ``_emit_stream_text`` and ``_stream_reasoning_delta`` are swapped for
    recorders that append each chunk to ``_emitted`` and
    ``_reasoning_emitted`` respectively, so tests can inspect what was shown.
    """
    from cli import HermesCLI
    stub = HermesCLI.__new__(HermesCLI)
    # Every text buffer starts out empty.
    for attr in (
        "_stream_buf",
        "_stream_prefilt",
        "_reasoning_buf",
        "_reasoning_preview_buf",
        "_deferred_content",
        "_stream_text_ansi",
    ):
        setattr(stub, attr, "")
    # Every boolean flag starts out cleared.
    for attr in (
        "show_reasoning",
        "_stream_started",
        "_stream_box_opened",
        "_in_reasoning_block",
        "_reasoning_stream_started",
        "_reasoning_box_opened",
        "_stream_needs_break",
    ):
        setattr(stub, attr, False)
    # Capture regular output and reasoning output in separate lists.
    stub._emitted = []
    stub._reasoning_emitted = []
    stub._emit_stream_text = lambda text: stub._emitted.append(text)
    stub._stream_reasoning_delta = lambda text: stub._reasoning_emitted.append(text)
    return stub
 class TestThinkTagInProse:
    """<think> mentioned in prose should NOT trigger reasoning suppression.

    Every test checks both halves of the contract: the reasoning flag stays
    clear AND the literal tag reaches the emitted output.  Checking the flag
    alone would miss a regression where the text is silently dropped.
    """
    def test_think_tag_mid_sentence(self):
        """'(/think not producing <think> tags)' should pass through."""
        cli = _make_cli_stub()
        # The tag arrives as its own delta, after prose has already been
        # streamed on the same line.
        tokens = [
            "  1. Fix reasoning mode in eval ",
            "(/think not producing ",
            "<think>",
            " tags — ~2% gap)",
            "\n  2. Launch production",
        ]
        for t in tokens:
            cli._stream_delta(t)
        assert not cli._in_reasoning_block, "<think> in prose should not enter reasoning block"
        full = "".join(cli._emitted)
        assert "<think>" in full, "The literal <think> tag should be in the emitted text"
        assert "Launch production" in full
    def test_think_tag_after_text_on_same_line(self):
        """'some text <think>' should NOT trigger reasoning."""
        cli = _make_cli_stub()
        cli._stream_delta("Here is the <think> tag explanation")
        assert not cli._in_reasoning_block
        full = "".join(cli._emitted)
        assert "<think>" in full
    def test_think_tag_in_backticks(self):
        """'`<think>`' should NOT trigger reasoning and must still be displayed."""
        cli = _make_cli_stub()
        cli._stream_delta("Use the `<think>` tag for reasoning")
        assert not cli._in_reasoning_block
        # Also verify the quoted tag actually reached the output; suppressed
        # text is the user-visible symptom this suite guards against.
        full = "".join(cli._emitted)
        assert "`<think>`" in full, "The backtick-quoted <think> tag should be in the emitted text"
 class TestRealReasoningBlock:
    """A genuine <think> opener at a block boundary must still start suppression."""
    def test_think_at_start_of_stream(self):
        """A <think>...</think> pair opening the stream hides the reasoning text."""
        stub = _make_cli_stub()
        stub._stream_delta("<think>")
        assert stub._in_reasoning_block
        for chunk in ("I need to analyze this", "</think>"):
            stub._stream_delta(chunk)
        assert not stub._in_reasoning_block
        stub._stream_delta("Here is my answer")
        shown = "".join(stub._emitted)
        assert "Here is my answer" in shown
        assert "I need to analyze" not in shown  # reasoning was suppressed
    def test_think_after_newline(self):
        """A tag directly after a newline opens a reasoning block."""
        stub = _make_cli_stub()
        stub._stream_delta("Some preamble\n<think>")
        assert stub._in_reasoning_block
        shown = "".join(stub._emitted)
        assert "Some preamble" in shown
    def test_think_after_newline_with_whitespace(self):
        """A tag after a newline plus indentation opens a reasoning block."""
        stub = _make_cli_stub()
        stub._stream_delta("Some preamble\n  <think>")
        assert stub._in_reasoning_block
    def test_think_with_only_whitespace_before(self):
        """A tag preceded only by whitespace at stream start opens a block."""
        stub = _make_cli_stub()
        stub._stream_delta("   <think>")
        assert stub._in_reasoning_block
 class TestFlushRecovery:
    """_flush_stream must hand back text swallowed by a false-positive block."""
    def test_flush_recovers_buffered_content(self):
        """Text buffered inside an unclosed reasoning block is emitted on flush."""
        import shutil
        from unittest.mock import patch
        stub = _make_cli_stub()
        # Hand-build the false-positive state: flagged as "in reasoning"
        # with ordinary response text still sitting in the pre-filter buffer.
        stub._in_reasoning_block = True
        stub._stream_prefilt = " tags — ~2% gap)\n  2. Launch production"
        stub._stream_box_opened = True
        # Neutralise the reasoning-box close for this test.
        stub._close_reasoning_box = lambda: None
        # Pin the terminal size and silence cli._cprint while flushing.
        fake_size = os.terminal_size((80, 24))
        with patch.object(shutil, "get_terminal_size", return_value=fake_size), patch("cli._cprint"):
            stub._flush_stream()
        assert not stub._in_reasoning_block
        shown = "".join(stub._emitted)
        assert "Launch production" in shown