From 099dfca6dbb2ab26380340a274ab42728b9fa756 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 25 Mar 2026 12:05:37 -0700
Subject: [PATCH] fix: GLM reasoning-only and max-length handling (#3010)

- Add 'prompt exceeds max length' to context overflow detection for
  Z.AI/GLM 400 errors
- Extract inline reasoning blocks from assistant content as fallback
  when no structured reasoning fields are present
- Guard inline extraction so structured API reasoning takes priority
- Update test for reasoning-only response salvage behavior

Cherry-picked from PR #2993 by kshitijk4poor. Added priority guard
to fix test_structured_reasoning_takes_priority failure.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
---
 run_agent.py            | 19 +++++++++++++++
 tests/test_run_agent.py | 54 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 68 insertions(+), 5 deletions(-)
diff --git a/run_agent.py b/run_agent.py
index 60be561292..fb03ee5c4f 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -1326,6 +1326,24 @@ class AIAgent:
                     summary = detail.get('summary') or detail.get('content') or detail.get('text')
                     if summary and summary not in reasoning_parts:
                         reasoning_parts.append(summary)
+
+        # Some providers embed reasoning directly inside assistant content
+        # instead of returning structured reasoning fields.  Only fall back
+        # to inline extraction when no structured reasoning was found.
+        content = getattr(assistant_message, "content", None)
+        if not reasoning_parts and isinstance(content, str) and content:
+            inline_patterns = (
+                r"<think>(.*?)</think>",
+                r"<thinking>(.*?)</thinking>",
+                r"<reasoning>(.*?)</reasoning>",
+                r"<REASONING_SCRATCHPAD>(.*?)</REASONING_SCRATCHPAD>",
+            )
+            for pattern in inline_patterns:
+                flags = re.DOTALL | re.IGNORECASE
+                for block in re.findall(pattern, content, flags=flags):
+                    cleaned = block.strip()
+                    if cleaned and cleaned not in reasoning_parts:
+                        reasoning_parts.append(cleaned)
         
         # Combine all reasoning parts
         if reasoning_parts:
@@ -6392,6 +6410,7 @@ class AIAgent:
                         'exceeds the limit', 'context window',
                         'request entity too large',  # OpenRouter/Nous 413 safety net
                         'prompt is too long',  # Anthropic: "prompt is too long: N tokens > M maximum"
+                        'prompt exceeds max length',  # Z.AI / GLM 400: "Prompt exceeds max length" (code 1261)
                     ])
 
                     # Fallback heuristic: Anthropic sometimes returns a generic
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index 81e16b7027..3dd9a134b3 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -267,6 +267,21 @@ class TestExtractReasoning:
         result = agent._extract_reasoning(msg)
         assert result == "same text"
 
+    @pytest.mark.parametrize(
+        ("content", "expected"),
+        [
+            ("<think>thinking hard</think>", "thinking hard"),
+            ("<thinking>step by step</thinking>", "step by step"),
+            (
+                "<REASONING_SCRATCHPAD>scratch analysis</REASONING_SCRATCHPAD>",
+                "scratch analysis",
+            ),
+        ],
+    )
+    def test_inline_reasoning_blocks_fallback(self, agent, content, expected):
+        msg = _mock_assistant_msg(content=content)
+        assert agent._extract_reasoning(msg) == expected
+
 
 class TestCleanSessionContent:
     def test_none_passthrough(self):
@@ -1202,8 +1217,8 @@ class TestRunConversation:
         assert result["completed"] is True
         assert result["api_calls"] == 2
 
-    def test_empty_content_retry_and_fallback(self, agent):
-        """Empty content (only think block) retries, then falls back to partial."""
+    def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent):
+        """Reasoning-only payloads should recover the inline reasoning text."""
         self._setup_agent(agent)
         empty_resp = _mock_response(
             content="<think>internal reasoning</think>",
@@ -1221,9 +1236,8 @@ class TestRunConversation:
             patch.object(agent, "_cleanup_task_resources"),
         ):
             result = agent.run_conversation("answer me")
-        # After 3 retries with no real content, should return partial
-        assert result["completed"] is False
-        assert result.get("partial") is True
+        assert result["completed"] is True
+        assert result["final_response"] == "internal reasoning"
 
     def test_nous_401_refreshes_after_remint_and_retries(self, agent):
         self._setup_agent(agent)
@@ -1296,6 +1310,36 @@ class TestRunConversation:
         assert result["final_response"] == "All done"
         assert result["completed"] is True
 
+    def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent):
+        """GLM/Z.AI uses 'Prompt exceeds max length' for context overflow."""
+        self._setup_agent(agent)
+        err_400 = Exception(
+            "Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}"
+        )
+        err_400.status_code = 400
+        ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop")
+        agent.client.chat.completions.create.side_effect = [err_400, ok_resp]
+        prefill = [
+            {"role": "user", "content": "previous question"},
+            {"role": "assistant", "content": "previous answer"},
+        ]
+
+        with (
+            patch.object(agent, "_compress_context") as mock_compress,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            mock_compress.return_value = (
+                [{"role": "user", "content": "hello"}],
+                "compressed system prompt",
+            )
+            result = agent.run_conversation("hello", conversation_history=prefill)
+
+        mock_compress.assert_called_once()
+        assert result["final_response"] == "Recovered after compression"
+        assert result["completed"] is True
+
     @pytest.mark.parametrize(
         ("first_content", "second_content", "expected_final"),
         [