fix: guard aux LLM calls against None content + reasoning fallback + retry (salvage #3389) (#3449)

Salvage of #3389 by @binhnt92, with reasoning fallback and retry logic added on top. All 7 auxiliary LLM call sites now use extract_content_or_reasoning(), which mirrors the main agent loop's behavior: it extracts the content, strips think blocks, and falls back to structured reasoning fields. Each call site then retries when the extracted text is still empty. Closes #3389.
2026-04-28 06:51:16 +08:00 · 2026-03-27 15:28:19 -07:00
parent ab09f6b568
commit 658692799d
7 changed files with 414 additions and 14 deletions
--- a/tools/mixture_of_agents_tool.py
+++ b/tools/mixture_of_agents_tool.py
@@ -52,6 +52,7 @@ import asyncio
 import datetime
 from typing import Dict, Any, List, Optional
 from tools.openrouter_client import get_async_client as _get_openrouter_client, check_api_key as check_openrouter_api_key
+from agent.auxiliary_client import extract_content_or_reasoning
 from tools.debug_helpers import DebugSession

 logger = logging.getLogger(__name__)
@@ -143,7 +144,13 @@ async def _run_reference_model_safe(
            
            response = await _get_openrouter_client().chat.completions.create(**api_params)
            
-            content = response.choices[0].message.content.strip()
+            content = extract_content_or_reasoning(response)
+            if not content:
+                # Reasoning-only response — let the retry loop handle it
+                logger.warning("%s returned empty content (attempt %s/%s), retrying", model, attempt + 1, max_retries)
+                if attempt < max_retries - 1:
+                    await asyncio.sleep(min(2 ** (attempt + 1), 60))
+                    continue
            logger.info("%s responded (%s characters)", model, len(content))
            return model, content, True
            
@@ -211,7 +218,14 @@ async def _run_aggregator_model(

    response = await _get_openrouter_client().chat.completions.create(**api_params)

-    content = response.choices[0].message.content.strip()
+    content = extract_content_or_reasoning(response)
+
+    # Retry once on empty content (reasoning-only response)
+    if not content:
+        logger.warning("Aggregator returned empty content, retrying once")
+        response = await _get_openrouter_client().chat.completions.create(**api_params)
+        content = extract_content_or_reasoning(response)
+
    logger.info("Aggregation complete (%s characters)", len(content))
    return content

--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@@ -21,7 +21,7 @@ import json
 import logging
 from typing import Dict, Any, List, Optional, Union

-from agent.auxiliary_client import async_call_llm
+from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
 MAX_SESSION_CHARS = 100_000
 MAX_SUMMARY_TOKENS = 10000

@@ -161,7 +161,15 @@ async def _summarize_session(
                temperature=0.1,
                max_tokens=MAX_SUMMARY_TOKENS,
            )
-            return response.choices[0].message.content.strip()
+            content = extract_content_or_reasoning(response)
+            if content:
+                return content
+            # Reasoning-only / empty — let the retry loop handle it
+            logging.warning("Session search LLM returned empty content (attempt %d/%d)", attempt + 1, max_retries)
+            if attempt < max_retries - 1:
+                await asyncio.sleep(1 * (attempt + 1))
+                continue
+            return content
        except RuntimeError:
            logging.warning("No auxiliary model available for session summarization")
            return None
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@@ -948,9 +948,9 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,

    # Call the LLM via the centralized provider router
    try:
-        from agent.auxiliary_client import call_llm
+        from agent.auxiliary_client import call_llm, extract_content_or_reasoning

-        response = call_llm(
+        call_kwargs = dict(
            provider="openrouter",
            model=model,
            messages=[{
@@ -960,7 +960,13 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,
            temperature=0,
            max_tokens=1000,
        )
-        llm_text = response.choices[0].message.content.strip()
+        response = call_llm(**call_kwargs)
+        llm_text = extract_content_or_reasoning(response)
+
+        # Retry once on empty content (reasoning-only response)
+        if not llm_text:
+            response = call_llm(**call_kwargs)
+            llm_text = extract_content_or_reasoning(response)
    except Exception:
        # LLM audit is best-effort — don't block install if the call fails
        return static_result
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -37,7 +37,7 @@ from pathlib import Path
 from typing import Any, Awaitable, Dict, Optional
 from urllib.parse import urlparse
 import httpx
-from agent.auxiliary_client import async_call_llm
+from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
 from tools.debug_helpers import DebugSession

 logger = logging.getLogger(__name__)
@@ -346,8 +346,15 @@ async def vision_analyze_tool(
            call_kwargs["model"] = model
        response = await async_call_llm(**call_kwargs)
        
-        # Extract the analysis
-        analysis = response.choices[0].message.content.strip()
+        # Extract the analysis — fall back to reasoning if content is empty
+        analysis = extract_content_or_reasoning(response)
+
+        # Retry once on empty content (reasoning-only response)
+        if not analysis:
+            logger.warning("Vision LLM returned empty content, retrying once")
+            response = await async_call_llm(**call_kwargs)
+            analysis = extract_content_or_reasoning(response)
+
        analysis_length = len(analysis)
        
        logger.info("Image analysis completed (%s characters)", analysis_length)
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -44,7 +44,7 @@ import asyncio
 from typing import List, Dict, Any, Optional
 import httpx
 from firecrawl import Firecrawl
-from agent.auxiliary_client import async_call_llm
+from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
 from tools.debug_helpers import DebugSession
 from tools.url_safety import is_safe_url
 from tools.website_policy import check_website_access
@@ -416,7 +416,16 @@ Create a markdown summary that captures all key information in a well-organized,
            if model:
                call_kwargs["model"] = model
            response = await async_call_llm(**call_kwargs)
-            return response.choices[0].message.content.strip()
+            content = extract_content_or_reasoning(response)
+            if content:
+                return content
+            # Reasoning-only / empty response — let the retry loop handle it
+            logger.warning("LLM returned empty content (attempt %d/%d), retrying", attempt + 1, max_retries)
+            if attempt < max_retries - 1:
+                await asyncio.sleep(retry_delay)
+                retry_delay = min(retry_delay * 2, 60)
+                continue
+            return content  # Return whatever we got after exhausting retries
        except RuntimeError:
            logger.warning("No auxiliary model available for web content processing")
            return None
@@ -535,8 +544,14 @@ Create a single, unified markdown summary."""
        if model:
            call_kwargs["model"] = model
        response = await async_call_llm(**call_kwargs)
-        final_summary = response.choices[0].message.content.strip()
-        
+        final_summary = extract_content_or_reasoning(response)
+
+        # Retry once on empty content (reasoning-only response)
+        if not final_summary:
+            logger.warning("Synthesis LLM returned empty content, retrying once")
+            response = await async_call_llm(**call_kwargs)
+            final_summary = extract_content_or_reasoning(response)
+
        # Enforce hard cap
        if len(final_summary) > max_output_size:
            final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"