fix: update async fallback test mock to 5-tuple for api_mode

fix: warn and clear stale OPENAI_BASE_URL on provider switch (#5161 )
fix(auxiliary): validate response shape in call_llm/async_call_llm (#7264 )
2026-06-21 17:41:08 +08:00 · 2026-04-11 01:43:01 -07:00 · 2026-04-11 01:38:37 -07:00 · 2026-04-11 01:37:48 -07:00 · 2026-04-11 01:37:03 -07:00 · 2026-04-11 01:36:56 -07:00
4 changed files with 666 additions and 52 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -59,6 +59,9 @@ from hermes_constants import OPENROUTER_BASE_URL

 logger = logging.getLogger(__name__)

+# Module-level flag: only warn once per process about stale OPENAI_BASE_URL.
+_stale_base_url_warned = False
+
 _PROVIDER_ALIASES = {
    "google": "gemini",
    "google-gemini": "gemini",
@@ -707,7 +710,9 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
            base_url = _to_openai_base_url(
                _pool_runtime_base_url(entry, pconfig.inference_base_url) or pconfig.inference_base_url
            )
-            model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id, "default")
+            model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id)
+            if model is None:
+                continue  # skip provider if we don't know a valid aux model
            logger.debug("Auxiliary text client: %s (%s) via pool", pconfig.name, model)
            extra = {}
            if "api.kimi.com" in base_url.lower():
@@ -726,7 +731,9 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
        base_url = _to_openai_base_url(
            str(creds.get("base_url", "")).strip().rstrip("/") or pconfig.inference_base_url
        )
-        model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id, "default")
+        model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id)
+        if model is None:
+            continue  # skip provider if we don't know a valid aux model
        logger.debug("Auxiliary text client: %s (%s)", pconfig.name, model)
        extra = {}
        if "api.kimi.com" in base_url.lower():
@@ -1075,11 +1082,12 @@ def _is_connection_error(exc: Exception) -> bool:
 def _try_payment_fallback(
    failed_provider: str,
    task: str = None,
+    reason: str = "payment error",
 ) -> Tuple[Optional[Any], Optional[str], str]:
-    """Try alternative providers after a payment/credit error.
+    """Try alternative providers after a payment/credit or connection error.

    Iterates the standard auto-detection chain, skipping the provider that
-    returned a payment error.
+    failed.

    Returns:
        (client, model, provider_label) or (None, None, "") if no fallback.
@@ -1105,15 +1113,15 @@ def _try_payment_fallback(
        client, model = try_fn()
        if client is not None:
            logger.info(
-                "Auxiliary %s: payment error on %s — falling back to %s (%s)",
-                task or "call", failed_provider, label, model or "default",
+                "Auxiliary %s: %s on %s — falling back to %s (%s)",
+                task or "call", reason, failed_provider, label, model or "default",
            )
            return client, model, label
        tried.append(label)

    logger.warning(
-        "Auxiliary %s: payment error on %s and no fallback available (tried: %s)",
-        task or "call", failed_provider, ", ".join(tried),
+        "Auxiliary %s: %s on %s and no fallback available (tried: %s)",
+        task or "call", reason, failed_provider, ", ".join(tried),
    )
    return None, None, ""

@@ -1128,9 +1136,28 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
         provider they already have credentials for — no OpenRouter key needed.
      2. OpenRouter → Nous → custom → Codex → API-key providers (original chain).
    """
-    global auxiliary_is_nous
+    global auxiliary_is_nous, _stale_base_url_warned
    auxiliary_is_nous = False  # Reset — _try_nous() will set True if it wins

+    # ── Warn once if OPENAI_BASE_URL is set but config.yaml uses a named
+    #    provider (not 'custom').  This catches the common "env poisoning"
+    #    scenario where a user switches providers via `hermes model` but the
+    #    old OPENAI_BASE_URL lingers in ~/.hermes/.env. ──
+    if not _stale_base_url_warned:
+        _env_base = os.getenv("OPENAI_BASE_URL", "").strip()
+        _cfg_provider = _read_main_provider()
+        if (_env_base and _cfg_provider
+                and _cfg_provider != "custom"
+                and not _cfg_provider.startswith("custom:")):
+            logger.warning(
+                "OPENAI_BASE_URL is set (%s) but model.provider is '%s'. "
+                "Auxiliary clients may route to the wrong endpoint. "
+                "Run: hermes model to reconfigure, or remove "
+                "OPENAI_BASE_URL from ~/.hermes/.env",
+                _env_base, _cfg_provider,
+            )
+            _stale_base_url_warned = True
+
    # ── Step 1: non-aggregator main provider → use main model directly ──
    main_provider = _read_main_provider()
    main_model = _read_main_model()
@@ -1217,6 +1244,7 @@ def resolve_provider_client(
    raw_codex: bool = False,
    explicit_base_url: str = None,
    explicit_api_key: str = None,
+    api_mode: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Central router: given a provider name and optional model, return a
    configured client with the correct auth, base URL, and API format.
@@ -1240,6 +1268,10 @@ def resolve_provider_client(
            the main agent loop).
        explicit_base_url: Optional direct OpenAI-compatible endpoint.
        explicit_api_key: Optional API key paired with explicit_base_url.
+        api_mode: API mode override.  One of "chat_completions",
+            "codex_responses", or None (auto-detect).  When set to
+            "codex_responses", the client is wrapped in
+            CodexAuxiliaryClient to route through the Responses API.

    Returns:
        (client, resolved_model) or (None, None) if auth is unavailable.
@@ -1247,6 +1279,40 @@ def resolve_provider_client(
    # Normalise aliases
    provider = _normalize_aux_provider(provider)

+    def _needs_codex_wrap(client_obj, base_url_str: str, model_str: str) -> bool:
+        """Decide if a plain OpenAI client should be wrapped for Responses API.
+
+        Returns True when api_mode is explicitly "codex_responses", or when
+        auto-detection (api.openai.com + codex-family model) suggests it.
+        Already-wrapped clients (CodexAuxiliaryClient) are skipped.
+        """
+        if isinstance(client_obj, CodexAuxiliaryClient):
+            return False
+        if raw_codex:
+            return False
+        if api_mode == "codex_responses":
+            return True
+        # Auto-detect: api.openai.com + codex model name pattern
+        if api_mode and api_mode != "codex_responses":
+            return False  # explicit non-codex mode
+        normalized_base = (base_url_str or "").strip().lower()
+        if "api.openai.com" in normalized_base and "openrouter" not in normalized_base:
+            model_lower = (model_str or "").lower()
+            if "codex" in model_lower:
+                return True
+        return False
+
+    def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""):
+        """Wrap a plain OpenAI client in CodexAuxiliaryClient if Responses API is needed."""
+        if _needs_codex_wrap(client_obj, base_url_str, final_model_str):
+            logger.debug(
+                "resolve_provider_client: wrapping client in CodexAuxiliaryClient "
+                "(api_mode=%s, model=%s, base_url=%s)",
+                api_mode or "auto-detected", final_model_str,
+                base_url_str[:60] if base_url_str else "")
+            return CodexAuxiliaryClient(client_obj, final_model_str)
+        return client_obj
+
    # ── Auto: try all providers in priority order ────────────────────
    if provider == "auto":
        client, resolved = _resolve_auto()
@@ -1336,6 +1402,7 @@ def resolve_provider_client(
                from hermes_cli.models import copilot_default_headers
                extra["default_headers"] = copilot_default_headers()
            client = OpenAI(api_key=custom_key, base_url=custom_base, **extra)
+            client = _wrap_if_needed(client, final_model, custom_base)
            return (_to_async_client(client, final_model) if async_mode
                    else (client, final_model))
        # Try custom first, then codex, then API-key providers
@@ -1344,6 +1411,8 @@ def resolve_provider_client(
            client, default = try_fn()
            if client is not None:
                final_model = _normalize_resolved_model(model or default, provider)
+                _cbase = str(getattr(client, "base_url", "") or "")
+                client = _wrap_if_needed(client, final_model, _cbase)
                return (_to_async_client(client, final_model) if async_mode
                        else (client, final_model))
        logger.warning("resolve_provider_client: custom/main requested "
@@ -1363,6 +1432,7 @@ def resolve_provider_client(
                    provider,
                )
                client = OpenAI(api_key=custom_key, base_url=custom_base)
+                client = _wrap_if_needed(client, final_model, custom_base)
                logger.debug(
                    "resolve_provider_client: named custom provider %r (%s)",
                    provider, final_model)
@@ -1442,6 +1512,11 @@ def resolve_provider_client(
            except ImportError:
                pass

+        # Honor api_mode for any API-key provider (e.g. direct OpenAI with
+        # codex-family models).  The copilot-specific wrapping above handles
+        # copilot; this covers the general case (#6800).
+        client = _wrap_if_needed(client, final_model, base_url)
+
        logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))
@@ -1474,12 +1549,13 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
    Callers may override the returned model with a per-task env var
    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
    """
-    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None)
    return resolve_provider_client(
        provider,
        model=model,
        explicit_base_url=base_url,
        explicit_api_key=api_key,
+        api_mode=api_mode,
    )


@@ -1490,13 +1566,14 @@ def get_async_text_auxiliary_client(task: str = ""):
    (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
    Returns (None, None) when no provider is available.
    """
-    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None)
    return resolve_provider_client(
        provider,
        model=model,
        async_mode=True,
        explicit_base_url=base_url,
        explicit_api_key=api_key,
+        api_mode=api_mode,
    )


@@ -1569,7 +1646,7 @@ def resolve_vision_provider_client(
    backends, so users can intentionally force experimental providers. Auto mode
    stays conservative and only tries vision backends known to work today.
    """
-    requested, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+    requested, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
        "vision", provider, model, base_url, api_key
    )
    requested = _normalize_vision_provider(requested)
@@ -1785,12 +1862,30 @@ def cleanup_stale_async_clients() -> None:
            del _client_cache[key]


+def _is_openrouter_client(client: Any) -> bool:
+    for obj in (client, getattr(client, "_client", None), getattr(client, "client", None)):
+        if obj and "openrouter" in str(getattr(obj, "base_url", "") or "").lower():
+            return True
+    return False
+
+
+def _compat_model(client: Any, model: Optional[str], cached_default: Optional[str]) -> Optional[str]:
+    """Drop OpenRouter-format model slugs (with '/') for non-OpenRouter clients.
+
+    Mirrors the guard in resolve_provider_client() which is skipped on cache hits.
+    """
+    if model and "/" in model and not _is_openrouter_client(client):
+        return cached_default
+    return model or cached_default
+
+
 def _get_cached_client(
    provider: str,
    model: str = None,
    async_mode: bool = False,
    base_url: str = None,
    api_key: str = None,
+    api_mode: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Get or create a cached client for the given provider.

@@ -1814,7 +1909,7 @@ def _get_cached_client(
            loop_id = id(current_loop)
        except RuntimeError:
            pass
-    cache_key = (provider, async_mode, base_url or "", api_key or "", loop_id)
+    cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id)
    with _client_cache_lock:
        if cache_key in _client_cache:
            cached_client, cached_default, cached_loop = _client_cache[cache_key]
@@ -1826,9 +1921,11 @@ def _get_cached_client(
                    _force_close_async_httpx(cached_client)
                    del _client_cache[cache_key]
                else:
-                    return cached_client, model or cached_default
+                    effective = _compat_model(cached_client, model, cached_default)
+                    return cached_client, effective
            else:
-                return cached_client, model or cached_default
+                effective = _compat_model(cached_client, model, cached_default)
+                return cached_client, effective
    # Build outside the lock
    client, default_model = resolve_provider_client(
        provider,
@@ -1836,6 +1933,7 @@ def _get_cached_client(
        async_mode,
        explicit_base_url=base_url,
        explicit_api_key=api_key,
+        api_mode=api_mode,
    )
    if client is not None:
        # For async clients, remember which loop they were created on so we
@@ -1855,7 +1953,7 @@ def _resolve_task_provider_model(
    model: str = None,
    base_url: str = None,
    api_key: str = None,
-) -> Tuple[str, Optional[str], Optional[str], Optional[str]]:
+) -> Tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
    """Determine provider + model for a call.

    Priority:
@@ -1864,15 +1962,17 @@ def _resolve_task_provider_model(
      3. Config file (auxiliary.{task}.* or compression.*)
      4. "auto" (full auto-detection chain)

-    Returns (provider, model, base_url, api_key) where model may be None
-    (use provider default). When base_url is set, provider is forced to
-    "custom" and the task uses that direct endpoint.
+    Returns (provider, model, base_url, api_key, api_mode) where model may
+    be None (use provider default). When base_url is set, provider is forced
+    to "custom" and the task uses that direct endpoint. api_mode is one of
+    "chat_completions", "codex_responses", or None (auto-detect).
    """
    config = {}
    cfg_provider = None
    cfg_model = None
    cfg_base_url = None
    cfg_api_key = None
+    cfg_api_mode = None

    if task:
        try:
@@ -1889,6 +1989,7 @@ def _resolve_task_provider_model(
        cfg_model = str(task_config.get("model", "")).strip() or None
        cfg_base_url = str(task_config.get("base_url", "")).strip() or None
        cfg_api_key = str(task_config.get("api_key", "")).strip() or None
+        cfg_api_mode = str(task_config.get("api_mode", "")).strip() or None

        # Backwards compat: compression section has its own keys.
        # The auxiliary.compression defaults to provider="auto", so treat
@@ -1902,30 +2003,32 @@ def _resolve_task_provider_model(
                cfg_base_url = cfg_base_url or _sbu.strip() or None

    env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
+    env_api_mode = _get_auxiliary_env_override(task, "API_MODE") if task else None
    resolved_model = model or env_model or cfg_model
+    resolved_api_mode = env_api_mode or cfg_api_mode

    if base_url:
-        return "custom", resolved_model, base_url, api_key
+        return "custom", resolved_model, base_url, api_key, resolved_api_mode
    if provider:
-        return provider, resolved_model, base_url, api_key
+        return provider, resolved_model, base_url, api_key, resolved_api_mode

    if task:
        env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
        env_api_key = _get_auxiliary_env_override(task, "API_KEY")
        if env_base_url:
-            return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key
+            return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key, resolved_api_mode

        env_provider = _get_auxiliary_provider(task)
        if env_provider != "auto":
-            return env_provider, resolved_model, None, None
+            return env_provider, resolved_model, None, None, resolved_api_mode

        if cfg_base_url:
-            return "custom", resolved_model, cfg_base_url, cfg_api_key
+            return "custom", resolved_model, cfg_base_url, cfg_api_key, resolved_api_mode
        if cfg_provider and cfg_provider != "auto":
-            return cfg_provider, resolved_model, None, None
-        return "auto", resolved_model, None, None
+            return cfg_provider, resolved_model, None, None, resolved_api_mode
+        return "auto", resolved_model, None, None, resolved_api_mode

-    return "auto", resolved_model, None, None
+    return "auto", resolved_model, None, None, resolved_api_mode


 _DEFAULT_AUX_TIMEOUT = 30.0
@@ -1997,6 +2100,37 @@ def _build_call_kwargs(
    return kwargs


+def _validate_llm_response(response: Any, task: str = None) -> Any:
+    """Validate that an LLM response has the expected .choices[0].message shape.
+
+    Fails fast with a clear error instead of letting malformed payloads
+    propagate to downstream consumers where they crash with misleading
+    AttributeError (e.g. "'str' object has no attribute 'choices'").
+
+    See #7264.
+    """
+    if response is None:
+        raise RuntimeError(
+            f"Auxiliary {task or 'call'}: LLM returned None response"
+        )
+    # Allow SimpleNamespace responses from adapters (CodexAuxiliaryClient,
+    # AnthropicAuxiliaryClient) — they have .choices[0].message.
+    try:
+        choices = response.choices
+        if not choices or not hasattr(choices[0], "message"):
+            raise AttributeError("missing choices[0].message")
+    except (AttributeError, TypeError, IndexError) as exc:
+        response_type = type(response).__name__
+        response_preview = str(response)[:120]
+        raise RuntimeError(
+            f"Auxiliary {task or 'call'}: LLM returned invalid response "
+            f"(type={response_type}): {response_preview!r}. "
+            f"Expected object with .choices[0].message — check provider "
+            f"adapter or custom endpoint compatibility."
+        ) from exc
+    return response
+
+
 def call_llm(
    task: str = None,
    *,
@@ -2035,7 +2169,7 @@ def call_llm(
    Raises:
        RuntimeError: If no provider is configured.
    """
-    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
        task, provider, model, base_url, api_key)

    if task == "vision":
@@ -2068,6 +2202,7 @@ def call_llm(
            resolved_model,
            base_url=resolved_base_url,
            api_key=resolved_api_key,
+            api_mode=resolved_api_mode,
        )
        if client is None:
            # When the user explicitly chose a non-OpenRouter provider but no
@@ -2111,18 +2246,20 @@ def call_llm(

    # Handle max_tokens vs max_completion_tokens retry, then payment fallback.
    try:
-        return client.chat.completions.create(**kwargs)
+        return _validate_llm_response(
+            client.chat.completions.create(**kwargs), task)
    except Exception as first_err:
        err_str = str(first_err)
        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            try:
-                return client.chat.completions.create(**kwargs)
+                return _validate_llm_response(
+                    client.chat.completions.create(**kwargs), task)
            except Exception as retry_err:
-                # If the max_tokens retry also hits a payment error,
-                # fall through to the payment fallback below.
-                if not _is_payment_error(retry_err):
+                # If the max_tokens retry also hits a payment or connection
+                # error, fall through to the fallback chain below.
+                if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)):
                    raise
                first_err = retry_err

@@ -2139,19 +2276,24 @@ def call_llm(
        # and providers the user never configured that got picked up by
        # the auto-detection chain.
        should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
-        if should_fallback:
+        # Only try alternative providers when the user didn't explicitly
+        # configure this task's provider.  Explicit provider = hard constraint;
+        # auto (the default) = best-effort fallback chain.  (#7559)
+        is_auto = resolved_provider in ("auto", "", None)
+        if should_fallback and is_auto:
            reason = "payment error" if _is_payment_error(first_err) else "connection error"
            logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
                        task or "call", reason, resolved_provider, first_err)
            fb_client, fb_model, fb_label = _try_payment_fallback(
-                resolved_provider, task)
+                resolved_provider, task, reason=reason)
            if fb_client is not None:
                fb_kwargs = _build_call_kwargs(
                    fb_label, fb_model, messages,
                    temperature=temperature, max_tokens=max_tokens,
                    tools=tools, timeout=effective_timeout,
                    extra_body=extra_body)
-                return fb_client.chat.completions.create(**fb_kwargs)
+                return _validate_llm_response(
+                    fb_client.chat.completions.create(**fb_kwargs), task)
        raise


@@ -2229,7 +2371,7 @@ async def async_call_llm(

    Same as call_llm() but async. See call_llm() for full documentation.
    """
-    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
        task, provider, model, base_url, api_key)

    if task == "vision":
@@ -2263,6 +2405,7 @@ async def async_call_llm(
            async_mode=True,
            base_url=resolved_base_url,
            api_key=resolved_api_key,
+            api_mode=resolved_api_mode,
        )
        if client is None:
            _explicit = (resolved_provider or "").strip().lower()
@@ -2273,11 +2416,9 @@ async def async_call_llm(
                    f"variable, or switch to a different provider with `hermes model`."
                )
            if not resolved_base_url:
-                logger.warning("Provider %s unavailable, falling back to openrouter",
-                               resolved_provider)
-                client, final_model = _get_cached_client(
-                    "openrouter", resolved_model or _OPENROUTER_MODEL,
-                    async_mode=True)
+                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
+                            task or "call", resolved_provider)
+                client, final_model = _get_cached_client("auto", async_mode=True)
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
@@ -2292,11 +2433,42 @@ async def async_call_llm(
        base_url=resolved_base_url)

    try:
-        return await client.chat.completions.create(**kwargs)
+        return _validate_llm_response(
+            await client.chat.completions.create(**kwargs), task)
    except Exception as first_err:
        err_str = str(first_err)
        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
-            return await client.chat.completions.create(**kwargs)
+            try:
+                return _validate_llm_response(
+                    await client.chat.completions.create(**kwargs), task)
+            except Exception as retry_err:
+                # If the max_tokens retry also hits a payment or connection
+                # error, fall through to the fallback chain below.
+                if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)):
+                    raise
+                first_err = retry_err
+
+        # ── Payment / connection fallback (mirrors sync call_llm) ─────
+        should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
+        is_auto = resolved_provider in ("auto", "", None)
+        if should_fallback and is_auto:
+            reason = "payment error" if _is_payment_error(first_err) else "connection error"
+            logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
+                        task or "call", reason, resolved_provider, first_err)
+            fb_client, fb_model, fb_label = _try_payment_fallback(
+                resolved_provider, task, reason=reason)
+            if fb_client is not None:
+                fb_kwargs = _build_call_kwargs(
+                    fb_label, fb_model, messages,
+                    temperature=temperature, max_tokens=max_tokens,
+                    tools=tools, timeout=effective_timeout,
+                    extra_body=extra_body)
+                # Convert sync fallback client to async
+                async_fb, async_fb_model = _to_async_client(fb_client, fb_model or "")
+                if async_fb_model and async_fb_model != fb_kwargs.get("model"):
+                    fb_kwargs["model"] = async_fb_model
+                return _validate_llm_response(
+                    await async_fb.chat.completions.create(**fb_kwargs), task)
        raise
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -1080,6 +1080,42 @@ def select_provider_and_model(args=None):
    elif selected_provider in ("gemini", "zai", "minimax", "minimax-cn", "kilocode", "opencode-zen", "opencode-go", "ai-gateway", "alibaba", "huggingface"):
        _model_flow_api_key_provider(config, selected_provider, current_model)

+    # ── Post-switch cleanup: clear stale OPENAI_BASE_URL ──────────────
+    # When the user switches to a named provider (anything except "custom"),
+    # a leftover OPENAI_BASE_URL in ~/.hermes/.env can poison auxiliary
+    # clients that use provider:auto. Clear it proactively.  (#5161)
+    if selected_provider not in ("custom", "cancel", "remove-custom") \
+            and not selected_provider.startswith("custom:"):
+        _clear_stale_openai_base_url()
+
+
+def _clear_stale_openai_base_url():
+    """Remove OPENAI_BASE_URL from ~/.hermes/.env if the active provider is not 'custom'.
+
+    After a provider switch, a leftover OPENAI_BASE_URL causes auxiliary
+    clients (compression, vision, delegation) with provider:auto to route
+    requests to the old custom endpoint instead of the newly selected
+    provider.  See issue #5161.
+    """
+    from hermes_cli.config import get_env_value, save_env_value, load_config
+
+    cfg = load_config()
+    model_cfg = cfg.get("model", {})
+    if isinstance(model_cfg, dict):
+        provider = (model_cfg.get("provider") or "").strip().lower()
+    else:
+        provider = ""
+
+    if provider == "custom" or not provider:
+        return  # custom provider legitimately uses OPENAI_BASE_URL
+
+    stale_url = get_env_value("OPENAI_BASE_URL")
+    if stale_url:
+        save_env_value("OPENAI_BASE_URL", "")
+        print(f"Cleared stale OPENAI_BASE_URL from .env (was: {stale_url[:40]}...)"
+              if len(stale_url) > 40
+              else f"Cleared stale OPENAI_BASE_URL from .env (was: {stale_url})")
+

 def _prompt_provider_choice(choices, *, default=0):
    """Show provider selection menu with curses arrow-key navigation.
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -1,9 +1,10 @@
 """Tests for agent.auxiliary_client resolution chain, provider overrides, and model overrides."""

 import json
+import logging
 import os
 from pathlib import Path
-from unittest.mock import patch, MagicMock
+from unittest.mock import patch, MagicMock, AsyncMock

 import pytest

@@ -14,6 +15,7 @@ from agent.auxiliary_client import (
    resolve_provider_client,
    auxiliary_max_tokens_param,
    call_llm,
+    async_call_llm,
    _read_codex_access_token,
    _get_auxiliary_provider,
    _get_provider_chain,
@@ -1122,8 +1124,8 @@ class TestCallLlmPaymentFallback:
        exc.status_code = 402
        return exc

-    def test_402_triggers_fallback(self, monkeypatch):
-        """When the primary provider returns 402, call_llm tries the next one."""
+    def test_402_triggers_fallback_when_auto(self, monkeypatch):
+        """When provider is auto and returns 402, call_llm tries the next one."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

        primary_client = MagicMock()
@@ -1136,7 +1138,7 @@ class TestCallLlmPaymentFallback:
        with patch("agent.auxiliary_client._get_cached_client",
                    return_value=(primary_client, "google/gemini-3-flash-preview")), \
             patch("agent.auxiliary_client._resolve_task_provider_model",
-                    return_value=("openrouter", "google/gemini-3-flash-preview", None, None)), \
+                    return_value=("auto", "google/gemini-3-flash-preview", None, None, None)), \
             patch("agent.auxiliary_client._try_payment_fallback",
                    return_value=(fallback_client, "gpt-5.2-codex", "openai-codex")) as mock_fb:
            result = call_llm(
@@ -1145,13 +1147,62 @@ class TestCallLlmPaymentFallback:
            )

        assert result is fallback_response
-        mock_fb.assert_called_once_with("openrouter", "compression")
+        mock_fb.assert_called_once_with("auto", "compression", reason="payment error")
        # Fallback call should use the fallback model
        fb_kwargs = fallback_client.chat.completions.create.call_args.kwargs
        assert fb_kwargs["model"] == "gpt-5.2-codex"

+    def test_402_no_fallback_when_explicit_provider(self, monkeypatch):
+        """When provider is explicitly configured (not auto), 402 should NOT fallback (#7559)."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+        primary_client = MagicMock()
+        primary_client.chat.completions.create.side_effect = self._make_402_error()
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(primary_client, "local-model")), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("custom", "local-model", None, None, None)), \
+             patch("agent.auxiliary_client._try_payment_fallback") as mock_fb:
+            with pytest.raises(Exception, match="insufficient credits"):
+                call_llm(
+                    task="compression",
+                    messages=[{"role": "user", "content": "hello"}],
+                )
+
+        # Fallback should NOT be attempted when provider is explicit
+        mock_fb.assert_not_called()
+
+    def test_connection_error_triggers_fallback_when_auto(self, monkeypatch):
+        """Connection errors also trigger fallback when provider is auto."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+        primary_client = MagicMock()
+        conn_err = Exception("Connection refused")
+        conn_err.status_code = None
+        primary_client.chat.completions.create.side_effect = conn_err
+
+        fallback_client = MagicMock()
+        fallback_response = MagicMock()
+        fallback_client.chat.completions.create.return_value = fallback_response
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(primary_client, "model")), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("auto", "model", None, None, None)), \
+             patch("agent.auxiliary_client._is_connection_error", return_value=True), \
+             patch("agent.auxiliary_client._try_payment_fallback",
+                    return_value=(fallback_client, "fb-model", "nous")) as mock_fb:
+            result = call_llm(
+                task="compression",
+                messages=[{"role": "user", "content": "hello"}],
+            )
+
+        assert result is fallback_response
+        mock_fb.assert_called_once_with("auto", "compression", reason="connection error")
+
    def test_non_payment_error_not_caught(self, monkeypatch):
-        """Non-payment errors (500, connection, etc.) should NOT trigger fallback."""
+        """Non-payment/non-connection errors (500) should NOT trigger fallback."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

        primary_client = MagicMock()
@@ -1162,7 +1213,7 @@ class TestCallLlmPaymentFallback:
        with patch("agent.auxiliary_client._get_cached_client",
                    return_value=(primary_client, "google/gemini-3-flash-preview")), \
             patch("agent.auxiliary_client._resolve_task_provider_model",
-                    return_value=("openrouter", "google/gemini-3-flash-preview", None, None)):
+                    return_value=("auto", "google/gemini-3-flash-preview", None, None, None)):
            with pytest.raises(Exception, match="Internal Server Error"):
                call_llm(
                    task="compression",
@@ -1179,7 +1230,7 @@ class TestCallLlmPaymentFallback:
        with patch("agent.auxiliary_client._get_cached_client",
                    return_value=(primary_client, "google/gemini-3-flash-preview")), \
             patch("agent.auxiliary_client._resolve_task_provider_model",
-                    return_value=("openrouter", "google/gemini-3-flash-preview", None, None)), \
+                    return_value=("auto", "google/gemini-3-flash-preview", None, None, None)), \
             patch("agent.auxiliary_client._try_payment_fallback",
                    return_value=(None, None, "")):
            with pytest.raises(Exception, match="insufficient credits"):
@@ -1229,3 +1280,283 @@ def test_resolve_api_key_provider_skips_unconfigured_anthropic(monkeypatch):

    assert "anthropic" not in called, \
        "_try_anthropic() should not be called when anthropic is not explicitly configured"
+
+
+# ---------------------------------------------------------------------------
+# model="default" elimination (#7512)
+# ---------------------------------------------------------------------------
+
+
+class TestModelDefaultElimination:
+    """_resolve_api_key_provider must skip providers without known aux models."""
+
+    def test_unknown_provider_skipped(self, monkeypatch):
+        """Providers not in _API_KEY_PROVIDER_AUX_MODELS are skipped, not sent model='default'."""
+        from agent.auxiliary_client import _API_KEY_PROVIDER_AUX_MODELS
+
+        # Verify our known providers have entries
+        assert "gemini" in _API_KEY_PROVIDER_AUX_MODELS
+        assert "kimi-coding" in _API_KEY_PROVIDER_AUX_MODELS
+
+        # A random provider_id not in the dict should return None
+        assert _API_KEY_PROVIDER_AUX_MODELS.get("totally-unknown-provider") is None
+
+    def test_known_provider_gets_real_model(self):
+        """Known providers get a real model name, not 'default'."""
+        from agent.auxiliary_client import _API_KEY_PROVIDER_AUX_MODELS
+
+        for provider_id, model in _API_KEY_PROVIDER_AUX_MODELS.items():
+            assert model != "default", f"{provider_id} should not map to 'default'"
+            assert isinstance(model, str) and model.strip(), \
+                f"{provider_id} should have a non-empty model string"
+
+
+# ---------------------------------------------------------------------------
+# _try_payment_fallback reason parameter (#7512 bug 3)
+# ---------------------------------------------------------------------------
+
+
+class TestTryPaymentFallbackReason:
+    """_try_payment_fallback uses the reason parameter in log messages."""
+
+    def test_reason_parameter_passed_through(self, monkeypatch):
+        """The reason= parameter is accepted without error."""
+        from agent.auxiliary_client import _try_payment_fallback
+
+        # Mock the provider chain to return nothing
+        monkeypatch.setattr(
+            "agent.auxiliary_client._get_provider_chain",
+            lambda: [],
+        )
+        monkeypatch.setattr(
+            "agent.auxiliary_client._read_main_provider",
+            lambda: "",
+        )
+
+        client, model, label = _try_payment_fallback(
+            "openrouter", task="compression", reason="connection error"
+        )
+        assert client is None
+        assert label == ""
+
+
+# ---------------------------------------------------------------------------
+# _is_connection_error coverage
+# ---------------------------------------------------------------------------
+
+
+class TestIsConnectionError:
+    """Tests for _is_connection_error detection."""
+
+    def test_connection_refused(self):
+        from agent.auxiliary_client import _is_connection_error
+        err = Exception("Connection refused")
+        assert _is_connection_error(err) is True
+
+    def test_timeout(self):
+        from agent.auxiliary_client import _is_connection_error
+        err = Exception("Request timed out.")
+        assert _is_connection_error(err) is True
+
+    def test_dns_failure(self):
+        from agent.auxiliary_client import _is_connection_error
+        err = Exception("Name or service not known")
+        assert _is_connection_error(err) is True
+
+    def test_normal_api_error_not_connection(self):
+        from agent.auxiliary_client import _is_connection_error
+        err = Exception("Bad Request: invalid model")
+        err.status_code = 400
+        assert _is_connection_error(err) is False
+
+    def test_500_not_connection(self):
+        from agent.auxiliary_client import _is_connection_error
+        err = Exception("Internal Server Error")
+        err.status_code = 500
+        assert _is_connection_error(err) is False
+
+
+# ---------------------------------------------------------------------------
+# async_call_llm payment / connection fallback (#7512 bug 2)
+# ---------------------------------------------------------------------------
+
+
+class TestAsyncCallLlmFallback:
+    """async_call_llm mirrors call_llm fallback behavior."""
+
+    def _make_402_error(self, msg="Payment Required: insufficient credits"):
+        exc = Exception(msg)
+        exc.status_code = 402
+        return exc
+
+    @pytest.mark.asyncio
+    async def test_402_triggers_async_fallback_when_auto(self, monkeypatch):
+        """When provider is auto and returns 402, async_call_llm tries fallback."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+        primary_client = MagicMock()
+        primary_client.chat.completions.create = AsyncMock(
+            side_effect=self._make_402_error())
+
+        # Fallback client (sync) returned by _try_payment_fallback
+        fb_sync_client = MagicMock()
+        fb_async_client = MagicMock()
+        fb_response = MagicMock()
+        fb_async_client.chat.completions.create = AsyncMock(return_value=fb_response)
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(primary_client, "google/gemini-3-flash-preview")), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("auto", "google/gemini-3-flash-preview", None, None, None)), \
+             patch("agent.auxiliary_client._try_payment_fallback",
+                    return_value=(fb_sync_client, "gpt-5.2-codex", "openai-codex")) as mock_fb, \
+             patch("agent.auxiliary_client._to_async_client",
+                    return_value=(fb_async_client, "gpt-5.2-codex")):
+            result = await async_call_llm(
+                task="compression",
+                messages=[{"role": "user", "content": "hello"}],
+            )
+
+        assert result is fb_response
+        mock_fb.assert_called_once_with("auto", "compression", reason="payment error")
+
+    @pytest.mark.asyncio
+    async def test_402_no_async_fallback_when_explicit(self, monkeypatch):
+        """When provider is explicit, 402 should NOT trigger async fallback."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+        primary_client = MagicMock()
+        primary_client.chat.completions.create = AsyncMock(
+            side_effect=self._make_402_error())
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(primary_client, "local-model")), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("custom", "local-model", None, None, None)), \
+             patch("agent.auxiliary_client._try_payment_fallback") as mock_fb:
+            with pytest.raises(Exception, match="insufficient credits"):
+                await async_call_llm(
+                    task="compression",
+                    messages=[{"role": "user", "content": "hello"}],
+                )
+
+        mock_fb.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_connection_error_triggers_async_fallback(self, monkeypatch):
+        """Connection errors trigger async fallback when provider is auto."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+        primary_client = MagicMock()
+        conn_err = Exception("Connection refused")
+        conn_err.status_code = None
+        primary_client.chat.completions.create = AsyncMock(side_effect=conn_err)
+
+        fb_sync_client = MagicMock()
+        fb_async_client = MagicMock()
+        fb_response = MagicMock()
+        fb_async_client.chat.completions.create = AsyncMock(return_value=fb_response)
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(primary_client, "model")), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("auto", "model", None, None, None)), \
+             patch("agent.auxiliary_client._is_connection_error", return_value=True), \
+             patch("agent.auxiliary_client._try_payment_fallback",
+                    return_value=(fb_sync_client, "fb-model", "nous")) as mock_fb, \
+             patch("agent.auxiliary_client._to_async_client",
+                    return_value=(fb_async_client, "fb-model")):
+            result = await async_call_llm(
+                task="compression",
+                messages=[{"role": "user", "content": "hello"}],
+            )
+
+        assert result is fb_response
+        mock_fb.assert_called_once_with("auto", "compression", reason="connection error")
+class TestStaleBaseUrlWarning:
+    """_resolve_auto() warns when OPENAI_BASE_URL conflicts with config provider (#5161)."""
+
+    def test_warns_when_openai_base_url_set_with_named_provider(self, monkeypatch, caplog):
+        """Warning fires when OPENAI_BASE_URL is set but provider is a named provider."""
+        import agent.auxiliary_client as mod
+        # Reset the module-level flag so the warning fires
+        monkeypatch.setattr(mod, "_stale_base_url_warned", False)
+        monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:11434/v1")
+        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-test")
+
+        with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
+             patch("agent.auxiliary_client._read_main_model", return_value="google/gemini-flash"), \
+             caplog.at_level(logging.WARNING, logger="agent.auxiliary_client"):
+            _resolve_auto()
+
+        assert any("OPENAI_BASE_URL is set" in rec.message for rec in caplog.records), \
+            "Expected a warning about stale OPENAI_BASE_URL"
+        assert mod._stale_base_url_warned is True
+
+    def test_no_warning_when_provider_is_custom(self, monkeypatch, caplog):
+        """No warning when the provider is 'custom' — OPENAI_BASE_URL is expected."""
+        import agent.auxiliary_client as mod
+        monkeypatch.setattr(mod, "_stale_base_url_warned", False)
+        monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:11434/v1")
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+
+        with patch("agent.auxiliary_client._read_main_provider", return_value="custom"), \
+             patch("agent.auxiliary_client._read_main_model", return_value="llama3"), \
+             patch("agent.auxiliary_client._resolve_custom_runtime",
+                   return_value=("http://localhost:11434/v1", "test-key", None)), \
+             patch("agent.auxiliary_client.OpenAI") as mock_openai, \
+             caplog.at_level(logging.WARNING, logger="agent.auxiliary_client"):
+            mock_openai.return_value = MagicMock()
+            _resolve_auto()
+
+        assert not any("OPENAI_BASE_URL is set" in rec.message for rec in caplog.records), \
+            "Should NOT warn when provider is 'custom'"
+
+    def test_no_warning_when_provider_is_named_custom(self, monkeypatch, caplog):
+        """No warning when the provider is 'custom:myname' — base_url comes from config."""
+        import agent.auxiliary_client as mod
+        monkeypatch.setattr(mod, "_stale_base_url_warned", False)
+        monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:11434/v1")
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+
+        with patch("agent.auxiliary_client._read_main_provider", return_value="custom:ollama-local"), \
+             patch("agent.auxiliary_client._read_main_model", return_value="llama3"), \
+             patch("agent.auxiliary_client.resolve_provider_client",
+                   return_value=(MagicMock(), "llama3")), \
+             caplog.at_level(logging.WARNING, logger="agent.auxiliary_client"):
+            _resolve_auto()
+
+        assert not any("OPENAI_BASE_URL is set" in rec.message for rec in caplog.records), \
+            "Should NOT warn when provider is 'custom:*'"
+
+    def test_no_warning_when_openai_base_url_not_set(self, monkeypatch, caplog):
+        """No warning when OPENAI_BASE_URL is absent."""
+        import agent.auxiliary_client as mod
+        monkeypatch.setattr(mod, "_stale_base_url_warned", False)
+        monkeypatch.delenv("OPENAI_BASE_URL", raising=False)
+        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-test")
+
+        with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
+             patch("agent.auxiliary_client._read_main_model", return_value="google/gemini-flash"), \
+             caplog.at_level(logging.WARNING, logger="agent.auxiliary_client"):
+            _resolve_auto()
+
+        assert not any("OPENAI_BASE_URL is set" in rec.message for rec in caplog.records), \
+            "Should NOT warn when OPENAI_BASE_URL is not set"
+
+    def test_warning_only_fires_once(self, monkeypatch, caplog):
+        """Warning is suppressed after the first invocation."""
+        import agent.auxiliary_client as mod
+        monkeypatch.setattr(mod, "_stale_base_url_warned", False)
+        monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:11434/v1")
+        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-test")
+
+        with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
+             patch("agent.auxiliary_client._read_main_model", return_value="google/gemini-flash"), \
+             caplog.at_level(logging.WARNING, logger="agent.auxiliary_client"):
+            _resolve_auto()
+            caplog.clear()
+            _resolve_auto()
+
+        assert not any("OPENAI_BASE_URL is set" in rec.message for rec in caplog.records), \
+            "Warning should not fire a second time"
--- a/tests/hermes_cli/test_clear_stale_base_url.py
+++ b/tests/hermes_cli/test_clear_stale_base_url.py
@@ -0,0 +1,75 @@
+"""Tests for _clear_stale_openai_base_url() cleanup after provider switch (#5161)."""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+from hermes_cli.config import load_config, save_config, save_env_value, get_env_value
+
+
+def _write_provider(provider: str, model: str = "test-model"):
+    """Helper: write a provider + model to config.yaml."""
+    cfg = load_config()
+    model_cfg = cfg.get("model", {})
+    if not isinstance(model_cfg, dict):
+        model_cfg = {}
+    model_cfg["provider"] = provider
+    model_cfg["default"] = model
+    cfg["model"] = model_cfg
+    save_config(cfg)
+
+
+class TestClearStaleOpenaiBaseUrl:
+    """_clear_stale_openai_base_url() removes OPENAI_BASE_URL when provider is not custom."""
+
+    def test_clears_when_provider_is_named(self, monkeypatch):
+        """OPENAI_BASE_URL is cleared when config provider is a named provider."""
+        from hermes_cli.main import _clear_stale_openai_base_url
+
+        _write_provider("openrouter")
+        save_env_value("OPENAI_BASE_URL", "http://localhost:11434/v1")
+
+        _clear_stale_openai_base_url()
+
+        result = get_env_value("OPENAI_BASE_URL")
+        assert not result, f"Expected OPENAI_BASE_URL to be cleared, got: {result!r}"
+
+    def test_preserves_when_provider_is_custom(self, monkeypatch):
+        """OPENAI_BASE_URL is NOT cleared when config provider is 'custom'."""
+        from hermes_cli.main import _clear_stale_openai_base_url
+
+        _write_provider("custom")
+        save_env_value("OPENAI_BASE_URL", "http://localhost:11434/v1")
+
+        _clear_stale_openai_base_url()
+
+        result = get_env_value("OPENAI_BASE_URL")
+        assert result == "http://localhost:11434/v1", \
+            f"Expected OPENAI_BASE_URL to be preserved, got: {result!r}"
+
+    def test_noop_when_no_openai_base_url(self, monkeypatch):
+        """No error when OPENAI_BASE_URL is not set."""
+        from hermes_cli.main import _clear_stale_openai_base_url
+
+        _write_provider("openrouter")
+        # Ensure it's not set
+        save_env_value("OPENAI_BASE_URL", "")
+        monkeypatch.delenv("OPENAI_BASE_URL", raising=False)
+
+        # Should not raise
+        _clear_stale_openai_base_url()
+
+    def test_noop_when_provider_empty(self, monkeypatch):
+        """No cleanup when provider is not set in config."""
+        from hermes_cli.main import _clear_stale_openai_base_url
+
+        cfg = load_config()
+        cfg.pop("model", None)
+        save_config(cfg)
+        save_env_value("OPENAI_BASE_URL", "http://localhost:11434/v1")
+
+        _clear_stale_openai_base_url()
+
+        result = get_env_value("OPENAI_BASE_URL")
+        assert result == "http://localhost:11434/v1", \
+            "Should not clear when provider is not configured"
Author	SHA1	Message	Date
Teknium	8f676ea90c	fix: update async fallback test mock to 5-tuple for api_mode	2026-04-11 01:43:01 -07:00
kshitijk4poor	74a07738f1	fix: warn and clear stale OPENAI_BASE_URL on provider switch (#5161 )	2026-04-11 01:38:37 -07:00
kshitijk4poor	ad8c8731b3	fix(auxiliary): validate response shape in call_llm/async_call_llm (#7264 ) async_call_llm (and call_llm) can return non-OpenAI objects from custom providers or adapter shims, crashing downstream consumers with misleading AttributeError ('str' has no attribute 'choices'). Add _validate_llm_response() that checks the response has the expected .choices[0].message shape before returning. Wraps all return paths in call_llm, async_call_llm, and fallback paths. Fails fast with a clear RuntimeError identifying the task, response type, and a preview of the malformed payload. Closes #7264	2026-04-11 01:37:48 -07:00
ran	484b1fc17b	fix: drop incompatible model slugs on auxiliary client cache hit `resolve_provider_client()` already drops OpenRouter-format model slugs (containing "/") when the resolved provider is not OpenRouter (line 1097). However, `_get_cached_client()` returns `model or cached_default` directly on cache hits, bypassing this check entirely. When the main provider is openai-codex, the auto-detection chain (Step 1 of `_resolve_auto`) caches a CodexAuxiliaryClient. Subsequent auxiliary calls for different tasks (e.g. compression with `summary_model: google/gemini-3-flash-preview`) hit the cache and pass the OpenRouter- format model slug straight to the Codex Responses API, which does not understand it and returns an empty `response.output`. This causes two user-visible failures: - "Invalid API response shape" (empty output after 3 retries) - "Context length exceeded, cannot compress further" (compression itself fails through the same path) Add `_compat_model()` helper that mirrors the "/" check from `resolve_provider_client()` and call it on the cache-hit return path.	2026-04-11 01:37:03 -07:00
kshitijk4poor	65d1fbd668	fix(auxiliary): harden fallback behavior for non-OpenRouter users Four fixes to auxiliary_client.py: 1. Respect explicit provider as hard constraint (#7559) When auxiliary.{task}.provider is explicitly set (not 'auto'), connection/payment errors no longer silently fallback to cloud providers. Local-only users (Ollama, vLLM) will no longer get unexpected OpenRouter billing from auxiliary tasks. 2. Eliminate model='default' sentinel (#7512) _resolve_api_key_provider() no longer sends literal 'default' as model name to APIs. Providers without a known aux model in _API_KEY_PROVIDER_AUX_MODELS are skipped instead of producing model_not_supported errors. 3. Add payment/connection fallback to async_call_llm (#7512) async_call_llm now mirrors sync call_llm's fallback logic for payment (402) and connection errors. Previously, async consumers (session_search, web_tools, vision) got hard failures with no recovery. Also fixes hardcoded 'openrouter' fallback to use the full auto-detection chain. 4. Use accurate error reason in fallback logs (#7512) _try_payment_fallback() now accepts a reason parameter and uses it in log messages. Connection timeouts are no longer misleadingly logged as 'payment error'. Closes #7559 Closes #7512	2026-04-11 01:36:56 -07:00
kshitijk4poor	2dc980c676	fix(auxiliary): honor api_mode in auxiliary client (#6800 ) The auxiliary client always calls client.chat.completions.create(), ignoring the api_mode config flag. This breaks codex-family models (e.g. gpt-5.3-codex) on direct OpenAI API keys, which need the /v1/responses endpoint. Changes: - Expand _resolve_task_provider_model to return api_mode (5-tuple) - Read api_mode from auxiliary.{task}.api_mode config and env vars (AUXILIARY_{TASK}_API_MODE) - Pass api_mode through _get_cached_client to resolve_provider_client - Add _needs_codex_wrap/_wrap_if_needed helpers that wrap plain OpenAI clients in CodexAuxiliaryClient when api_mode=codex_responses or when auto-detection finds api.openai.com + codex model pattern - Apply wrapping at all custom endpoint, named custom provider, and API-key provider return paths - Update test mocks for the new 5-tuple return format Users can now set: auxiliary: compression: model: gpt-5.3-codex base_url: https://api.openai.com/v1 api_mode: codex_responses Closes #6800	2026-04-11 01:35:32 -07:00