fix(azure-foundry): auto-route gpt-5.x / codex / o-series to Responses API (#16361)

Azure Foundry deploys GPT-5.x, codex-*, and o1/o3/o4 reasoning models as
Responses-API-only.  Calling /chat/completions against these deployments
returns 400 'The requested operation is unsupported.', which broke any
user who ran 'hermes model' on Azure, picked a gpt-5/codex deployment,
and kept the default api_mode: chat_completions.  Verified in a user
debug bundle on 2026-04-26: gpt-5.3-codex failed on synopsisse.openai.azure.com
with that exact payload while gpt-4o-pure on the same endpoint worked.

Adds azure_foundry_model_api_mode(model_name) that returns
codex_responses when the model name starts with gpt-5, codex, o1, o3,
or o4 — otherwise None so chat_completions / anthropic_messages stay
untouched for gpt-4o, Llama, Claude-via-Anthropic, etc.

Resolver (both the direct Azure Foundry path and the pool-entry path)
consults it and upgrades api_mode unless the user explicitly picked
anthropic_messages.  target_model (from /model mid-session switch)
takes precedence over the persisted default so switching from gpt-4o
to gpt-5.3-codex routes correctly before the next request.

Docs: correct the azure-foundry guide, which previously claimed Azure
keeps gpt-5.x on chat completions — that was only true for early Azure
OpenAI deployments, not for Azure Foundry codex/o-series deployments.

Tests: 14 unit tests for azure_foundry_model_api_mode + 6 integration
tests in TestAzureFoundryResolution covering Bob's exact scenario,
target_model override, anthropic_messages guard, and o3-mini.
This commit is contained in:
Teknium
2026-04-26 21:33:31 -07:00
committed by GitHub
parent 235bfb192b
commit c5781d50c7
5 changed files with 251 additions and 2 deletions

View File

@@ -2226,6 +2226,52 @@ def copilot_model_api_mode(
return "chat_completions"
# Azure Foundry model families that require the Responses API. Azure
# rejects /chat/completions against these deployments with
# ``400 "The requested operation is unsupported."`` — the same payload Bob
# Dobolina hit in April 2026 on ``gpt-5.3-codex`` while ``gpt-4o-pure`` on
# the same endpoint worked fine. Keep the patterns broad enough to cover
# vendor-renamed deployments (e.g. ``gpt-5.3-codex``, ``gpt-5-codex``,
# ``gpt-5.4``, ``o1-preview``) but tight enough to leave GPT-4 / 3.5 / Llama /
# Mistral / Grok deployments on chat completions.
_AZURE_FOUNDRY_RESPONSES_PREFIXES = (
    "codex",  # codex-*, codex-mini
    "gpt-5",  # gpt-5, gpt-5.x, gpt-5-codex, gpt-5.x-codex
    "o1",     # o1, o1-preview, o1-mini
    "o3",     # o3, o3-mini
    "o4",     # o4, o4-mini
)


def azure_foundry_model_api_mode(model_name: Optional[str]) -> Optional[str]:
    """Infer Azure Foundry api_mode from a deployment/model name.

    Returns ``"codex_responses"`` when the model name matches a family that
    only accepts the Responses API on Azure Foundry (GPT-5.x, codex, o1/o3/o4
    reasoning models). Returns ``None`` otherwise — the caller should fall
    back to the configured/default api_mode (typically ``chat_completions``)
    so GPT-4o, GPT-4 Turbo, Llama, Mistral, etc. keep working.

    Intentionally does NOT return ``anthropic_messages``; Anthropic-style
    Azure endpoints are disambiguated by URL (``/anthropic`` suffix) in
    ``runtime_provider._detect_api_mode_for_url`` and by the user setting
    ``model.api_mode: anthropic_messages`` explicitly.
    """
    raw = str(model_name or "").strip().lower()
    if not raw:
        return None
    # Strip any vendor/ prefix a user may have copied from OpenRouter / Copilot.
    if "/" in raw:
        raw = raw.rsplit("/", 1)[-1]
    # gpt-5-mini speaks chat completions on Copilot but Azure Foundry deploys
    # the full gpt-5 family uniformly on Responses API — don't carve an
    # exception here.
    # ``str.startswith`` accepts a tuple of prefixes: one C-level check
    # instead of a Python-level loop over the prefix table.
    if raw.startswith(_AZURE_FOUNDRY_RESPONSES_PREFIXES):
        return "codex_responses"
    return None
def normalize_opencode_model_id(provider_id: Optional[str], model_id: Optional[str]) -> str:
"""Normalize OpenCode config IDs to the bare model slug used in API requests."""
provider = normalize_provider(provider_id)

View File

@@ -231,6 +231,19 @@ def _resolve_runtime_from_pool_entry(
configured_mode = _parse_api_mode(model_cfg.get("api_mode"))
if configured_mode:
api_mode = configured_mode
# Model-family inference for GPT-5.x / codex / o1-o4: Azure rejects
# /chat/completions on these with 400 "operation unsupported" — see
# azure_foundry_model_api_mode() for rationale. Skip when the user
# explicitly picked anthropic_messages (Anthropic-style endpoint).
if effective_model and api_mode != "anthropic_messages":
try:
from hermes_cli.models import azure_foundry_model_api_mode
inferred = azure_foundry_model_api_mode(effective_model)
except Exception:
inferred = None
if inferred:
api_mode = inferred
# For Anthropic-style endpoints, strip /v1 suffix
if api_mode == "anthropic_messages":
base_url = re.sub(r"/v1/?$", "", base_url)
@@ -608,6 +621,7 @@ def _resolve_azure_foundry_runtime(
model_cfg: Dict[str, Any],
explicit_api_key: Optional[str] = None,
explicit_base_url: Optional[str] = None,
target_model: Optional[str] = None,
) -> Dict[str, Any]:
"""Resolve an Azure Foundry runtime entry.
@@ -628,6 +642,22 @@ def _resolve_azure_foundry_runtime(
cfg_base_url = str(model_cfg.get("base_url") or "").strip().rstrip("/")
cfg_api_mode = _parse_api_mode(model_cfg.get("api_mode")) or "chat_completions"
# Model-family inference: Azure Foundry deploys GPT-5.x / codex / o1-o4
# reasoning models as Responses-API-only. Calling /chat/completions
# against them returns 400 "The requested operation is unsupported."
# Upgrade api_mode when the model name matches, unless the user has
# explicitly chosen anthropic_messages (Anthropic-style endpoint).
effective_model = str(target_model or model_cfg.get("default") or "").strip()
if effective_model and cfg_api_mode != "anthropic_messages":
try:
from hermes_cli.models import azure_foundry_model_api_mode
inferred = azure_foundry_model_api_mode(effective_model)
except Exception:
inferred = None
if inferred:
cfg_api_mode = inferred
env_base_url = os.getenv("AZURE_FOUNDRY_BASE_URL", "").strip().rstrip("/")
base_url = explicit_base_url_clean or cfg_base_url or env_base_url
if not base_url:
@@ -864,6 +894,7 @@ def resolve_runtime_provider(
model_cfg=_get_model_config(),
explicit_api_key=explicit_api_key,
explicit_base_url=explicit_base_url,
target_model=target_model,
)
return azure_runtime