mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(azure-foundry): auto-route gpt-5.x / codex / o-series to Responses API (#16361)
Azure Foundry deploys GPT-5.x, codex-*, and o1/o3/o4 reasoning models as Responses-API-only. Calling /chat/completions against these deployments returns 400 'The requested operation is unsupported.', which broke any user who ran 'hermes model' on Azure, picked a gpt-5/codex deployment, and kept the default api_mode: chat_completions. Verified in a user debug bundle on 2026-04-26: gpt-5.3-codex failed on synopsisse.openai.azure.com with that exact payload while gpt-4o-pure on the same endpoint worked. Adds azure_foundry_model_api_mode(model_name) that returns codex_responses when the model name starts with gpt-5, codex, o1, o3, or o4 — otherwise None so chat_completions / anthropic_messages stay untouched for gpt-4o, Llama, Claude-via-Anthropic, etc. Resolver (both the direct Azure Foundry path and the pool-entry path) consults it and upgrades api_mode unless the user explicitly picked anthropic_messages. target_model (from /model mid-session switch) takes precedence over the persisted default so switching from gpt-4o to gpt-5.3-codex routes correctly before the next request. Docs: correct the azure-foundry guide which previously claimed Azure keeps gpt-5.x on chat completions — that was only true for early Azure OpenAI, not Azure Foundry codex/o-series deployments. Tests: 14 unit tests for azure_foundry_model_api_mode + 6 integration tests in TestAzureFoundryResolution covering Bob's exact scenario, target_model override, anthropic_messages guard, and o3-mini.
This commit is contained in:
@@ -2226,6 +2226,52 @@ def copilot_model_api_mode(
|
||||
return "chat_completions"
|
||||
|
||||
|
||||
# Azure Foundry model families that require the Responses API. Azure
# rejects /chat/completions against these deployments with
# ``400 "The requested operation is unsupported."`` — the same payload Bob
# Dobolina hit in April 2026 on ``gpt-5.3-codex`` while ``gpt-4o-pure`` on
# the same endpoint worked fine. Keep the patterns broad enough to cover
# vendor-renamed deployments (e.g. ``gpt-5.3-codex``, ``gpt-5-codex``,
# ``gpt-5.4``, ``o1-preview``) but tight enough to leave GPT-4 / 3.5 / Llama /
# Mistral / Grok deployments on chat completions.
_AZURE_FOUNDRY_RESPONSES_PREFIXES = (
    "codex",  # codex-*, codex-mini
    "gpt-5",  # gpt-5, gpt-5.x, gpt-5-codex, gpt-5.x-codex
    "o1",     # o1, o1-preview, o1-mini
    "o3",     # o3, o3-mini
    "o4",     # o4, o4-mini
)


def azure_foundry_model_api_mode(model_name: Optional[str]) -> Optional[str]:
    """Infer Azure Foundry api_mode from a deployment/model name.

    Returns ``"codex_responses"`` when the model name matches a family that
    only accepts the Responses API on Azure Foundry (GPT-5.x, codex, o1/o3/o4
    reasoning models). Returns ``None`` otherwise — the caller should fall
    back to the configured/default api_mode (typically ``chat_completions``)
    so GPT-4o, GPT-4 Turbo, Llama, Mistral, etc. keep working.

    Intentionally does NOT return ``anthropic_messages``; Anthropic-style
    Azure endpoints are disambiguated by URL (``/anthropic`` suffix) in
    ``runtime_provider._detect_api_mode_for_url`` and by the user setting
    ``model.api_mode: anthropic_messages`` explicitly.
    """
    raw = str(model_name or "").strip().lower()
    if not raw:
        return None
    # Strip any vendor/ prefix a user may have copied from OpenRouter / Copilot.
    if "/" in raw:
        raw = raw.rsplit("/", 1)[-1]
    # gpt-5-mini speaks chat completions on Copilot but Azure Foundry deploys
    # the full gpt-5 family uniformly on Responses API — don't carve an
    # exception here.
    # str.startswith accepts a tuple of prefixes — one call replaces the
    # manual loop over _AZURE_FOUNDRY_RESPONSES_PREFIXES.
    if raw.startswith(_AZURE_FOUNDRY_RESPONSES_PREFIXES):
        return "codex_responses"
    return None
|
||||
|
||||
|
||||
def normalize_opencode_model_id(provider_id: Optional[str], model_id: Optional[str]) -> str:
|
||||
"""Normalize OpenCode config IDs to the bare model slug used in API requests."""
|
||||
provider = normalize_provider(provider_id)
|
||||
|
||||
@@ -231,6 +231,19 @@ def _resolve_runtime_from_pool_entry(
|
||||
configured_mode = _parse_api_mode(model_cfg.get("api_mode"))
|
||||
if configured_mode:
|
||||
api_mode = configured_mode
|
||||
# Model-family inference for GPT-5.x / codex / o1-o4: Azure rejects
|
||||
# /chat/completions on these with 400 "operation unsupported" — see
|
||||
# azure_foundry_model_api_mode() for rationale. Skip when the user
|
||||
# explicitly picked anthropic_messages (Anthropic-style endpoint).
|
||||
if effective_model and api_mode != "anthropic_messages":
|
||||
try:
|
||||
from hermes_cli.models import azure_foundry_model_api_mode
|
||||
|
||||
inferred = azure_foundry_model_api_mode(effective_model)
|
||||
except Exception:
|
||||
inferred = None
|
||||
if inferred:
|
||||
api_mode = inferred
|
||||
# For Anthropic-style endpoints, strip /v1 suffix
|
||||
if api_mode == "anthropic_messages":
|
||||
base_url = re.sub(r"/v1/?$", "", base_url)
|
||||
@@ -608,6 +621,7 @@ def _resolve_azure_foundry_runtime(
|
||||
model_cfg: Dict[str, Any],
|
||||
explicit_api_key: Optional[str] = None,
|
||||
explicit_base_url: Optional[str] = None,
|
||||
target_model: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Resolve an Azure Foundry runtime entry.
|
||||
|
||||
@@ -628,6 +642,22 @@ def _resolve_azure_foundry_runtime(
|
||||
cfg_base_url = str(model_cfg.get("base_url") or "").strip().rstrip("/")
|
||||
cfg_api_mode = _parse_api_mode(model_cfg.get("api_mode")) or "chat_completions"
|
||||
|
||||
# Model-family inference: Azure Foundry deploys GPT-5.x / codex / o1-o4
|
||||
# reasoning models as Responses-API-only. Calling /chat/completions
|
||||
# against them returns 400 "The requested operation is unsupported."
|
||||
# Upgrade api_mode when the model name matches, unless the user has
|
||||
# explicitly chosen anthropic_messages (Anthropic-style endpoint).
|
||||
effective_model = str(target_model or model_cfg.get("default") or "").strip()
|
||||
if effective_model and cfg_api_mode != "anthropic_messages":
|
||||
try:
|
||||
from hermes_cli.models import azure_foundry_model_api_mode
|
||||
|
||||
inferred = azure_foundry_model_api_mode(effective_model)
|
||||
except Exception:
|
||||
inferred = None
|
||||
if inferred:
|
||||
cfg_api_mode = inferred
|
||||
|
||||
env_base_url = os.getenv("AZURE_FOUNDRY_BASE_URL", "").strip().rstrip("/")
|
||||
base_url = explicit_base_url_clean or cfg_base_url or env_base_url
|
||||
if not base_url:
|
||||
@@ -864,6 +894,7 @@ def resolve_runtime_provider(
|
||||
model_cfg=_get_model_config(),
|
||||
explicit_api_key=explicit_api_key,
|
||||
explicit_base_url=explicit_base_url,
|
||||
target_model=target_model,
|
||||
)
|
||||
return azure_runtime
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
from hermes_cli.models import (
|
||||
azure_foundry_model_api_mode,
|
||||
copilot_model_api_mode,
|
||||
fetch_github_model_catalog,
|
||||
curated_models_for_provider,
|
||||
@@ -414,6 +415,69 @@ class TestCopilotNormalization:
|
||||
assert opencode_model_api_mode("opencode-go", "opencode-go/minimax-m2.5") == "anthropic_messages"
|
||||
|
||||
|
||||
class TestAzureFoundryModelApiMode:
    """Model-family inference for Azure Foundry Responses-API-only deployments.

    Azure Foundry serves GPT-5.x / codex / o-series models exclusively over
    the Responses API and answers /chat/completions with
    ``400 "The requested operation is unsupported."``. A user debug bundle
    from 2026-04-26 confirmed this in the wild: gpt-5.3-codex failed with
    that exact payload while gpt-4o-pure on the same endpoint worked.
    """

    def test_gpt5_family_uses_responses(self):
        # Azure deploys the whole gpt-5 family on the Responses API — the
        # gpt-5-mini carve-out is Copilot-specific and must not apply here.
        gpt5_names = ["gpt-5", "gpt-5.3", "gpt-5.4", "gpt-5-codex",
                      "gpt-5.3-codex", "gpt-5-mini"]
        for name in gpt5_names:
            assert azure_foundry_model_api_mode(name) == "codex_responses"

    def test_codex_family_uses_responses(self):
        for name in ["codex", "codex-mini"]:
            assert azure_foundry_model_api_mode(name) == "codex_responses"

    def test_o_series_reasoning_uses_responses(self):
        o_series = ["o1", "o1-preview", "o1-mini", "o3", "o3-mini", "o4-mini"]
        for name in o_series:
            assert azure_foundry_model_api_mode(name) == "codex_responses"

    def test_gpt4_family_returns_none(self):
        """GPT-4, GPT-4o, etc. speak chat completions on Azure."""
        gpt4_names = ["gpt-4", "gpt-4o", "gpt-4o-pure", "gpt-4o-mini",
                      "gpt-4-turbo", "gpt-4.1", "gpt-3.5-turbo"]
        for name in gpt4_names:
            assert azure_foundry_model_api_mode(name) is None

    def test_non_openai_deployments_return_none(self):
        """Llama, Mistral, Grok, etc. keep the default chat completions."""
        for name in ["llama-3.1-70b", "mistral-large", "grok-4", "phi-3-medium"]:
            assert azure_foundry_model_api_mode(name) is None

    def test_vendor_prefix_stripped(self):
        """Users who copy-paste ``openai/gpt-5.3-codex`` should still match."""
        assert azure_foundry_model_api_mode("openai/gpt-5.3-codex") == "codex_responses"
        assert azure_foundry_model_api_mode("openai/gpt-4o") is None

    def test_empty_and_none_return_none(self):
        for blank in (None, "", " "):
            assert azure_foundry_model_api_mode(blank) is None

    def test_case_insensitive(self):
        assert azure_foundry_model_api_mode("GPT-5.3-Codex") == "codex_responses"
        assert azure_foundry_model_api_mode("Codex-Mini") == "codex_responses"
|
||||
|
||||
|
||||
# -- validate — format checks -----------------------------------------------
|
||||
|
||||
class TestValidateFormatChecks:
|
||||
|
||||
@@ -1581,7 +1581,10 @@ class TestAzureFoundryResolution:
|
||||
"provider": "azure-foundry",
|
||||
"base_url": base_url,
|
||||
"api_mode": api_mode,
|
||||
"default": "gpt-5.4",
|
||||
# GPT-4 speaks chat completions on Azure, so this test's assertion
|
||||
# about chat_completions stays valid across the Apr 2026 fix that
|
||||
# upgrades GPT-5.x / codex deployments to codex_responses.
|
||||
"default": "gpt-4.1",
|
||||
}
|
||||
|
||||
def test_azure_foundry_openai_style_explicit(self, monkeypatch):
|
||||
@@ -1643,3 +1646,108 @@ class TestAzureFoundryResolution:
|
||||
|
||||
with pytest.raises(rp.AuthError, match="API key"):
|
||||
rp.resolve_runtime_provider(requested="azure-foundry")
|
||||
|
||||
# -- Model-family api_mode inference -------------------------------------
|
||||
# Azure rejects /chat/completions on GPT-5.x / codex / o-series with
|
||||
# ``400 "The requested operation is unsupported."`` — the resolver must
|
||||
# upgrade api_mode to ``codex_responses`` for those models even when the
|
||||
# config was persisted as ``chat_completions`` (the default the setup
|
||||
# wizard writes when the user didn't pick explicitly).
|
||||
|
||||
def _make_cfg_with_model(self, model: str, api_mode: str = "chat_completions"):
|
||||
return {
|
||||
"provider": "azure-foundry",
|
||||
"base_url": "https://synopsisse.openai.azure.com/openai/v1",
|
||||
"api_mode": api_mode,
|
||||
"default": model,
|
||||
}
|
||||
|
||||
def test_gpt5_codex_upgrades_chat_completions_to_responses(self, monkeypatch):
    """Reproduces Bob's April 2026 bug: gpt-5.3-codex on chat_completions."""
    # The persisted config still says chat_completions — the resolver must
    # upgrade it based on the model family alone.
    cfg = self._make_cfg_with_model("gpt-5.3-codex", "chat_completions")
    monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key")
    monkeypatch.setattr(rp, "load_pool", lambda provider: None)
    monkeypatch.setattr(rp, "_get_model_config", lambda: cfg)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry")

    resolved = rp.resolve_runtime_provider(requested="azure-foundry")

    assert resolved["api_mode"] == "codex_responses"
    assert resolved["base_url"] == "https://synopsisse.openai.azure.com/openai/v1"
|
||||
|
||||
def test_gpt4o_stays_on_chat_completions(self, monkeypatch):
    """gpt-4o-pure worked on Bob's endpoint — must not get upgraded."""
    cfg = self._make_cfg_with_model("gpt-4o-pure", "chat_completions")
    monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key")
    monkeypatch.setattr(rp, "load_pool", lambda provider: None)
    monkeypatch.setattr(rp, "_get_model_config", lambda: cfg)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry")

    resolved = rp.resolve_runtime_provider(requested="azure-foundry")

    assert resolved["api_mode"] == "chat_completions"
|
||||
|
||||
def test_anthropic_messages_not_downgraded(self, monkeypatch):
    """Anthropic-style endpoint: keep anthropic_messages even for gpt-5 names."""
    cfg = {
        "provider": "azure-foundry",
        "base_url": "https://my-resource.services.ai.azure.com/anthropic/v1",
        "api_mode": "anthropic_messages",
        "default": "gpt-5.3-codex",  # nonsensical on Anthropic but tests the guard
    }
    monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key")
    monkeypatch.setattr(rp, "load_pool", lambda provider: None)
    monkeypatch.setattr(rp, "_get_model_config", lambda: cfg)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry")

    resolved = rp.resolve_runtime_provider(requested="azure-foundry")

    assert resolved["api_mode"] == "anthropic_messages"
|
||||
|
||||
def test_target_model_overrides_stale_default(self, monkeypatch):
    """/model switch: target_model should drive api_mode, not the stale config default."""
    # Config still pinned to gpt-4o, but user just ran /model gpt-5.3-codex
    cfg = self._make_cfg_with_model("gpt-4o-pure", "chat_completions")
    monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key")
    monkeypatch.setattr(rp, "load_pool", lambda provider: None)
    monkeypatch.setattr(rp, "_get_model_config", lambda: cfg)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry")

    resolved = rp.resolve_runtime_provider(
        requested="azure-foundry",
        target_model="gpt-5.3-codex",
    )

    assert resolved["api_mode"] == "codex_responses"
|
||||
|
||||
def test_target_model_downgrade_path(self, monkeypatch):
    """/model switch gpt-5.3-codex → gpt-4o: api_mode follows new model."""
    # Config was upgraded to codex_responses for the previous model; user
    # now switches to gpt-4o which speaks chat completions.
    cfg = self._make_cfg_with_model("gpt-5.3-codex", "codex_responses")
    monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key")
    monkeypatch.setattr(rp, "load_pool", lambda provider: None)
    monkeypatch.setattr(rp, "_get_model_config", lambda: cfg)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry")

    resolved = rp.resolve_runtime_provider(
        requested="azure-foundry",
        target_model="gpt-4o-pure",
    )

    # gpt-4o yields None from the inference helper, meaning "don't
    # override" — the explicitly persisted codex_responses survives.
    # gpt-4o can speak both protocols, so the persisted mode is the
    # safer signal.
    assert resolved["api_mode"] == "codex_responses"
|
||||
|
||||
def test_o3_mini_upgrades(self, monkeypatch):
    """o-series reasoning deployments get the same Responses-API upgrade."""
    cfg = self._make_cfg_with_model("o3-mini", "chat_completions")
    monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key")
    monkeypatch.setattr(rp, "load_pool", lambda provider: None)
    monkeypatch.setattr(rp, "_get_model_config", lambda: cfg)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry")

    resolved = rp.resolve_runtime_provider(requested="azure-foundry")

    assert resolved["api_mode"] == "codex_responses"
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ model:
|
||||
|
||||
Important behaviour:
|
||||
|
||||
- **gpt-5.x stays on `/chat/completions`.** Unlike `api.openai.com`, Azure OpenAI does not support the Responses API — Hermes detects Azure endpoints and keeps gpt-5.x on `chat_completions` where Azure actually serves it.
|
||||
- **GPT-5.x, codex, and o-series auto-route to the Responses API.** Azure Foundry deploys GPT-5 / codex / o1 / o3 / o4 models as Responses-API-only — calling `/chat/completions` against them returns `400 "The requested operation is unsupported."`. Hermes detects these model families by name and upgrades `api_mode` to `codex_responses` transparently, even when `config.yaml` still reads `api_mode: chat_completions`. GPT-4, GPT-4o, Llama, Mistral, and other deployments stay on `/chat/completions`.
|
||||
- **`max_completion_tokens` is used automatically.** Azure OpenAI (like direct OpenAI) requires `max_completion_tokens` for gpt-4o, o-series, and gpt-5.x models. Hermes sends the right parameter based on the endpoint.
|
||||
- **Pre-v1 endpoints that require `api-version`.** If you have a legacy base URL like `https://<resource>.openai.azure.com/openai?api-version=2025-04-01-preview`, Hermes extracts the query string and forwards it via `default_query` on every request (the OpenAI SDK otherwise drops it when joining paths).
|
||||
|
||||
|
||||
Reference in New Issue
Block a user