mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(auxiliary): honor api_mode in auxiliary client (#6800)
The auxiliary client always calls client.chat.completions.create(),
ignoring the api_mode config flag. This breaks codex-family models
(e.g. gpt-5.3-codex) on direct OpenAI API keys, which need the
/v1/responses endpoint.
Changes:
- Expand _resolve_task_provider_model to return api_mode (5-tuple)
- Read api_mode from auxiliary.{task}.api_mode config and env vars
(AUXILIARY_{TASK}_API_MODE)
- Pass api_mode through _get_cached_client to resolve_provider_client
- Add _needs_codex_wrap/_wrap_if_needed helpers that wrap plain OpenAI
clients in CodexAuxiliaryClient when api_mode=codex_responses or
when auto-detection finds api.openai.com + codex model pattern
- Apply wrapping at all custom endpoint, named custom provider, and
API-key provider return paths
- Update test mocks for the new 5-tuple return format
Users can now set:
auxiliary:
  compression:
    model: gpt-5.3-codex
    base_url: https://api.openai.com/v1
    api_mode: codex_responses
Closes #6800
This commit is contained in:
@@ -1217,6 +1217,7 @@ def resolve_provider_client(
|
||||
raw_codex: bool = False,
|
||||
explicit_base_url: str = None,
|
||||
explicit_api_key: str = None,
|
||||
api_mode: str = None,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Central router: given a provider name and optional model, return a
|
||||
configured client with the correct auth, base URL, and API format.
|
||||
@@ -1240,6 +1241,10 @@ def resolve_provider_client(
|
||||
the main agent loop).
|
||||
explicit_base_url: Optional direct OpenAI-compatible endpoint.
|
||||
explicit_api_key: Optional API key paired with explicit_base_url.
|
||||
api_mode: API mode override. One of "chat_completions",
|
||||
"codex_responses", or None (auto-detect). When set to
|
||||
"codex_responses", the client is wrapped in
|
||||
CodexAuxiliaryClient to route through the Responses API.
|
||||
|
||||
Returns:
|
||||
(client, resolved_model) or (None, None) if auth is unavailable.
|
||||
@@ -1247,6 +1252,40 @@ def resolve_provider_client(
|
||||
# Normalise aliases
|
||||
provider = _normalize_aux_provider(provider)
|
||||
|
||||
def _needs_codex_wrap(client_obj, base_url_str: str, model_str: str) -> bool:
|
||||
"""Decide if a plain OpenAI client should be wrapped for Responses API.
|
||||
|
||||
Returns True when api_mode is explicitly "codex_responses", or when
|
||||
auto-detection (api.openai.com + codex-family model) suggests it.
|
||||
Already-wrapped clients (CodexAuxiliaryClient) are skipped.
|
||||
"""
|
||||
if isinstance(client_obj, CodexAuxiliaryClient):
|
||||
return False
|
||||
if raw_codex:
|
||||
return False
|
||||
if api_mode == "codex_responses":
|
||||
return True
|
||||
# Auto-detect: api.openai.com + codex model name pattern
|
||||
if api_mode and api_mode != "codex_responses":
|
||||
return False # explicit non-codex mode
|
||||
normalized_base = (base_url_str or "").strip().lower()
|
||||
if "api.openai.com" in normalized_base and "openrouter" not in normalized_base:
|
||||
model_lower = (model_str or "").lower()
|
||||
if "codex" in model_lower:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""):
|
||||
"""Wrap a plain OpenAI client in CodexAuxiliaryClient if Responses API is needed."""
|
||||
if _needs_codex_wrap(client_obj, base_url_str, final_model_str):
|
||||
logger.debug(
|
||||
"resolve_provider_client: wrapping client in CodexAuxiliaryClient "
|
||||
"(api_mode=%s, model=%s, base_url=%s)",
|
||||
api_mode or "auto-detected", final_model_str,
|
||||
base_url_str[:60] if base_url_str else "")
|
||||
return CodexAuxiliaryClient(client_obj, final_model_str)
|
||||
return client_obj
|
||||
|
||||
# ── Auto: try all providers in priority order ────────────────────
|
||||
if provider == "auto":
|
||||
client, resolved = _resolve_auto()
|
||||
@@ -1336,6 +1375,7 @@ def resolve_provider_client(
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
extra["default_headers"] = copilot_default_headers()
|
||||
client = OpenAI(api_key=custom_key, base_url=custom_base, **extra)
|
||||
client = _wrap_if_needed(client, final_model, custom_base)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
else (client, final_model))
|
||||
# Try custom first, then codex, then API-key providers
|
||||
@@ -1344,6 +1384,8 @@ def resolve_provider_client(
|
||||
client, default = try_fn()
|
||||
if client is not None:
|
||||
final_model = _normalize_resolved_model(model or default, provider)
|
||||
_cbase = str(getattr(client, "base_url", "") or "")
|
||||
client = _wrap_if_needed(client, final_model, _cbase)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
else (client, final_model))
|
||||
logger.warning("resolve_provider_client: custom/main requested "
|
||||
@@ -1363,6 +1405,7 @@ def resolve_provider_client(
|
||||
provider,
|
||||
)
|
||||
client = OpenAI(api_key=custom_key, base_url=custom_base)
|
||||
client = _wrap_if_needed(client, final_model, custom_base)
|
||||
logger.debug(
|
||||
"resolve_provider_client: named custom provider %r (%s)",
|
||||
provider, final_model)
|
||||
@@ -1442,6 +1485,11 @@ def resolve_provider_client(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Honor api_mode for any API-key provider (e.g. direct OpenAI with
|
||||
# codex-family models). The copilot-specific wrapping above handles
|
||||
# copilot; this covers the general case (#6800).
|
||||
client = _wrap_if_needed(client, final_model, base_url)
|
||||
|
||||
logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
else (client, final_model))
|
||||
@@ -1474,12 +1522,13 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
|
||||
Callers may override the returned model with a per-task env var
|
||||
(e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
|
||||
"""
|
||||
provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
|
||||
provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None)
|
||||
return resolve_provider_client(
|
||||
provider,
|
||||
model=model,
|
||||
explicit_base_url=base_url,
|
||||
explicit_api_key=api_key,
|
||||
api_mode=api_mode,
|
||||
)
|
||||
|
||||
|
||||
@@ -1490,13 +1539,14 @@ def get_async_text_auxiliary_client(task: str = ""):
|
||||
(AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
|
||||
Returns (None, None) when no provider is available.
|
||||
"""
|
||||
provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
|
||||
provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None)
|
||||
return resolve_provider_client(
|
||||
provider,
|
||||
model=model,
|
||||
async_mode=True,
|
||||
explicit_base_url=base_url,
|
||||
explicit_api_key=api_key,
|
||||
api_mode=api_mode,
|
||||
)
|
||||
|
||||
|
||||
@@ -1569,7 +1619,7 @@ def resolve_vision_provider_client(
|
||||
backends, so users can intentionally force experimental providers. Auto mode
|
||||
stays conservative and only tries vision backends known to work today.
|
||||
"""
|
||||
requested, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
|
||||
requested, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
|
||||
"vision", provider, model, base_url, api_key
|
||||
)
|
||||
requested = _normalize_vision_provider(requested)
|
||||
@@ -1791,6 +1841,7 @@ def _get_cached_client(
|
||||
async_mode: bool = False,
|
||||
base_url: str = None,
|
||||
api_key: str = None,
|
||||
api_mode: str = None,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Get or create a cached client for the given provider.
|
||||
|
||||
@@ -1814,7 +1865,7 @@ def _get_cached_client(
|
||||
loop_id = id(current_loop)
|
||||
except RuntimeError:
|
||||
pass
|
||||
cache_key = (provider, async_mode, base_url or "", api_key or "", loop_id)
|
||||
cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id)
|
||||
with _client_cache_lock:
|
||||
if cache_key in _client_cache:
|
||||
cached_client, cached_default, cached_loop = _client_cache[cache_key]
|
||||
@@ -1836,6 +1887,7 @@ def _get_cached_client(
|
||||
async_mode,
|
||||
explicit_base_url=base_url,
|
||||
explicit_api_key=api_key,
|
||||
api_mode=api_mode,
|
||||
)
|
||||
if client is not None:
|
||||
# For async clients, remember which loop they were created on so we
|
||||
@@ -1855,7 +1907,7 @@ def _resolve_task_provider_model(
|
||||
model: str = None,
|
||||
base_url: str = None,
|
||||
api_key: str = None,
|
||||
) -> Tuple[str, Optional[str], Optional[str], Optional[str]]:
|
||||
) -> Tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
|
||||
"""Determine provider + model for a call.
|
||||
|
||||
Priority:
|
||||
@@ -1864,15 +1916,17 @@ def _resolve_task_provider_model(
|
||||
3. Config file (auxiliary.{task}.* or compression.*)
|
||||
4. "auto" (full auto-detection chain)
|
||||
|
||||
Returns (provider, model, base_url, api_key) where model may be None
|
||||
(use provider default). When base_url is set, provider is forced to
|
||||
"custom" and the task uses that direct endpoint.
|
||||
Returns (provider, model, base_url, api_key, api_mode) where model may
|
||||
be None (use provider default). When base_url is set, provider is forced
|
||||
to "custom" and the task uses that direct endpoint. api_mode is one of
|
||||
"chat_completions", "codex_responses", or None (auto-detect).
|
||||
"""
|
||||
config = {}
|
||||
cfg_provider = None
|
||||
cfg_model = None
|
||||
cfg_base_url = None
|
||||
cfg_api_key = None
|
||||
cfg_api_mode = None
|
||||
|
||||
if task:
|
||||
try:
|
||||
@@ -1889,6 +1943,7 @@ def _resolve_task_provider_model(
|
||||
cfg_model = str(task_config.get("model", "")).strip() or None
|
||||
cfg_base_url = str(task_config.get("base_url", "")).strip() or None
|
||||
cfg_api_key = str(task_config.get("api_key", "")).strip() or None
|
||||
cfg_api_mode = str(task_config.get("api_mode", "")).strip() or None
|
||||
|
||||
# Backwards compat: compression section has its own keys.
|
||||
# The auxiliary.compression defaults to provider="auto", so treat
|
||||
@@ -1902,30 +1957,32 @@ def _resolve_task_provider_model(
|
||||
cfg_base_url = cfg_base_url or _sbu.strip() or None
|
||||
|
||||
env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
|
||||
env_api_mode = _get_auxiliary_env_override(task, "API_MODE") if task else None
|
||||
resolved_model = model or env_model or cfg_model
|
||||
resolved_api_mode = env_api_mode or cfg_api_mode
|
||||
|
||||
if base_url:
|
||||
return "custom", resolved_model, base_url, api_key
|
||||
return "custom", resolved_model, base_url, api_key, resolved_api_mode
|
||||
if provider:
|
||||
return provider, resolved_model, base_url, api_key
|
||||
return provider, resolved_model, base_url, api_key, resolved_api_mode
|
||||
|
||||
if task:
|
||||
env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
|
||||
env_api_key = _get_auxiliary_env_override(task, "API_KEY")
|
||||
if env_base_url:
|
||||
return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key
|
||||
return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key, resolved_api_mode
|
||||
|
||||
env_provider = _get_auxiliary_provider(task)
|
||||
if env_provider != "auto":
|
||||
return env_provider, resolved_model, None, None
|
||||
return env_provider, resolved_model, None, None, resolved_api_mode
|
||||
|
||||
if cfg_base_url:
|
||||
return "custom", resolved_model, cfg_base_url, cfg_api_key
|
||||
return "custom", resolved_model, cfg_base_url, cfg_api_key, resolved_api_mode
|
||||
if cfg_provider and cfg_provider != "auto":
|
||||
return cfg_provider, resolved_model, None, None
|
||||
return "auto", resolved_model, None, None
|
||||
return cfg_provider, resolved_model, None, None, resolved_api_mode
|
||||
return "auto", resolved_model, None, None, resolved_api_mode
|
||||
|
||||
return "auto", resolved_model, None, None
|
||||
return "auto", resolved_model, None, None, resolved_api_mode
|
||||
|
||||
|
||||
_DEFAULT_AUX_TIMEOUT = 30.0
|
||||
@@ -2035,7 +2092,7 @@ def call_llm(
|
||||
Raises:
|
||||
RuntimeError: If no provider is configured.
|
||||
"""
|
||||
resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
|
||||
resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
|
||||
task, provider, model, base_url, api_key)
|
||||
|
||||
if task == "vision":
|
||||
@@ -2068,6 +2125,7 @@ def call_llm(
|
||||
resolved_model,
|
||||
base_url=resolved_base_url,
|
||||
api_key=resolved_api_key,
|
||||
api_mode=resolved_api_mode,
|
||||
)
|
||||
if client is None:
|
||||
# When the user explicitly chose a non-OpenRouter provider but no
|
||||
@@ -2229,7 +2287,7 @@ async def async_call_llm(
|
||||
|
||||
Same as call_llm() but async. See call_llm() for full documentation.
|
||||
"""
|
||||
resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
|
||||
resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
|
||||
task, provider, model, base_url, api_key)
|
||||
|
||||
if task == "vision":
|
||||
@@ -2263,6 +2321,7 @@ async def async_call_llm(
|
||||
async_mode=True,
|
||||
base_url=resolved_base_url,
|
||||
api_key=resolved_api_key,
|
||||
api_mode=resolved_api_mode,
|
||||
)
|
||||
if client is None:
|
||||
_explicit = (resolved_provider or "").strip().lower()
|
||||
|
||||
@@ -1136,7 +1136,7 @@ class TestCallLlmPaymentFallback:
|
||||
with patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(primary_client, "google/gemini-3-flash-preview")), \
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openrouter", "google/gemini-3-flash-preview", None, None)), \
|
||||
return_value=("openrouter", "google/gemini-3-flash-preview", None, None, None)), \
|
||||
patch("agent.auxiliary_client._try_payment_fallback",
|
||||
return_value=(fallback_client, "gpt-5.2-codex", "openai-codex")) as mock_fb:
|
||||
result = call_llm(
|
||||
@@ -1162,7 +1162,7 @@ class TestCallLlmPaymentFallback:
|
||||
with patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(primary_client, "google/gemini-3-flash-preview")), \
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openrouter", "google/gemini-3-flash-preview", None, None)):
|
||||
return_value=("openrouter", "google/gemini-3-flash-preview", None, None, None)):
|
||||
with pytest.raises(Exception, match="Internal Server Error"):
|
||||
call_llm(
|
||||
task="compression",
|
||||
@@ -1179,7 +1179,7 @@ class TestCallLlmPaymentFallback:
|
||||
with patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(primary_client, "google/gemini-3-flash-preview")), \
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openrouter", "google/gemini-3-flash-preview", None, None)), \
|
||||
return_value=("openrouter", "google/gemini-3-flash-preview", None, None, None)), \
|
||||
patch("agent.auxiliary_client._try_payment_fallback",
|
||||
return_value=(None, None, "")):
|
||||
with pytest.raises(Exception, match="insufficient credits"):
|
||||
|
||||
Reference in New Issue
Block a user