diff --git a/cli.py b/cli.py index 015e5bde72..659fa97410 100644 --- a/cli.py +++ b/cli.py @@ -2572,7 +2572,7 @@ class HermesCLI: def _resolve_turn_agent_config(self, user_message: str) -> dict: """Resolve model/runtime overrides for a single user turn.""" from agent.smart_model_routing import resolve_turn_route - from hermes_cli.models import resolve_fast_mode_runtime + from hermes_cli.models import resolve_fast_mode_overrides route = resolve_turn_route( user_message, @@ -2595,27 +2595,10 @@ class HermesCLI: return route try: - fast_runtime = resolve_fast_mode_runtime(route.get("model")) + overrides = resolve_fast_mode_overrides(route.get("model")) except Exception: - route["request_overrides"] = None - return route - if not fast_runtime: - route["request_overrides"] = None - return route - - runtime = fast_runtime["runtime"] - route["runtime"] = runtime - route["request_overrides"] = fast_runtime["request_overrides"] - route["label"] = f"fast route → {route.get('model')} ({runtime.get('provider')})" - route["signature"] = ( - route.get("model"), - runtime.get("provider"), - runtime.get("base_url"), - runtime.get("api_mode"), - runtime.get("command"), - tuple(runtime.get("args") or ()), - json.dumps(route["request_overrides"], sort_keys=True), - ) + overrides = None + route["request_overrides"] = overrides return route def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None, request_overrides: dict | None = None) -> bool: @@ -5662,15 +5645,15 @@ class HermesCLI: _cprint(f" {_GOLD}✓ Reasoning effort set to '{arg}' (session only){_RST}") def _handle_fast_command(self, cmd: str): - """Handle /fast — choose the Codex Responses service tier.""" + """Handle /fast — toggle OpenAI Priority Processing (service_tier).""" if not self._fast_command_available(): - _cprint(" (._.) /fast is only available for models that explicitly expose a fast backend.") + _cprint(" (._.) /fast is only available for OpenAI models that support Priority Processing.") return parts = cmd.strip().split(maxsplit=1) if len(parts) < 2 or parts[1].strip().lower() == "status": status = "fast" if self.service_tier == "priority" else "normal" - _cprint(f" {_GOLD}Codex inference tier: {status}{_RST}") + _cprint(f" {_GOLD}Priority Processing: {status}{_RST}") _cprint(f" {_DIM}Usage: /fast [normal|fast|status]{_RST}") return @@ -5691,9 +5674,9 @@ class HermesCLI: self.agent = None # Force agent re-init with new service-tier config if save_config_value("agent.service_tier", saved_value): - _cprint(f" {_GOLD}✓ Codex inference tier set to {label} (saved to config){_RST}") + _cprint(f" {_GOLD}✓ Priority Processing set to {label} (saved to config){_RST}") else: - _cprint(f" {_GOLD}✓ Codex inference tier set to {label} (session only){_RST}") + _cprint(f" {_GOLD}✓ Priority Processing set to {label} (session only){_RST}") def _on_reasoning(self, reasoning_text: str): """Callback for intermediate reasoning display during tool-call loops.""" diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 9260a6c6f7..e0368440ff 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -100,7 +100,7 @@ COMMAND_REGISTRY: list[CommandDef] = [ CommandDef("reasoning", "Manage reasoning effort and display", "Configuration", args_hint="[level|show|hide]", subcommands=("none", "minimal", "low", "medium", "high", "xhigh", "show", "hide", "on", "off")), - CommandDef("fast", "Choose Codex inference tier (Normal/Fast)", "Configuration", + CommandDef("fast", "Toggle OpenAI Priority Processing (Normal/Fast)", "Configuration", cli_only=True, args_hint="[normal|fast|status]", subcommands=("normal", "fast", "status", "on", "off")), CommandDef("skin", "Show or change the display skin/theme", "Configuration", diff --git a/hermes_cli/models.py b/hermes_cli/models.py index b5485ab892..530c1ec6ce 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -1017,58 +1017,45 @@ def provider_label(provider: Optional[str]) -> str: return _PROVIDER_LABELS.get(normalized, original or "OpenRouter") -_FAST_MODE_BACKEND_CONFIG: dict[str, dict[str, Any]] = { - "gpt-5.4": { - "provider": "openai-codex", - "request_overrides": {"service_tier": "priority"}, - }, -} - - -def fast_mode_backend_config(model_id: Optional[str]) -> dict[str, Any] | None: - """Return backend config for models that expose Fast mode. - - To expose Fast mode for a new model, add its normalized model slug to - ``_FAST_MODE_BACKEND_CONFIG`` along with the backend runtime selection and - backend-specific request overrides Hermes should apply. - """ - raw = str(model_id or "").strip().lower() - if "/" in raw: - raw = raw.split("/", 1)[1] - config = _FAST_MODE_BACKEND_CONFIG.get(raw) - return dict(config) if config else None +# Models that support OpenAI Priority Processing (service_tier="priority"). +# See https://openai.com/api-priority-processing/ for the canonical list. +# Only the bare model slug is stored (no vendor prefix). +_PRIORITY_PROCESSING_MODELS: frozenset[str] = frozenset({ + "gpt-5.4", + "gpt-5.4-mini", + "gpt-5.2", + "gpt-5.1", + "gpt-5", + "gpt-5-mini", + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", + "gpt-4o", + "gpt-4o-mini", + "o3", + "o4-mini", +}) def model_supports_fast_mode(model_id: Optional[str]) -> bool: - """Return whether Hermes should expose Fast mode for the active model.""" - return fast_mode_backend_config(model_id) is not None + """Return whether Hermes should expose the /fast (Priority Processing) toggle.""" + raw = str(model_id or "").strip().lower() + if "/" in raw: + raw = raw.split("/", 1)[1] + return raw in _PRIORITY_PROCESSING_MODELS -def resolve_fast_mode_runtime(model_id: Optional[str]) -> dict[str, Any] | None: - """Resolve runtime selection and request overrides for a fast-mode model.""" - cfg = fast_mode_backend_config(model_id) - if not cfg: +def resolve_fast_mode_overrides(model_id: Optional[str]) -> dict[str, Any] | None: + """Return request_overrides for Priority Processing, or None if unsupported. + + Unlike the previous ``resolve_fast_mode_runtime``, this does NOT force a + provider/backend switch. The ``service_tier`` parameter is injected into + whatever API path the user is already on (Codex Responses, Chat Completions, + or OpenRouter passthrough). + """ + if not model_supports_fast_mode(model_id): return None - - from hermes_cli.runtime_provider import resolve_runtime_provider - - runtime = resolve_runtime_provider( - requested=cfg.get("provider"), - explicit_base_url=cfg.get("base_url"), - explicit_api_key=cfg.get("api_key"), - ) - return { - "runtime": { - "api_key": runtime.get("api_key"), - "base_url": runtime.get("base_url"), - "provider": runtime.get("provider"), - "api_mode": runtime.get("api_mode"), - "command": runtime.get("command"), - "args": list(runtime.get("args") or []), - "credential_pool": runtime.get("credential_pool"), - }, - "request_overrides": dict(cfg.get("request_overrides") or {}), - } + return {"service_tier": "priority"} def _resolve_copilot_catalog_api_key() -> str: diff --git a/run_agent.py b/run_agent.py index bee98ed007..448b0004b1 100644 --- a/run_agent.py +++ b/run_agent.py @@ -5686,6 +5686,11 @@ class AIAgent: if "x.ai" in self._base_url_lower and hasattr(self, "session_id") and self.session_id: api_kwargs["extra_headers"] = {"x-grok-conv-id": self.session_id} + # Priority Processing / generic request overrides (e.g. service_tier). + # Applied last so overrides win over any defaults set above. + if self.request_overrides: + api_kwargs.update(self.request_overrides) + return api_kwargs def _supports_reasoning_extra_body(self) -> bool: diff --git a/tests/cli/test_fast_command.py b/tests/cli/test_fast_command.py index 0305bf599c..907808d32a 100644 --- a/tests/cli/test_fast_command.py +++ b/tests/cli/test_fast_command.py @@ -108,15 +108,52 @@ class TestHandleFastCommand(unittest.TestCase): self.assertTrue(mock_cprint.called) -class TestFastModeRegistry(unittest.TestCase): - def test_only_gpt_5_4_is_enabled_for_codex(self): - from hermes_cli.models import fast_mode_backend_config +class TestPriorityProcessingModels(unittest.TestCase): + """Verify the expanded Priority Processing model registry.""" - assert fast_mode_backend_config("gpt-5.4") == { - "provider": "openai-codex", - "request_overrides": {"service_tier": "priority"}, - } - assert fast_mode_backend_config("gpt-5.3-codex") is None + def test_all_documented_models_supported(self): + from hermes_cli.models import model_supports_fast_mode + + # All models from OpenAI's Priority Processing pricing table + supported = [ + "gpt-5.4", "gpt-5.4-mini", "gpt-5.2", + "gpt-5.1", "gpt-5", "gpt-5-mini", + "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4o", "gpt-4o-mini", + "o3", "o4-mini", + ] + for model in supported: + assert model_supports_fast_mode(model), f"{model} should support fast mode" + + def test_vendor_prefix_stripped(self): + from hermes_cli.models import model_supports_fast_mode + + assert model_supports_fast_mode("openai/gpt-5.4") is True + assert model_supports_fast_mode("openai/gpt-4.1") is True + assert model_supports_fast_mode("openai/o3") is True + + def test_non_priority_models_rejected(self): + from hermes_cli.models import model_supports_fast_mode + + assert model_supports_fast_mode("gpt-5.3-codex") is False + assert model_supports_fast_mode("claude-sonnet-4") is False + assert model_supports_fast_mode("") is False + assert model_supports_fast_mode(None) is False + + def test_resolve_overrides_returns_service_tier(self): + from hermes_cli.models import resolve_fast_mode_overrides + + result = resolve_fast_mode_overrides("gpt-5.4") + assert result == {"service_tier": "priority"} + + result = resolve_fast_mode_overrides("gpt-4.1") + assert result == {"service_tier": "priority"} + + def test_resolve_overrides_none_for_unsupported(self): + from hermes_cli.models import resolve_fast_mode_overrides + + assert resolve_fast_mode_overrides("gpt-5.3-codex") is None + assert resolve_fast_mode_overrides("claude-sonnet-4") is None class TestFastModeRouting(unittest.TestCase): @@ -126,7 +163,16 @@ class TestFastModeRouting(unittest.TestCase): assert cli_mod.HermesCLI._fast_command_available(stub) is True - def test_turn_route_switches_to_model_backend_when_fast_enabled(self): + def test_fast_command_exposed_for_non_codex_models(self): + cli_mod = _import_cli() + stub = SimpleNamespace(provider="openai", requested_provider="openai", model="gpt-4.1", agent=None) + assert cli_mod.HermesCLI._fast_command_available(stub) is True + + stub = SimpleNamespace(provider="openrouter", requested_provider="openrouter", model="o3", agent=None) + assert cli_mod.HermesCLI._fast_command_available(stub) is True + + def test_turn_route_injects_overrides_without_provider_switch(self): + """Fast mode should add request_overrides but NOT change the provider/runtime.""" cli_mod = _import_cli() stub = SimpleNamespace( model="gpt-5.4", @@ -141,35 +187,28 @@ class TestFastModeRouting(unittest.TestCase): service_tier="priority", ) - with ( - patch("agent.smart_model_routing.resolve_turn_route", return_value={ - "model": "gpt-5.4", - "runtime": { - "api_key": "primary-key", - "base_url": "https://openrouter.ai/api/v1", - "provider": "openrouter", - "api_mode": "chat_completions", - "command": None, - "args": [], - "credential_pool": None, - }, - "label": None, - "signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()), - }), - patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value={ - "provider": "openai-codex", - "api_mode": "codex_responses", - "base_url": "https://chatgpt.com/backend-api/codex", - "api_key": "codex-key", - "command": None, - "args": [], - "credential_pool": None, - }), - ): + original_runtime = { + "api_key": "***", + "base_url": "https://openrouter.ai/api/v1", + "provider": "openrouter", + "api_mode": "chat_completions", + "command": None, + "args": [], + "credential_pool": None, + } + + with patch("agent.smart_model_routing.resolve_turn_route", return_value={ + "model": "gpt-5.4", + "runtime": dict(original_runtime), + "label": None, + "signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()), + }): route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") - assert route["runtime"]["provider"] == "openai-codex" - assert route["runtime"]["api_mode"] == "codex_responses" + # Provider should NOT have changed + assert route["runtime"]["provider"] == "openrouter" + assert route["runtime"]["api_mode"] == "chat_completions" + # But request_overrides should be set assert route["request_overrides"] == {"service_tier": "priority"} def test_turn_route_keeps_primary_runtime_when_model_has_no_fast_backend(self): @@ -190,7 +229,7 @@ class TestFastModeRouting(unittest.TestCase): primary_route = { "model": "gpt-5.3-codex", "runtime": { - "api_key": "primary-key", + "api_key": "***", "base_url": "https://openrouter.ai/api/v1", "provider": "openrouter", "api_mode": "chat_completions", diff --git a/tests/run_agent/test_provider_parity.py b/tests/run_agent/test_provider_parity.py index 0948525303..067ecf6720 100644 --- a/tests/run_agent/test_provider_parity.py +++ b/tests/run_agent/test_provider_parity.py @@ -225,6 +225,26 @@ class TestDeveloperRoleSwap: assert kwargs["messages"][0]["role"] == "developer" +class TestBuildApiKwargsChatCompletionsServiceTier: + """service_tier via request_overrides works on the chat_completions path.""" + + def test_includes_service_tier_via_request_overrides(self, monkeypatch): + agent = _make_agent(monkeypatch, "openrouter") + agent.model = "gpt-4.1" + agent.request_overrides = {"service_tier": "priority"} + messages = [{"role": "user", "content": "hi"}] + kwargs = agent._build_api_kwargs(messages) + assert kwargs["service_tier"] == "priority" + + def test_no_service_tier_when_overrides_empty(self, monkeypatch): + agent = _make_agent(monkeypatch, "openrouter") + agent.model = "gpt-4.1" + agent.request_overrides = {} + messages = [{"role": "user", "content": "hi"}] + kwargs = agent._build_api_kwargs(messages) + assert "service_tier" not in kwargs + + class TestBuildApiKwargsAIGateway: def test_uses_chat_completions_format(self, monkeypatch): agent = _make_agent(monkeypatch, "ai-gateway", base_url="https://ai-gateway.vercel.sh/v1")