Compare commits

...

1 Commit

Author SHA1 Message Date
Teknium
7d001a2da2 feat: expand /fast to all OpenAI Priority Processing models
Previously /fast only supported gpt-5.4 and forced a provider switch to
openai-codex. Now supports all 13 models from OpenAI's Priority Processing
pricing table (gpt-5.4, gpt-5.4-mini, gpt-5.2, gpt-5.1, gpt-5, gpt-5-mini,
gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4o, gpt-4o-mini, o3, o4-mini).

Key changes:
- Replaced _FAST_MODE_BACKEND_CONFIG with _PRIORITY_PROCESSING_MODELS frozenset
- Removed provider-forcing logic — service_tier is now injected into whatever
  API path the user is already on (Codex Responses, Chat Completions, or
  OpenRouter passthrough)
- Added request_overrides support to chat_completions path in run_agent.py
- Updated messaging from 'Codex inference tier' to 'Priority Processing'
- Expanded test coverage for all supported models
2026-04-09 22:06:08 -07:00
6 changed files with 144 additions and 110 deletions

35
cli.py
View File

@@ -2572,7 +2572,7 @@ class HermesCLI:
def _resolve_turn_agent_config(self, user_message: str) -> dict:
"""Resolve model/runtime overrides for a single user turn."""
from agent.smart_model_routing import resolve_turn_route
from hermes_cli.models import resolve_fast_mode_runtime
from hermes_cli.models import resolve_fast_mode_overrides
route = resolve_turn_route(
user_message,
@@ -2595,27 +2595,10 @@ class HermesCLI:
return route
try:
fast_runtime = resolve_fast_mode_runtime(route.get("model"))
overrides = resolve_fast_mode_overrides(route.get("model"))
except Exception:
route["request_overrides"] = None
return route
if not fast_runtime:
route["request_overrides"] = None
return route
runtime = fast_runtime["runtime"]
route["runtime"] = runtime
route["request_overrides"] = fast_runtime["request_overrides"]
route["label"] = f"fast route → {route.get('model')} ({runtime.get('provider')})"
route["signature"] = (
route.get("model"),
runtime.get("provider"),
runtime.get("base_url"),
runtime.get("api_mode"),
runtime.get("command"),
tuple(runtime.get("args") or ()),
json.dumps(route["request_overrides"], sort_keys=True),
)
overrides = None
route["request_overrides"] = overrides
return route
def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None, request_overrides: dict | None = None) -> bool:
@@ -5662,15 +5645,15 @@ class HermesCLI:
_cprint(f" {_GOLD}✓ Reasoning effort set to '{arg}' (session only){_RST}")
def _handle_fast_command(self, cmd: str):
"""Handle /fast — choose the Codex Responses service tier."""
"""Handle /fast — toggle OpenAI Priority Processing (service_tier)."""
if not self._fast_command_available():
_cprint(" (._.) /fast is only available for models that explicitly expose a fast backend.")
_cprint(" (._.) /fast is only available for OpenAI models that support Priority Processing.")
return
parts = cmd.strip().split(maxsplit=1)
if len(parts) < 2 or parts[1].strip().lower() == "status":
status = "fast" if self.service_tier == "priority" else "normal"
_cprint(f" {_GOLD}Codex inference tier: {status}{_RST}")
_cprint(f" {_GOLD}Priority Processing: {status}{_RST}")
_cprint(f" {_DIM}Usage: /fast [normal|fast|status]{_RST}")
return
@@ -5691,9 +5674,9 @@ class HermesCLI:
self.agent = None # Force agent re-init with new service-tier config
if save_config_value("agent.service_tier", saved_value):
_cprint(f" {_GOLD}Codex inference tier set to {label} (saved to config){_RST}")
_cprint(f" {_GOLD}Priority Processing set to {label} (saved to config){_RST}")
else:
_cprint(f" {_GOLD}Codex inference tier set to {label} (session only){_RST}")
_cprint(f" {_GOLD}Priority Processing set to {label} (session only){_RST}")
def _on_reasoning(self, reasoning_text: str):
"""Callback for intermediate reasoning display during tool-call loops."""

View File

@@ -100,7 +100,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
CommandDef("reasoning", "Manage reasoning effort and display", "Configuration",
args_hint="[level|show|hide]",
subcommands=("none", "minimal", "low", "medium", "high", "xhigh", "show", "hide", "on", "off")),
CommandDef("fast", "Choose Codex inference tier (Normal/Fast)", "Configuration",
CommandDef("fast", "Toggle OpenAI Priority Processing (Normal/Fast)", "Configuration",
cli_only=True, args_hint="[normal|fast|status]",
subcommands=("normal", "fast", "status", "on", "off")),
CommandDef("skin", "Show or change the display skin/theme", "Configuration",

View File

@@ -1017,58 +1017,45 @@ def provider_label(provider: Optional[str]) -> str:
return _PROVIDER_LABELS.get(normalized, original or "OpenRouter")
_FAST_MODE_BACKEND_CONFIG: dict[str, dict[str, Any]] = {
"gpt-5.4": {
"provider": "openai-codex",
"request_overrides": {"service_tier": "priority"},
},
}
def fast_mode_backend_config(model_id: Optional[str]) -> dict[str, Any] | None:
"""Return backend config for models that expose Fast mode.
To expose Fast mode for a new model, add its normalized model slug to
``_FAST_MODE_BACKEND_CONFIG`` along with the backend runtime selection and
backend-specific request overrides Hermes should apply.
"""
raw = str(model_id or "").strip().lower()
if "/" in raw:
raw = raw.split("/", 1)[1]
config = _FAST_MODE_BACKEND_CONFIG.get(raw)
return dict(config) if config else None
# Models that support OpenAI Priority Processing (service_tier="priority").
# See https://openai.com/api-priority-processing/ for the canonical list.
# Only the bare model slug is stored (no vendor prefix).
_PRIORITY_PROCESSING_MODELS: frozenset[str] = frozenset({
"gpt-5.4",
"gpt-5.4-mini",
"gpt-5.2",
"gpt-5.1",
"gpt-5",
"gpt-5-mini",
"gpt-4.1",
"gpt-4.1-mini",
"gpt-4.1-nano",
"gpt-4o",
"gpt-4o-mini",
"o3",
"o4-mini",
})
def model_supports_fast_mode(model_id: Optional[str]) -> bool:
"""Return whether Hermes should expose Fast mode for the active model."""
return fast_mode_backend_config(model_id) is not None
"""Return whether Hermes should expose the /fast (Priority Processing) toggle."""
raw = str(model_id or "").strip().lower()
if "/" in raw:
raw = raw.split("/", 1)[1]
return raw in _PRIORITY_PROCESSING_MODELS
def resolve_fast_mode_runtime(model_id: Optional[str]) -> dict[str, Any] | None:
"""Resolve runtime selection and request overrides for a fast-mode model."""
cfg = fast_mode_backend_config(model_id)
if not cfg:
def resolve_fast_mode_overrides(model_id: Optional[str]) -> dict[str, Any] | None:
"""Return request_overrides for Priority Processing, or None if unsupported.
Unlike the previous ``resolve_fast_mode_runtime``, this does NOT force a
provider/backend switch. The ``service_tier`` parameter is injected into
whatever API path the user is already on (Codex Responses, Chat Completions,
or OpenRouter passthrough).
"""
if not model_supports_fast_mode(model_id):
return None
from hermes_cli.runtime_provider import resolve_runtime_provider
runtime = resolve_runtime_provider(
requested=cfg.get("provider"),
explicit_base_url=cfg.get("base_url"),
explicit_api_key=cfg.get("api_key"),
)
return {
"runtime": {
"api_key": runtime.get("api_key"),
"base_url": runtime.get("base_url"),
"provider": runtime.get("provider"),
"api_mode": runtime.get("api_mode"),
"command": runtime.get("command"),
"args": list(runtime.get("args") or []),
"credential_pool": runtime.get("credential_pool"),
},
"request_overrides": dict(cfg.get("request_overrides") or {}),
}
return {"service_tier": "priority"}
def _resolve_copilot_catalog_api_key() -> str:

View File

@@ -5686,6 +5686,11 @@ class AIAgent:
if "x.ai" in self._base_url_lower and hasattr(self, "session_id") and self.session_id:
api_kwargs["extra_headers"] = {"x-grok-conv-id": self.session_id}
# Priority Processing / generic request overrides (e.g. service_tier).
# Applied last so overrides win over any defaults set above.
if self.request_overrides:
api_kwargs.update(self.request_overrides)
return api_kwargs
def _supports_reasoning_extra_body(self) -> bool:

View File

@@ -108,15 +108,52 @@ class TestHandleFastCommand(unittest.TestCase):
self.assertTrue(mock_cprint.called)
class TestFastModeRegistry(unittest.TestCase):
def test_only_gpt_5_4_is_enabled_for_codex(self):
from hermes_cli.models import fast_mode_backend_config
class TestPriorityProcessingModels(unittest.TestCase):
"""Verify the expanded Priority Processing model registry."""
assert fast_mode_backend_config("gpt-5.4") == {
"provider": "openai-codex",
"request_overrides": {"service_tier": "priority"},
}
assert fast_mode_backend_config("gpt-5.3-codex") is None
def test_all_documented_models_supported(self):
from hermes_cli.models import model_supports_fast_mode
# All models from OpenAI's Priority Processing pricing table
supported = [
"gpt-5.4", "gpt-5.4-mini", "gpt-5.2",
"gpt-5.1", "gpt-5", "gpt-5-mini",
"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
"gpt-4o", "gpt-4o-mini",
"o3", "o4-mini",
]
for model in supported:
assert model_supports_fast_mode(model), f"{model} should support fast mode"
def test_vendor_prefix_stripped(self):
from hermes_cli.models import model_supports_fast_mode
assert model_supports_fast_mode("openai/gpt-5.4") is True
assert model_supports_fast_mode("openai/gpt-4.1") is True
assert model_supports_fast_mode("openai/o3") is True
def test_non_priority_models_rejected(self):
from hermes_cli.models import model_supports_fast_mode
assert model_supports_fast_mode("gpt-5.3-codex") is False
assert model_supports_fast_mode("claude-sonnet-4") is False
assert model_supports_fast_mode("") is False
assert model_supports_fast_mode(None) is False
def test_resolve_overrides_returns_service_tier(self):
from hermes_cli.models import resolve_fast_mode_overrides
result = resolve_fast_mode_overrides("gpt-5.4")
assert result == {"service_tier": "priority"}
result = resolve_fast_mode_overrides("gpt-4.1")
assert result == {"service_tier": "priority"}
def test_resolve_overrides_none_for_unsupported(self):
from hermes_cli.models import resolve_fast_mode_overrides
assert resolve_fast_mode_overrides("gpt-5.3-codex") is None
assert resolve_fast_mode_overrides("claude-sonnet-4") is None
class TestFastModeRouting(unittest.TestCase):
@@ -126,7 +163,16 @@ class TestFastModeRouting(unittest.TestCase):
assert cli_mod.HermesCLI._fast_command_available(stub) is True
def test_turn_route_switches_to_model_backend_when_fast_enabled(self):
def test_fast_command_exposed_for_non_codex_models(self):
cli_mod = _import_cli()
stub = SimpleNamespace(provider="openai", requested_provider="openai", model="gpt-4.1", agent=None)
assert cli_mod.HermesCLI._fast_command_available(stub) is True
stub = SimpleNamespace(provider="openrouter", requested_provider="openrouter", model="o3", agent=None)
assert cli_mod.HermesCLI._fast_command_available(stub) is True
def test_turn_route_injects_overrides_without_provider_switch(self):
"""Fast mode should add request_overrides but NOT change the provider/runtime."""
cli_mod = _import_cli()
stub = SimpleNamespace(
model="gpt-5.4",
@@ -141,35 +187,28 @@ class TestFastModeRouting(unittest.TestCase):
service_tier="priority",
)
with (
patch("agent.smart_model_routing.resolve_turn_route", return_value={
"model": "gpt-5.4",
"runtime": {
"api_key": "primary-key",
"base_url": "https://openrouter.ai/api/v1",
"provider": "openrouter",
"api_mode": "chat_completions",
"command": None,
"args": [],
"credential_pool": None,
},
"label": None,
"signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()),
}),
patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value={
"provider": "openai-codex",
"api_mode": "codex_responses",
"base_url": "https://chatgpt.com/backend-api/codex",
"api_key": "codex-key",
"command": None,
"args": [],
"credential_pool": None,
}),
):
original_runtime = {
"api_key": "***",
"base_url": "https://openrouter.ai/api/v1",
"provider": "openrouter",
"api_mode": "chat_completions",
"command": None,
"args": [],
"credential_pool": None,
}
with patch("agent.smart_model_routing.resolve_turn_route", return_value={
"model": "gpt-5.4",
"runtime": dict(original_runtime),
"label": None,
"signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()),
}):
route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi")
assert route["runtime"]["provider"] == "openai-codex"
assert route["runtime"]["api_mode"] == "codex_responses"
# Provider should NOT have changed
assert route["runtime"]["provider"] == "openrouter"
assert route["runtime"]["api_mode"] == "chat_completions"
# But request_overrides should be set
assert route["request_overrides"] == {"service_tier": "priority"}
def test_turn_route_keeps_primary_runtime_when_model_has_no_fast_backend(self):
@@ -190,7 +229,7 @@ class TestFastModeRouting(unittest.TestCase):
primary_route = {
"model": "gpt-5.3-codex",
"runtime": {
"api_key": "primary-key",
"api_key": "***",
"base_url": "https://openrouter.ai/api/v1",
"provider": "openrouter",
"api_mode": "chat_completions",

View File

@@ -225,6 +225,26 @@ class TestDeveloperRoleSwap:
assert kwargs["messages"][0]["role"] == "developer"
class TestBuildApiKwargsChatCompletionsServiceTier:
"""service_tier via request_overrides works on the chat_completions path."""
def test_includes_service_tier_via_request_overrides(self, monkeypatch):
agent = _make_agent(monkeypatch, "openrouter")
agent.model = "gpt-4.1"
agent.request_overrides = {"service_tier": "priority"}
messages = [{"role": "user", "content": "hi"}]
kwargs = agent._build_api_kwargs(messages)
assert kwargs["service_tier"] == "priority"
def test_no_service_tier_when_overrides_empty(self, monkeypatch):
agent = _make_agent(monkeypatch, "openrouter")
agent.model = "gpt-4.1"
agent.request_overrides = {}
messages = [{"role": "user", "content": "hi"}]
kwargs = agent._build_api_kwargs(messages)
assert "service_tier" not in kwargs
class TestBuildApiKwargsAIGateway:
def test_uses_chat_completions_format(self, monkeypatch):
agent = _make_agent(monkeypatch, "ai-gateway", base_url="https://ai-gateway.vercel.sh/v1")