Compare commits

...

1 Commit

Author: Teknium
SHA1: 7d001a2da2
Date: 2026-04-09 22:06:08 -07:00

feat: expand /fast to all OpenAI Priority Processing models
Previously /fast only supported gpt-5.4 and forced a provider switch to
openai-codex. Now supports all 13 models from OpenAI's Priority Processing
pricing table (gpt-5.4, gpt-5.4-mini, gpt-5.2, gpt-5.1, gpt-5, gpt-5-mini,
gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4o, gpt-4o-mini, o3, o4-mini).

Key changes:
- Replaced _FAST_MODE_BACKEND_CONFIG with _PRIORITY_PROCESSING_MODELS frozenset
- Removed provider-forcing logic — service_tier is now injected into whatever
  API path the user is already on (Codex Responses, Chat Completions, or
  OpenRouter passthrough)
- Added request_overrides support to chat_completions path in run_agent.py
- Updated messaging from 'Codex inference tier' to 'Priority Processing'
- Expanded test coverage for all supported models
6 changed files with 144 additions and 110 deletions
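
For reference, the mechanism behind the whole change is a single request parameter: OpenAI's Chat Completions API accepts a service_tier field, and Priority Processing is selected by setting it to "priority". A minimal sketch against the stock openai SDK, independent of this repo (model and prompt are placeholder values):

    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    # Priority Processing is opted into per request, with no separate
    # endpoint or provider switch, which is what makes the simplification
    # in this commit possible.
    resp = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": "hi"}],
        service_tier="priority",
    )
    print(resp.choices[0].message.content)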

cli.py

@@ -2572,7 +2572,7 @@ class HermesCLI:
     def _resolve_turn_agent_config(self, user_message: str) -> dict:
         """Resolve model/runtime overrides for a single user turn."""
         from agent.smart_model_routing import resolve_turn_route
-        from hermes_cli.models import resolve_fast_mode_runtime
+        from hermes_cli.models import resolve_fast_mode_overrides

         route = resolve_turn_route(
             user_message,
@@ -2595,27 +2595,10 @@ class HermesCLI:
             return route
         try:
-            fast_runtime = resolve_fast_mode_runtime(route.get("model"))
+            overrides = resolve_fast_mode_overrides(route.get("model"))
         except Exception:
-            route["request_overrides"] = None
-            return route
-        if not fast_runtime:
-            route["request_overrides"] = None
-            return route
-        runtime = fast_runtime["runtime"]
-        route["runtime"] = runtime
-        route["request_overrides"] = fast_runtime["request_overrides"]
-        route["label"] = f"fast route → {route.get('model')} ({runtime.get('provider')})"
-        route["signature"] = (
-            route.get("model"),
-            runtime.get("provider"),
-            runtime.get("base_url"),
-            runtime.get("api_mode"),
-            runtime.get("command"),
-            tuple(runtime.get("args") or ()),
-            json.dumps(route["request_overrides"], sort_keys=True),
-        )
+            overrides = None
+        route["request_overrides"] = overrides
         return route

     def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None, request_overrides: dict | None = None) -> bool:
@@ -5662,15 +5645,15 @@ class HermesCLI:
         _cprint(f" {_GOLD}✓ Reasoning effort set to '{arg}' (session only){_RST}")

     def _handle_fast_command(self, cmd: str):
-        """Handle /fast — choose the Codex Responses service tier."""
+        """Handle /fast — toggle OpenAI Priority Processing (service_tier)."""
         if not self._fast_command_available():
-            _cprint(" (._.) /fast is only available for models that explicitly expose a fast backend.")
+            _cprint(" (._.) /fast is only available for OpenAI models that support Priority Processing.")
             return

         parts = cmd.strip().split(maxsplit=1)
         if len(parts) < 2 or parts[1].strip().lower() == "status":
             status = "fast" if self.service_tier == "priority" else "normal"
-            _cprint(f" {_GOLD}Codex inference tier: {status}{_RST}")
+            _cprint(f" {_GOLD}Priority Processing: {status}{_RST}")
             _cprint(f" {_DIM}Usage: /fast [normal|fast|status]{_RST}")
             return
@@ -5691,9 +5674,9 @@ class HermesCLI:
         self.agent = None  # Force agent re-init with new service-tier config

         if save_config_value("agent.service_tier", saved_value):
-            _cprint(f" {_GOLD}Codex inference tier set to {label} (saved to config){_RST}")
+            _cprint(f" {_GOLD}Priority Processing set to {label} (saved to config){_RST}")
         else:
-            _cprint(f" {_GOLD}Codex inference tier set to {label} (session only){_RST}")
+            _cprint(f" {_GOLD}Priority Processing set to {label} (session only){_RST}")

     def _on_reasoning(self, reasoning_text: str):
         """Callback for intermediate reasoning display during tool-call loops."""


@@ -100,7 +100,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
     CommandDef("reasoning", "Manage reasoning effort and display", "Configuration",
                args_hint="[level|show|hide]",
                subcommands=("none", "minimal", "low", "medium", "high", "xhigh", "show", "hide", "on", "off")),
-    CommandDef("fast", "Choose Codex inference tier (Normal/Fast)", "Configuration",
+    CommandDef("fast", "Toggle OpenAI Priority Processing (Normal/Fast)", "Configuration",
                cli_only=True, args_hint="[normal|fast|status]",
                subcommands=("normal", "fast", "status", "on", "off")),
     CommandDef("skin", "Show or change the display skin/theme", "Configuration",

hermes_cli/models.py

@@ -1017,58 +1017,45 @@ def provider_label(provider: Optional[str]) -> str:
     return _PROVIDER_LABELS.get(normalized, original or "OpenRouter")


-_FAST_MODE_BACKEND_CONFIG: dict[str, dict[str, Any]] = {
-    "gpt-5.4": {
-        "provider": "openai-codex",
-        "request_overrides": {"service_tier": "priority"},
-    },
-}
-
-
-def fast_mode_backend_config(model_id: Optional[str]) -> dict[str, Any] | None:
-    """Return backend config for models that expose Fast mode.
-
-    To expose Fast mode for a new model, add its normalized model slug to
-    ``_FAST_MODE_BACKEND_CONFIG`` along with the backend runtime selection and
-    backend-specific request overrides Hermes should apply.
-    """
-    raw = str(model_id or "").strip().lower()
-    if "/" in raw:
-        raw = raw.split("/", 1)[1]
-    config = _FAST_MODE_BACKEND_CONFIG.get(raw)
-    return dict(config) if config else None
+# Models that support OpenAI Priority Processing (service_tier="priority").
+# See https://openai.com/api-priority-processing/ for the canonical list.
+# Only the bare model slug is stored (no vendor prefix).
+_PRIORITY_PROCESSING_MODELS: frozenset[str] = frozenset({
+    "gpt-5.4",
+    "gpt-5.4-mini",
+    "gpt-5.2",
+    "gpt-5.1",
+    "gpt-5",
+    "gpt-5-mini",
+    "gpt-4.1",
+    "gpt-4.1-mini",
+    "gpt-4.1-nano",
+    "gpt-4o",
+    "gpt-4o-mini",
+    "o3",
+    "o4-mini",
+})


 def model_supports_fast_mode(model_id: Optional[str]) -> bool:
-    """Return whether Hermes should expose Fast mode for the active model."""
-    return fast_mode_backend_config(model_id) is not None
+    """Return whether Hermes should expose the /fast (Priority Processing) toggle."""
+    raw = str(model_id or "").strip().lower()
+    if "/" in raw:
+        raw = raw.split("/", 1)[1]
+    return raw in _PRIORITY_PROCESSING_MODELS


-def resolve_fast_mode_runtime(model_id: Optional[str]) -> dict[str, Any] | None:
-    """Resolve runtime selection and request overrides for a fast-mode model."""
-    cfg = fast_mode_backend_config(model_id)
-    if not cfg:
+def resolve_fast_mode_overrides(model_id: Optional[str]) -> dict[str, Any] | None:
+    """Return request_overrides for Priority Processing, or None if unsupported.
+
+    Unlike the previous ``resolve_fast_mode_runtime``, this does NOT force a
+    provider/backend switch. The ``service_tier`` parameter is injected into
+    whatever API path the user is already on (Codex Responses, Chat Completions,
+    or OpenRouter passthrough).
+    """
+    if not model_supports_fast_mode(model_id):
         return None
-
-    from hermes_cli.runtime_provider import resolve_runtime_provider
-    runtime = resolve_runtime_provider(
-        requested=cfg.get("provider"),
-        explicit_base_url=cfg.get("base_url"),
-        explicit_api_key=cfg.get("api_key"),
-    )
-    return {
-        "runtime": {
-            "api_key": runtime.get("api_key"),
-            "base_url": runtime.get("base_url"),
-            "provider": runtime.get("provider"),
-            "api_mode": runtime.get("api_mode"),
-            "command": runtime.get("command"),
-            "args": list(runtime.get("args") or []),
-            "credential_pool": runtime.get("credential_pool"),
-        },
-        "request_overrides": dict(cfg.get("request_overrides") or {}),
-    }
+    return {"service_tier": "priority"}


 def _resolve_copilot_catalog_api_key() -> str:
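
Expected behavior of the two helpers, mirroring the tests further down (assuming the hermes_cli.models import path used in cli.py):

    from hermes_cli.models import model_supports_fast_mode, resolve_fast_mode_overrides

    model_supports_fast_mode("openai/gpt-4.1")      # True: vendor prefix is stripped
    model_supports_fast_mode("gpt-5.3-codex")       # False: not in the pricing table
    resolve_fast_mode_overrides("o3")               # {"service_tier": "priority"}
    resolve_fast_mode_overrides("claude-sonnet-4")  # None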

run_agent.py

@@ -5686,6 +5686,11 @@ class AIAgent:
         if "x.ai" in self._base_url_lower and hasattr(self, "session_id") and self.session_id:
             api_kwargs["extra_headers"] = {"x-grok-conv-id": self.session_id}

+        # Priority Processing / generic request overrides (e.g. service_tier).
+        # Applied last so overrides win over any defaults set above.
+        if self.request_overrides:
+            api_kwargs.update(self.request_overrides)
+
         return api_kwargs

     def _supports_reasoning_extra_body(self) -> bool:
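
A minimal sketch of why placement matters here: since the overrides are applied last via dict.update(), they win over any default the earlier kwargs-building steps set (the "auto" default below is illustrative, not from run_agent.py):

    api_kwargs = {"model": "gpt-4.1", "service_tier": "auto"}  # hypothetical defaults
    request_overrides = {"service_tier": "priority"}

    if request_overrides:
        api_kwargs.update(request_overrides)  # overrides win on key collisions

    assert api_kwargs["service_tier"] == "priority"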


@@ -108,15 +108,52 @@ class TestHandleFastCommand(unittest.TestCase):
         self.assertTrue(mock_cprint.called)


-class TestFastModeRegistry(unittest.TestCase):
-    def test_only_gpt_5_4_is_enabled_for_codex(self):
-        from hermes_cli.models import fast_mode_backend_config
-
-        assert fast_mode_backend_config("gpt-5.4") == {
-            "provider": "openai-codex",
-            "request_overrides": {"service_tier": "priority"},
-        }
-        assert fast_mode_backend_config("gpt-5.3-codex") is None
+class TestPriorityProcessingModels(unittest.TestCase):
+    """Verify the expanded Priority Processing model registry."""
+
+    def test_all_documented_models_supported(self):
+        from hermes_cli.models import model_supports_fast_mode
+
+        # All models from OpenAI's Priority Processing pricing table
+        supported = [
+            "gpt-5.4", "gpt-5.4-mini", "gpt-5.2",
+            "gpt-5.1", "gpt-5", "gpt-5-mini",
+            "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+            "gpt-4o", "gpt-4o-mini",
+            "o3", "o4-mini",
+        ]
+        for model in supported:
+            assert model_supports_fast_mode(model), f"{model} should support fast mode"
+
+    def test_vendor_prefix_stripped(self):
+        from hermes_cli.models import model_supports_fast_mode
+
+        assert model_supports_fast_mode("openai/gpt-5.4") is True
+        assert model_supports_fast_mode("openai/gpt-4.1") is True
+        assert model_supports_fast_mode("openai/o3") is True
+
+    def test_non_priority_models_rejected(self):
+        from hermes_cli.models import model_supports_fast_mode
+
+        assert model_supports_fast_mode("gpt-5.3-codex") is False
+        assert model_supports_fast_mode("claude-sonnet-4") is False
+        assert model_supports_fast_mode("") is False
+        assert model_supports_fast_mode(None) is False
+
+    def test_resolve_overrides_returns_service_tier(self):
+        from hermes_cli.models import resolve_fast_mode_overrides
+
+        result = resolve_fast_mode_overrides("gpt-5.4")
+        assert result == {"service_tier": "priority"}
+        result = resolve_fast_mode_overrides("gpt-4.1")
+        assert result == {"service_tier": "priority"}
+
+    def test_resolve_overrides_none_for_unsupported(self):
+        from hermes_cli.models import resolve_fast_mode_overrides
+
+        assert resolve_fast_mode_overrides("gpt-5.3-codex") is None
+        assert resolve_fast_mode_overrides("claude-sonnet-4") is None


 class TestFastModeRouting(unittest.TestCase):
@@ -126,7 +163,16 @@ class TestFastModeRouting(unittest.TestCase):
         assert cli_mod.HermesCLI._fast_command_available(stub) is True

-    def test_turn_route_switches_to_model_backend_when_fast_enabled(self):
+    def test_fast_command_exposed_for_non_codex_models(self):
+        cli_mod = _import_cli()
+        stub = SimpleNamespace(provider="openai", requested_provider="openai", model="gpt-4.1", agent=None)
+        assert cli_mod.HermesCLI._fast_command_available(stub) is True
+
+        stub = SimpleNamespace(provider="openrouter", requested_provider="openrouter", model="o3", agent=None)
+        assert cli_mod.HermesCLI._fast_command_available(stub) is True
+
+    def test_turn_route_injects_overrides_without_provider_switch(self):
+        """Fast mode should add request_overrides but NOT change the provider/runtime."""
         cli_mod = _import_cli()
         stub = SimpleNamespace(
             model="gpt-5.4",
@@ -141,35 +187,28 @@ class TestFastModeRouting(unittest.TestCase):
             service_tier="priority",
         )

-        with (
-            patch("agent.smart_model_routing.resolve_turn_route", return_value={
-                "model": "gpt-5.4",
-                "runtime": {
-                    "api_key": "primary-key",
-                    "base_url": "https://openrouter.ai/api/v1",
-                    "provider": "openrouter",
-                    "api_mode": "chat_completions",
-                    "command": None,
-                    "args": [],
-                    "credential_pool": None,
-                },
-                "label": None,
-                "signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()),
-            }),
-            patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value={
-                "provider": "openai-codex",
-                "api_mode": "codex_responses",
-                "base_url": "https://chatgpt.com/backend-api/codex",
-                "api_key": "codex-key",
-                "command": None,
-                "args": [],
-                "credential_pool": None,
-            }),
-        ):
+        original_runtime = {
+            "api_key": "***",
+            "base_url": "https://openrouter.ai/api/v1",
+            "provider": "openrouter",
+            "api_mode": "chat_completions",
+            "command": None,
+            "args": [],
+            "credential_pool": None,
+        }
+
+        with patch("agent.smart_model_routing.resolve_turn_route", return_value={
+            "model": "gpt-5.4",
+            "runtime": dict(original_runtime),
+            "label": None,
+            "signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()),
+        }):
             route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi")

-        assert route["runtime"]["provider"] == "openai-codex"
-        assert route["runtime"]["api_mode"] == "codex_responses"
+        # Provider should NOT have changed
+        assert route["runtime"]["provider"] == "openrouter"
+        assert route["runtime"]["api_mode"] == "chat_completions"
+        # But request_overrides should be set
         assert route["request_overrides"] == {"service_tier": "priority"}

     def test_turn_route_keeps_primary_runtime_when_model_has_no_fast_backend(self):
@@ -190,7 +229,7 @@ class TestFastModeRouting(unittest.TestCase):
         primary_route = {
             "model": "gpt-5.3-codex",
             "runtime": {
-                "api_key": "primary-key",
+                "api_key": "***",
                 "base_url": "https://openrouter.ai/api/v1",
                 "provider": "openrouter",
                 "api_mode": "chat_completions",


@@ -225,6 +225,26 @@ class TestDeveloperRoleSwap:
         assert kwargs["messages"][0]["role"] == "developer"


+class TestBuildApiKwargsChatCompletionsServiceTier:
+    """service_tier via request_overrides works on the chat_completions path."""
+
+    def test_includes_service_tier_via_request_overrides(self, monkeypatch):
+        agent = _make_agent(monkeypatch, "openrouter")
+        agent.model = "gpt-4.1"
+        agent.request_overrides = {"service_tier": "priority"}
+
+        messages = [{"role": "user", "content": "hi"}]
+        kwargs = agent._build_api_kwargs(messages)
+        assert kwargs["service_tier"] == "priority"
+
+    def test_no_service_tier_when_overrides_empty(self, monkeypatch):
+        agent = _make_agent(monkeypatch, "openrouter")
+        agent.model = "gpt-4.1"
+        agent.request_overrides = {}
+
+        messages = [{"role": "user", "content": "hi"}]
+        kwargs = agent._build_api_kwargs(messages)
+        assert "service_tier" not in kwargs
+
+
 class TestBuildApiKwargsAIGateway:
     def test_uses_chat_completions_format(self, monkeypatch):
         agent = _make_agent(monkeypatch, "ai-gateway", base_url="https://ai-gateway.vercel.sh/v1")