Compare commits

...

1 Commits

Author SHA1 Message Date
teknium1
4b00561590 feat: add optional smart model routing
Add a conservative cheap-vs-strong routing option that can send very short/simple turns to a cheaper model across providers while keeping the primary model for complex work. Wire it through CLI, gateway, and cron, and document the config.yaml workflow.
2026-03-16 06:39:03 -07:00
9 changed files with 505 additions and 29 deletions

View File

@@ -0,0 +1,184 @@
"""Helpers for optional cheap-vs-strong model routing."""
from __future__ import annotations
import os
import re
from typing import Any, Dict, Optional
_COMPLEX_KEYWORDS = {
"debug",
"debugging",
"implement",
"implementation",
"refactor",
"patch",
"traceback",
"stacktrace",
"exception",
"error",
"analyze",
"analysis",
"investigate",
"architecture",
"design",
"compare",
"benchmark",
"optimize",
"optimise",
"review",
"terminal",
"shell",
"tool",
"tools",
"pytest",
"test",
"tests",
"plan",
"planning",
"delegate",
"subagent",
"cron",
"docker",
"kubernetes",
}
_URL_RE = re.compile(r"https?://|www\.", re.IGNORECASE)
def _coerce_bool(value: Any, default: bool = False) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.strip().lower() in {"1", "true", "yes", "on"}
return bool(value)
def _coerce_int(value: Any, default: int) -> int:
try:
return int(value)
except (TypeError, ValueError):
return default
def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
"""Return the configured cheap-model route when a message looks simple.
Conservative by design: if the message has signs of code/tool/debugging/
long-form work, keep the primary model.
"""
cfg = routing_config or {}
if not _coerce_bool(cfg.get("enabled"), False):
return None
cheap_model = cfg.get("cheap_model") or {}
if not isinstance(cheap_model, dict):
return None
provider = str(cheap_model.get("provider") or "").strip().lower()
model = str(cheap_model.get("model") or "").strip()
if not provider or not model:
return None
text = (user_message or "").strip()
if not text:
return None
max_chars = _coerce_int(cfg.get("max_simple_chars"), 160)
max_words = _coerce_int(cfg.get("max_simple_words"), 28)
if len(text) > max_chars:
return None
if len(text.split()) > max_words:
return None
if text.count("\n") > 1:
return None
if "```" in text or "`" in text:
return None
if _URL_RE.search(text):
return None
lowered = text.lower()
words = {token.strip(".,:;!?()[]{}\"'`") for token in lowered.split()}
if words & _COMPLEX_KEYWORDS:
return None
route = dict(cheap_model)
route["provider"] = provider
route["model"] = model
route["routing_reason"] = "simple_turn"
return route
def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:
"""Resolve the effective model/runtime for one turn.
Returns a dict with model/runtime/signature/label fields.
"""
route = choose_cheap_model_route(user_message, routing_config)
if not route:
return {
"model": primary.get("model"),
"runtime": {
"api_key": primary.get("api_key"),
"base_url": primary.get("base_url"),
"provider": primary.get("provider"),
"api_mode": primary.get("api_mode"),
},
"label": None,
"signature": (
primary.get("model"),
primary.get("provider"),
primary.get("base_url"),
primary.get("api_mode"),
),
}
from hermes_cli.runtime_provider import resolve_runtime_provider
explicit_api_key = None
api_key_env = str(route.get("api_key_env") or "").strip()
if api_key_env:
explicit_api_key = os.getenv(api_key_env) or None
try:
runtime = resolve_runtime_provider(
requested=route.get("provider"),
explicit_api_key=explicit_api_key,
explicit_base_url=route.get("base_url"),
)
except Exception:
return {
"model": primary.get("model"),
"runtime": {
"api_key": primary.get("api_key"),
"base_url": primary.get("base_url"),
"provider": primary.get("provider"),
"api_mode": primary.get("api_mode"),
},
"label": None,
"signature": (
primary.get("model"),
primary.get("provider"),
primary.get("base_url"),
primary.get("api_mode"),
),
}
return {
"model": route.get("model"),
"runtime": {
"api_key": runtime.get("api_key"),
"base_url": runtime.get("base_url"),
"provider": runtime.get("provider"),
"api_mode": runtime.get("api_mode"),
},
"label": f"smart route → {route.get('model')} ({runtime.get('provider')})",
"signature": (
route.get("model"),
runtime.get("provider"),
runtime.get("base_url"),
runtime.get("api_mode"),
),
}

View File

@@ -51,6 +51,20 @@ model:
# # Data policy: "allow" (default) or "deny" to exclude providers that may store data
# # data_collection: "deny"
# =============================================================================
# Smart Model Routing (optional)
# =============================================================================
# Use a cheaper model for short/simple turns while keeping your main model for
# more complex requests. Disabled by default.
#
# smart_model_routing:
# enabled: true
# max_simple_chars: 160
# max_simple_words: 28
# cheap_model:
# provider: openrouter
# model: google/gemini-2.5-flash
# =============================================================================
# Git Worktree Isolation
# =============================================================================

98
cli.py
View File

@@ -176,6 +176,12 @@ def load_cli_config() -> Dict[str, Any]:
"threshold": 0.50, # Compress at 50% of model's context limit
"summary_model": "google/gemini-3-flash-preview", # Fast/cheap model for summaries
},
"smart_model_routing": {
"enabled": False,
"max_simple_chars": 160,
"max_simple_words": 28,
"cheap_model": {},
},
"agent": {
"max_turns": 90, # Default max tool-calling iterations (shared with subagents)
"verbose": False,
@@ -1117,6 +1123,10 @@ class HermesCLI:
fb = CLI_CONFIG.get("fallback_model") or {}
self._fallback_model = fb if fb.get("provider") and fb.get("model") else None
# Optional cheap-vs-strong routing for simple turns
self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
self._active_agent_route_signature = None
# Agent will be initialized on first use
self.agent: Optional[AIAgent] = None
self._app = None # prompt_toolkit Application (set in run())
@@ -1496,10 +1506,27 @@ class HermesCLI:
# routing, or the effective model changed.
if (credentials_changed or routing_changed or model_changed) and self.agent is not None:
self.agent = None
self._active_agent_route_signature = None
return True
def _init_agent(self) -> bool:
def _resolve_turn_agent_config(self, user_message: str) -> dict:
"""Resolve model/runtime overrides for a single user turn."""
from agent.smart_model_routing import resolve_turn_route
return resolve_turn_route(
user_message,
self._smart_model_routing,
{
"model": self.model,
"api_key": self.api_key,
"base_url": self.base_url,
"provider": self.provider,
"api_mode": self.api_mode,
},
)
def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None) -> bool:
"""
Initialize the agent on first use.
When resuming a session, restores conversation history from SQLite.
@@ -1559,12 +1586,19 @@ class HermesCLI:
pass
try:
runtime = runtime_override or {
"api_key": self.api_key,
"base_url": self.base_url,
"provider": self.provider,
"api_mode": self.api_mode,
}
effective_model = model_override or self.model
self.agent = AIAgent(
model=self.model,
api_key=self.api_key,
base_url=self.base_url,
provider=self.provider,
api_mode=self.api_mode,
model=effective_model,
api_key=runtime.get("api_key"),
base_url=runtime.get("base_url"),
provider=runtime.get("provider"),
api_mode=runtime.get("api_mode"),
max_iterations=self.max_turns,
enabled_toolsets=self.enabled_toolsets,
verbose_logging=self.verbose,
@@ -1591,7 +1625,13 @@ class HermesCLI:
pass_session_id=self.pass_session_id,
tool_progress_callback=self._on_tool_progress,
)
# Apply any pending title now that the session exists in the DB
self._active_agent_route_signature = (
effective_model,
runtime.get("provider"),
runtime.get("base_url"),
runtime.get("api_mode"),
)
if self._pending_title and self._session_db:
try:
self._session_db.set_session_title(self.session_id, self._pending_title)
@@ -3369,14 +3409,16 @@ class HermesCLI:
_cprint(f" Task ID: {task_id}")
_cprint(f" You can continue chatting — results will appear when done.\n")
turn_route = self._resolve_turn_agent_config(prompt)
def run_background():
try:
bg_agent = AIAgent(
model=self.model,
api_key=self.api_key,
base_url=self.base_url,
provider=self.provider,
api_mode=self.api_mode,
model=turn_route["model"],
api_key=turn_route["runtime"].get("api_key"),
base_url=turn_route["runtime"].get("base_url"),
provider=turn_route["runtime"].get("provider"),
api_mode=turn_route["runtime"].get("api_mode"),
max_iterations=self.max_turns,
enabled_toolsets=self.enabled_toolsets,
quiet_mode=True,
@@ -4596,8 +4638,16 @@ class HermesCLI:
if not self._ensure_runtime_credentials():
return None
turn_route = self._resolve_turn_agent_config(message)
if turn_route["signature"] != self._active_agent_route_signature:
self.agent = None
# Initialize agent if needed
if not self._init_agent():
if not self._init_agent(
model_override=turn_route["model"],
runtime_override=turn_route["runtime"],
route_label=turn_route["label"],
):
return None
# Pre-process images through the vision tool (Gemini Flash) so the
@@ -6326,13 +6376,21 @@ def main(
# Quiet mode: suppress banner, spinner, tool previews.
# Only print the final response and parseable session info.
cli.tool_progress_mode = "off"
if cli._init_agent():
cli.agent.quiet_mode = True
result = cli.agent.run_conversation(query)
response = result.get("final_response", "") if isinstance(result, dict) else str(result)
if response:
print(response)
print(f"\nsession_id: {cli.session_id}")
if cli._ensure_runtime_credentials():
turn_route = cli._resolve_turn_agent_config(query)
if turn_route["signature"] != cli._active_agent_route_signature:
cli.agent = None
if cli._init_agent(
model_override=turn_route["model"],
runtime_override=turn_route["runtime"],
route_label=turn_route["label"],
):
cli.agent.quiet_mode = True
result = cli.agent.run_conversation(query)
response = result.get("final_response", "") if isinstance(result, dict) else str(result)
if response:
print(response)
print(f"\nsession_id: {cli.session_id}")
else:
cli.show_banner()
cli.console.print(f"[bold blue]Query:[/] {query}")

View File

@@ -315,6 +315,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
# Provider routing
pr = _cfg.get("provider_routing", {})
smart_routing = _cfg.get("smart_model_routing", {}) or {}
from hermes_cli.runtime_provider import (
resolve_runtime_provider,
@@ -331,12 +332,25 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
message = format_runtime_provider_error(exc)
raise RuntimeError(message) from exc
from agent.smart_model_routing import resolve_turn_route
turn_route = resolve_turn_route(
prompt,
smart_routing,
{
"model": model,
"api_key": runtime.get("api_key"),
"base_url": runtime.get("base_url"),
"provider": runtime.get("provider"),
"api_mode": runtime.get("api_mode"),
},
)
agent = AIAgent(
model=model,
api_key=runtime.get("api_key"),
base_url=runtime.get("base_url"),
provider=runtime.get("provider"),
api_mode=runtime.get("api_mode"),
model=turn_route["model"],
api_key=turn_route["runtime"].get("api_key"),
base_url=turn_route["runtime"].get("base_url"),
provider=turn_route["runtime"].get("provider"),
api_mode=turn_route["runtime"].get("api_mode"),
max_iterations=max_iterations,
reasoning_config=reasoning_config,
prefill_messages=prefill_messages,

View File

@@ -318,6 +318,7 @@ class GatewayRunner:
self._show_reasoning = self._load_show_reasoning()
self._provider_routing = self._load_provider_routing()
self._fallback_model = self._load_fallback_model()
self._smart_model_routing = self._load_smart_model_routing()
# Wire process registry into session store for reset protection
from tools.process_registry import process_registry
@@ -587,6 +588,18 @@ class GatewayRunner:
group_sessions_per_user=getattr(config, "group_sessions_per_user", True),
)
def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict:
from agent.smart_model_routing import resolve_turn_route
primary = {
"model": model,
"api_key": runtime_kwargs.get("api_key"),
"base_url": runtime_kwargs.get("base_url"),
"provider": runtime_kwargs.get("provider"),
"api_mode": runtime_kwargs.get("api_mode"),
}
return resolve_turn_route(user_message, getattr(self, "_smart_model_routing", {}), primary)
async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
"""React to a non-retryable adapter failure after startup."""
logger.error(
@@ -789,6 +802,20 @@ class GatewayRunner:
pass
return None
@staticmethod
def _load_smart_model_routing() -> dict:
"""Load optional smart cheap-vs-strong model routing config."""
try:
import yaml as _y
cfg_path = _hermes_home / "config.yaml"
if cfg_path.exists():
with open(cfg_path, encoding="utf-8") as _f:
cfg = _y.safe_load(_f) or {}
return cfg.get("smart_model_routing", {}) or {}
except Exception:
pass
return {}
async def start(self) -> bool:
"""
Start the gateway and all configured platform adapters.
@@ -2894,11 +2921,12 @@ class GatewayRunner:
max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))
reasoning_config = self._load_reasoning_config()
self._reasoning_config = reasoning_config
turn_route = self._resolve_turn_agent_config(prompt, model, runtime_kwargs)
def run_sync():
agent = AIAgent(
model=model,
**runtime_kwargs,
model=turn_route["model"],
**turn_route["runtime"],
max_iterations=max_iterations,
quiet_mode=True,
verbose_logging=False,
@@ -4132,9 +4160,10 @@ class GatewayRunner:
honcho_manager, honcho_config = self._get_or_create_gateway_honcho(session_key)
reasoning_config = self._load_reasoning_config()
self._reasoning_config = reasoning_config
turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)
agent = AIAgent(
model=model,
**runtime_kwargs,
model=turn_route["model"],
**turn_route["runtime"],
max_iterations=max_iterations,
quiet_mode=True,
verbose_logging=False,

View File

@@ -147,6 +147,12 @@ DEFAULT_CONFIG = {
"summary_model": "google/gemini-3-flash-preview",
"summary_provider": "auto",
},
"smart_model_routing": {
"enabled": False,
"max_simple_chars": 160,
"max_simple_words": 28,
"cheap_model": {},
},
# Auxiliary model config — provider:model for each side task.
# Format: provider is the provider name, model is the model slug.
@@ -990,6 +996,19 @@ _FALLBACK_COMMENT = """
# fallback_model:
# provider: openrouter
# model: anthropic/claude-sonnet-4
#
# ── Smart Model Routing ────────────────────────────────────────────────
# Optional cheap-vs-strong routing for simple turns.
# Keeps the primary model for complex work, but can route short/simple
# messages to a cheaper model across providers.
#
# smart_model_routing:
# enabled: true
# max_simple_chars: 160
# max_simple_words: 28
# cheap_model:
# provider: openrouter
# model: google/gemini-2.5-flash
"""
@@ -1020,6 +1039,19 @@ _COMMENTED_SECTIONS = """
# fallback_model:
# provider: openrouter
# model: anthropic/claude-sonnet-4
#
# ── Smart Model Routing ────────────────────────────────────────────────
# Optional cheap-vs-strong routing for simple turns.
# Keeps the primary model for complex work, but can route short/simple
# messages to a cheaper model across providers.
#
# smart_model_routing:
# enabled: true
# max_simple_chars: 160
# max_simple_words: 28
# cheap_model:
# provider: openrouter
# model: google/gemini-2.5-flash
"""

View File

@@ -0,0 +1,61 @@
from agent.smart_model_routing import choose_cheap_model_route
_BASE_CONFIG = {
"enabled": True,
"cheap_model": {
"provider": "openrouter",
"model": "google/gemini-2.5-flash",
},
}
def test_returns_none_when_disabled():
cfg = {**_BASE_CONFIG, "enabled": False}
assert choose_cheap_model_route("what time is it in tokyo?", cfg) is None
def test_routes_short_simple_prompt():
result = choose_cheap_model_route("what time is it in tokyo?", _BASE_CONFIG)
assert result is not None
assert result["provider"] == "openrouter"
assert result["model"] == "google/gemini-2.5-flash"
assert result["routing_reason"] == "simple_turn"
def test_skips_long_prompt():
prompt = "please summarize this carefully " * 20
assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
def test_skips_code_like_prompt():
prompt = "debug this traceback: ```python\nraise ValueError('bad')\n```"
assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
def test_skips_tool_heavy_prompt_keywords():
prompt = "implement a patch for this docker error"
assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
def test_resolve_turn_route_falls_back_to_primary_when_route_runtime_cannot_be_resolved(monkeypatch):
from agent.smart_model_routing import resolve_turn_route
monkeypatch.setattr(
"hermes_cli.runtime_provider.resolve_runtime_provider",
lambda **kwargs: (_ for _ in ()).throw(RuntimeError("bad route")),
)
result = resolve_turn_route(
"what time is it in tokyo?",
_BASE_CONFIG,
{
"model": "anthropic/claude-sonnet-4",
"provider": "openrouter",
"base_url": "https://openrouter.ai/api/v1",
"api_mode": "chat_completions",
"api_key": "sk-primary",
},
)
assert result["model"] == "anthropic/claude-sonnet-4"
assert result["runtime"]["provider"] == "openrouter"
assert result["label"] is None

View File

@@ -162,6 +162,57 @@ def test_runtime_resolution_rebuilds_agent_on_routing_change(monkeypatch):
assert shell.api_mode == "codex_responses"
def test_cli_turn_routing_uses_primary_when_disabled(monkeypatch):
cli = _import_cli()
shell = cli.HermesCLI(model="gpt-5", compact=True, max_turns=1)
shell.provider = "openrouter"
shell.api_mode = "chat_completions"
shell.base_url = "https://openrouter.ai/api/v1"
shell.api_key = "sk-primary"
shell._smart_model_routing = {"enabled": False}
result = shell._resolve_turn_agent_config("what time is it in tokyo?")
assert result["model"] == "gpt-5"
assert result["runtime"]["provider"] == "openrouter"
assert result["label"] is None
def test_cli_turn_routing_uses_cheap_model_when_simple(monkeypatch):
cli = _import_cli()
def _runtime_resolve(**kwargs):
assert kwargs["requested"] == "zai"
return {
"provider": "zai",
"api_mode": "chat_completions",
"base_url": "https://open.z.ai/api/v1",
"api_key": "cheap-key",
"source": "env/config",
}
monkeypatch.setattr("hermes_cli.runtime_provider.resolve_runtime_provider", _runtime_resolve)
shell = cli.HermesCLI(model="anthropic/claude-sonnet-4", compact=True, max_turns=1)
shell.provider = "openrouter"
shell.api_mode = "chat_completions"
shell.base_url = "https://openrouter.ai/api/v1"
shell.api_key = "primary-key"
shell._smart_model_routing = {
"enabled": True,
"cheap_model": {"provider": "zai", "model": "glm-5-air"},
"max_simple_chars": 160,
"max_simple_words": 28,
}
result = shell._resolve_turn_agent_config("what time is it in tokyo?")
assert result["model"] == "glm-5-air"
assert result["runtime"]["provider"] == "zai"
assert result["runtime"]["api_key"] == "cheap-key"
assert result["label"] is not None
def test_cli_prefers_config_provider_over_stale_env_override(monkeypatch):
cli = _import_cli()

View File

@@ -441,6 +441,39 @@ Supported providers: `openrouter`, `nous`, `openai-codex`, `anthropic`, `zai`, `
Fallback is configured exclusively through `config.yaml` — there are no environment variables for it. For full details on when it triggers, supported providers, and how it interacts with auxiliary tasks and delegation, see [Fallback Providers](/docs/user-guide/features/fallback-providers).
:::
## Smart Model Routing
Optional cheap-vs-strong routing lets Hermes keep your main model for complex work while sending very short/simple turns to a cheaper model.
```yaml
smart_model_routing:
enabled: true
max_simple_chars: 160
max_simple_words: 28
cheap_model:
provider: openrouter
model: google/gemini-2.5-flash
# base_url: http://localhost:8000/v1 # optional custom endpoint
# api_key_env: MY_CUSTOM_KEY # optional env var name for that endpoint's API key
```
How it works:
- If a turn is short, single-line, and does not look code/tool/debug heavy, Hermes may route it to `cheap_model`
- If the turn looks complex, Hermes stays on your primary model/provider
- If the cheap route cannot be resolved cleanly, Hermes falls back to the primary model automatically
This is intentionally conservative. It is meant for quick, low-stakes turns like:
- short factual questions
- quick rewrites
- lightweight summaries
It will avoid routing prompts that look like:
- coding/debugging work
- tool-heavy requests
- long or multi-line analysis asks
Use this when you want lower latency or cost without fully changing your default model.
## Terminal Backend Configuration
Configure which environment the agent uses for terminal commands: