mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 15:01:34 +08:00
Compare commits
1 Commits
codex-port
...
fix/smart-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4b00561590 |
184
agent/smart_model_routing.py
Normal file
184
agent/smart_model_routing.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""Helpers for optional cheap-vs-strong model routing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
_COMPLEX_KEYWORDS = {
|
||||
"debug",
|
||||
"debugging",
|
||||
"implement",
|
||||
"implementation",
|
||||
"refactor",
|
||||
"patch",
|
||||
"traceback",
|
||||
"stacktrace",
|
||||
"exception",
|
||||
"error",
|
||||
"analyze",
|
||||
"analysis",
|
||||
"investigate",
|
||||
"architecture",
|
||||
"design",
|
||||
"compare",
|
||||
"benchmark",
|
||||
"optimize",
|
||||
"optimise",
|
||||
"review",
|
||||
"terminal",
|
||||
"shell",
|
||||
"tool",
|
||||
"tools",
|
||||
"pytest",
|
||||
"test",
|
||||
"tests",
|
||||
"plan",
|
||||
"planning",
|
||||
"delegate",
|
||||
"subagent",
|
||||
"cron",
|
||||
"docker",
|
||||
"kubernetes",
|
||||
}
|
||||
|
||||
_URL_RE = re.compile(r"https?://|www\.", re.IGNORECASE)
|
||||
|
||||
|
||||
def _coerce_bool(value: Any, default: bool = False) -> bool:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||
return bool(value)
|
||||
|
||||
|
||||
def _coerce_int(value: Any, default: int) -> int:
|
||||
try:
|
||||
return int(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
|
||||
"""Return the configured cheap-model route when a message looks simple.
|
||||
|
||||
Conservative by design: if the message has signs of code/tool/debugging/
|
||||
long-form work, keep the primary model.
|
||||
"""
|
||||
cfg = routing_config or {}
|
||||
if not _coerce_bool(cfg.get("enabled"), False):
|
||||
return None
|
||||
|
||||
cheap_model = cfg.get("cheap_model") or {}
|
||||
if not isinstance(cheap_model, dict):
|
||||
return None
|
||||
provider = str(cheap_model.get("provider") or "").strip().lower()
|
||||
model = str(cheap_model.get("model") or "").strip()
|
||||
if not provider or not model:
|
||||
return None
|
||||
|
||||
text = (user_message or "").strip()
|
||||
if not text:
|
||||
return None
|
||||
|
||||
max_chars = _coerce_int(cfg.get("max_simple_chars"), 160)
|
||||
max_words = _coerce_int(cfg.get("max_simple_words"), 28)
|
||||
|
||||
if len(text) > max_chars:
|
||||
return None
|
||||
if len(text.split()) > max_words:
|
||||
return None
|
||||
if text.count("\n") > 1:
|
||||
return None
|
||||
if "```" in text or "`" in text:
|
||||
return None
|
||||
if _URL_RE.search(text):
|
||||
return None
|
||||
|
||||
lowered = text.lower()
|
||||
words = {token.strip(".,:;!?()[]{}\"'`") for token in lowered.split()}
|
||||
if words & _COMPLEX_KEYWORDS:
|
||||
return None
|
||||
|
||||
route = dict(cheap_model)
|
||||
route["provider"] = provider
|
||||
route["model"] = model
|
||||
route["routing_reason"] = "simple_turn"
|
||||
return route
|
||||
|
||||
|
||||
def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Resolve the effective model/runtime for one turn.
|
||||
|
||||
Returns a dict with model/runtime/signature/label fields.
|
||||
"""
|
||||
route = choose_cheap_model_route(user_message, routing_config)
|
||||
if not route:
|
||||
return {
|
||||
"model": primary.get("model"),
|
||||
"runtime": {
|
||||
"api_key": primary.get("api_key"),
|
||||
"base_url": primary.get("base_url"),
|
||||
"provider": primary.get("provider"),
|
||||
"api_mode": primary.get("api_mode"),
|
||||
},
|
||||
"label": None,
|
||||
"signature": (
|
||||
primary.get("model"),
|
||||
primary.get("provider"),
|
||||
primary.get("base_url"),
|
||||
primary.get("api_mode"),
|
||||
),
|
||||
}
|
||||
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider
|
||||
|
||||
explicit_api_key = None
|
||||
api_key_env = str(route.get("api_key_env") or "").strip()
|
||||
if api_key_env:
|
||||
explicit_api_key = os.getenv(api_key_env) or None
|
||||
|
||||
try:
|
||||
runtime = resolve_runtime_provider(
|
||||
requested=route.get("provider"),
|
||||
explicit_api_key=explicit_api_key,
|
||||
explicit_base_url=route.get("base_url"),
|
||||
)
|
||||
except Exception:
|
||||
return {
|
||||
"model": primary.get("model"),
|
||||
"runtime": {
|
||||
"api_key": primary.get("api_key"),
|
||||
"base_url": primary.get("base_url"),
|
||||
"provider": primary.get("provider"),
|
||||
"api_mode": primary.get("api_mode"),
|
||||
},
|
||||
"label": None,
|
||||
"signature": (
|
||||
primary.get("model"),
|
||||
primary.get("provider"),
|
||||
primary.get("base_url"),
|
||||
primary.get("api_mode"),
|
||||
),
|
||||
}
|
||||
|
||||
return {
|
||||
"model": route.get("model"),
|
||||
"runtime": {
|
||||
"api_key": runtime.get("api_key"),
|
||||
"base_url": runtime.get("base_url"),
|
||||
"provider": runtime.get("provider"),
|
||||
"api_mode": runtime.get("api_mode"),
|
||||
},
|
||||
"label": f"smart route → {route.get('model')} ({runtime.get('provider')})",
|
||||
"signature": (
|
||||
route.get("model"),
|
||||
runtime.get("provider"),
|
||||
runtime.get("base_url"),
|
||||
runtime.get("api_mode"),
|
||||
),
|
||||
}
|
||||
@@ -51,6 +51,20 @@ model:
|
||||
# # Data policy: "allow" (default) or "deny" to exclude providers that may store data
|
||||
# # data_collection: "deny"
|
||||
|
||||
# =============================================================================
|
||||
# Smart Model Routing (optional)
|
||||
# =============================================================================
|
||||
# Use a cheaper model for short/simple turns while keeping your main model for
|
||||
# more complex requests. Disabled by default.
|
||||
#
|
||||
# smart_model_routing:
|
||||
# enabled: true
|
||||
# max_simple_chars: 160
|
||||
# max_simple_words: 28
|
||||
# cheap_model:
|
||||
# provider: openrouter
|
||||
# model: google/gemini-2.5-flash
|
||||
|
||||
# =============================================================================
|
||||
# Git Worktree Isolation
|
||||
# =============================================================================
|
||||
|
||||
98
cli.py
98
cli.py
@@ -176,6 +176,12 @@ def load_cli_config() -> Dict[str, Any]:
|
||||
"threshold": 0.50, # Compress at 50% of model's context limit
|
||||
"summary_model": "google/gemini-3-flash-preview", # Fast/cheap model for summaries
|
||||
},
|
||||
"smart_model_routing": {
|
||||
"enabled": False,
|
||||
"max_simple_chars": 160,
|
||||
"max_simple_words": 28,
|
||||
"cheap_model": {},
|
||||
},
|
||||
"agent": {
|
||||
"max_turns": 90, # Default max tool-calling iterations (shared with subagents)
|
||||
"verbose": False,
|
||||
@@ -1117,6 +1123,10 @@ class HermesCLI:
|
||||
fb = CLI_CONFIG.get("fallback_model") or {}
|
||||
self._fallback_model = fb if fb.get("provider") and fb.get("model") else None
|
||||
|
||||
# Optional cheap-vs-strong routing for simple turns
|
||||
self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
|
||||
self._active_agent_route_signature = None
|
||||
|
||||
# Agent will be initialized on first use
|
||||
self.agent: Optional[AIAgent] = None
|
||||
self._app = None # prompt_toolkit Application (set in run())
|
||||
@@ -1496,10 +1506,27 @@ class HermesCLI:
|
||||
# routing, or the effective model changed.
|
||||
if (credentials_changed or routing_changed or model_changed) and self.agent is not None:
|
||||
self.agent = None
|
||||
self._active_agent_route_signature = None
|
||||
|
||||
return True
|
||||
|
||||
def _init_agent(self) -> bool:
|
||||
def _resolve_turn_agent_config(self, user_message: str) -> dict:
|
||||
"""Resolve model/runtime overrides for a single user turn."""
|
||||
from agent.smart_model_routing import resolve_turn_route
|
||||
|
||||
return resolve_turn_route(
|
||||
user_message,
|
||||
self._smart_model_routing,
|
||||
{
|
||||
"model": self.model,
|
||||
"api_key": self.api_key,
|
||||
"base_url": self.base_url,
|
||||
"provider": self.provider,
|
||||
"api_mode": self.api_mode,
|
||||
},
|
||||
)
|
||||
|
||||
def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None) -> bool:
|
||||
"""
|
||||
Initialize the agent on first use.
|
||||
When resuming a session, restores conversation history from SQLite.
|
||||
@@ -1559,12 +1586,19 @@ class HermesCLI:
|
||||
pass
|
||||
|
||||
try:
|
||||
runtime = runtime_override or {
|
||||
"api_key": self.api_key,
|
||||
"base_url": self.base_url,
|
||||
"provider": self.provider,
|
||||
"api_mode": self.api_mode,
|
||||
}
|
||||
effective_model = model_override or self.model
|
||||
self.agent = AIAgent(
|
||||
model=self.model,
|
||||
api_key=self.api_key,
|
||||
base_url=self.base_url,
|
||||
provider=self.provider,
|
||||
api_mode=self.api_mode,
|
||||
model=effective_model,
|
||||
api_key=runtime.get("api_key"),
|
||||
base_url=runtime.get("base_url"),
|
||||
provider=runtime.get("provider"),
|
||||
api_mode=runtime.get("api_mode"),
|
||||
max_iterations=self.max_turns,
|
||||
enabled_toolsets=self.enabled_toolsets,
|
||||
verbose_logging=self.verbose,
|
||||
@@ -1591,7 +1625,13 @@ class HermesCLI:
|
||||
pass_session_id=self.pass_session_id,
|
||||
tool_progress_callback=self._on_tool_progress,
|
||||
)
|
||||
# Apply any pending title now that the session exists in the DB
|
||||
self._active_agent_route_signature = (
|
||||
effective_model,
|
||||
runtime.get("provider"),
|
||||
runtime.get("base_url"),
|
||||
runtime.get("api_mode"),
|
||||
)
|
||||
|
||||
if self._pending_title and self._session_db:
|
||||
try:
|
||||
self._session_db.set_session_title(self.session_id, self._pending_title)
|
||||
@@ -3369,14 +3409,16 @@ class HermesCLI:
|
||||
_cprint(f" Task ID: {task_id}")
|
||||
_cprint(f" You can continue chatting — results will appear when done.\n")
|
||||
|
||||
turn_route = self._resolve_turn_agent_config(prompt)
|
||||
|
||||
def run_background():
|
||||
try:
|
||||
bg_agent = AIAgent(
|
||||
model=self.model,
|
||||
api_key=self.api_key,
|
||||
base_url=self.base_url,
|
||||
provider=self.provider,
|
||||
api_mode=self.api_mode,
|
||||
model=turn_route["model"],
|
||||
api_key=turn_route["runtime"].get("api_key"),
|
||||
base_url=turn_route["runtime"].get("base_url"),
|
||||
provider=turn_route["runtime"].get("provider"),
|
||||
api_mode=turn_route["runtime"].get("api_mode"),
|
||||
max_iterations=self.max_turns,
|
||||
enabled_toolsets=self.enabled_toolsets,
|
||||
quiet_mode=True,
|
||||
@@ -4596,8 +4638,16 @@ class HermesCLI:
|
||||
if not self._ensure_runtime_credentials():
|
||||
return None
|
||||
|
||||
turn_route = self._resolve_turn_agent_config(message)
|
||||
if turn_route["signature"] != self._active_agent_route_signature:
|
||||
self.agent = None
|
||||
|
||||
# Initialize agent if needed
|
||||
if not self._init_agent():
|
||||
if not self._init_agent(
|
||||
model_override=turn_route["model"],
|
||||
runtime_override=turn_route["runtime"],
|
||||
route_label=turn_route["label"],
|
||||
):
|
||||
return None
|
||||
|
||||
# Pre-process images through the vision tool (Gemini Flash) so the
|
||||
@@ -6326,13 +6376,21 @@ def main(
|
||||
# Quiet mode: suppress banner, spinner, tool previews.
|
||||
# Only print the final response and parseable session info.
|
||||
cli.tool_progress_mode = "off"
|
||||
if cli._init_agent():
|
||||
cli.agent.quiet_mode = True
|
||||
result = cli.agent.run_conversation(query)
|
||||
response = result.get("final_response", "") if isinstance(result, dict) else str(result)
|
||||
if response:
|
||||
print(response)
|
||||
print(f"\nsession_id: {cli.session_id}")
|
||||
if cli._ensure_runtime_credentials():
|
||||
turn_route = cli._resolve_turn_agent_config(query)
|
||||
if turn_route["signature"] != cli._active_agent_route_signature:
|
||||
cli.agent = None
|
||||
if cli._init_agent(
|
||||
model_override=turn_route["model"],
|
||||
runtime_override=turn_route["runtime"],
|
||||
route_label=turn_route["label"],
|
||||
):
|
||||
cli.agent.quiet_mode = True
|
||||
result = cli.agent.run_conversation(query)
|
||||
response = result.get("final_response", "") if isinstance(result, dict) else str(result)
|
||||
if response:
|
||||
print(response)
|
||||
print(f"\nsession_id: {cli.session_id}")
|
||||
else:
|
||||
cli.show_banner()
|
||||
cli.console.print(f"[bold blue]Query:[/] {query}")
|
||||
|
||||
@@ -315,6 +315,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
|
||||
# Provider routing
|
||||
pr = _cfg.get("provider_routing", {})
|
||||
smart_routing = _cfg.get("smart_model_routing", {}) or {}
|
||||
|
||||
from hermes_cli.runtime_provider import (
|
||||
resolve_runtime_provider,
|
||||
@@ -331,12 +332,25 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
message = format_runtime_provider_error(exc)
|
||||
raise RuntimeError(message) from exc
|
||||
|
||||
from agent.smart_model_routing import resolve_turn_route
|
||||
turn_route = resolve_turn_route(
|
||||
prompt,
|
||||
smart_routing,
|
||||
{
|
||||
"model": model,
|
||||
"api_key": runtime.get("api_key"),
|
||||
"base_url": runtime.get("base_url"),
|
||||
"provider": runtime.get("provider"),
|
||||
"api_mode": runtime.get("api_mode"),
|
||||
},
|
||||
)
|
||||
|
||||
agent = AIAgent(
|
||||
model=model,
|
||||
api_key=runtime.get("api_key"),
|
||||
base_url=runtime.get("base_url"),
|
||||
provider=runtime.get("provider"),
|
||||
api_mode=runtime.get("api_mode"),
|
||||
model=turn_route["model"],
|
||||
api_key=turn_route["runtime"].get("api_key"),
|
||||
base_url=turn_route["runtime"].get("base_url"),
|
||||
provider=turn_route["runtime"].get("provider"),
|
||||
api_mode=turn_route["runtime"].get("api_mode"),
|
||||
max_iterations=max_iterations,
|
||||
reasoning_config=reasoning_config,
|
||||
prefill_messages=prefill_messages,
|
||||
|
||||
@@ -318,6 +318,7 @@ class GatewayRunner:
|
||||
self._show_reasoning = self._load_show_reasoning()
|
||||
self._provider_routing = self._load_provider_routing()
|
||||
self._fallback_model = self._load_fallback_model()
|
||||
self._smart_model_routing = self._load_smart_model_routing()
|
||||
|
||||
# Wire process registry into session store for reset protection
|
||||
from tools.process_registry import process_registry
|
||||
@@ -587,6 +588,18 @@ class GatewayRunner:
|
||||
group_sessions_per_user=getattr(config, "group_sessions_per_user", True),
|
||||
)
|
||||
|
||||
def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict:
|
||||
from agent.smart_model_routing import resolve_turn_route
|
||||
|
||||
primary = {
|
||||
"model": model,
|
||||
"api_key": runtime_kwargs.get("api_key"),
|
||||
"base_url": runtime_kwargs.get("base_url"),
|
||||
"provider": runtime_kwargs.get("provider"),
|
||||
"api_mode": runtime_kwargs.get("api_mode"),
|
||||
}
|
||||
return resolve_turn_route(user_message, getattr(self, "_smart_model_routing", {}), primary)
|
||||
|
||||
async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
|
||||
"""React to a non-retryable adapter failure after startup."""
|
||||
logger.error(
|
||||
@@ -789,6 +802,20 @@ class GatewayRunner:
|
||||
pass
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _load_smart_model_routing() -> dict:
|
||||
"""Load optional smart cheap-vs-strong model routing config."""
|
||||
try:
|
||||
import yaml as _y
|
||||
cfg_path = _hermes_home / "config.yaml"
|
||||
if cfg_path.exists():
|
||||
with open(cfg_path, encoding="utf-8") as _f:
|
||||
cfg = _y.safe_load(_f) or {}
|
||||
return cfg.get("smart_model_routing", {}) or {}
|
||||
except Exception:
|
||||
pass
|
||||
return {}
|
||||
|
||||
async def start(self) -> bool:
|
||||
"""
|
||||
Start the gateway and all configured platform adapters.
|
||||
@@ -2894,11 +2921,12 @@ class GatewayRunner:
|
||||
max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))
|
||||
reasoning_config = self._load_reasoning_config()
|
||||
self._reasoning_config = reasoning_config
|
||||
turn_route = self._resolve_turn_agent_config(prompt, model, runtime_kwargs)
|
||||
|
||||
def run_sync():
|
||||
agent = AIAgent(
|
||||
model=model,
|
||||
**runtime_kwargs,
|
||||
model=turn_route["model"],
|
||||
**turn_route["runtime"],
|
||||
max_iterations=max_iterations,
|
||||
quiet_mode=True,
|
||||
verbose_logging=False,
|
||||
@@ -4132,9 +4160,10 @@ class GatewayRunner:
|
||||
honcho_manager, honcho_config = self._get_or_create_gateway_honcho(session_key)
|
||||
reasoning_config = self._load_reasoning_config()
|
||||
self._reasoning_config = reasoning_config
|
||||
turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)
|
||||
agent = AIAgent(
|
||||
model=model,
|
||||
**runtime_kwargs,
|
||||
model=turn_route["model"],
|
||||
**turn_route["runtime"],
|
||||
max_iterations=max_iterations,
|
||||
quiet_mode=True,
|
||||
verbose_logging=False,
|
||||
|
||||
@@ -147,6 +147,12 @@ DEFAULT_CONFIG = {
|
||||
"summary_model": "google/gemini-3-flash-preview",
|
||||
"summary_provider": "auto",
|
||||
},
|
||||
"smart_model_routing": {
|
||||
"enabled": False,
|
||||
"max_simple_chars": 160,
|
||||
"max_simple_words": 28,
|
||||
"cheap_model": {},
|
||||
},
|
||||
|
||||
# Auxiliary model config — provider:model for each side task.
|
||||
# Format: provider is the provider name, model is the model slug.
|
||||
@@ -990,6 +996,19 @@ _FALLBACK_COMMENT = """
|
||||
# fallback_model:
|
||||
# provider: openrouter
|
||||
# model: anthropic/claude-sonnet-4
|
||||
#
|
||||
# ── Smart Model Routing ────────────────────────────────────────────────
|
||||
# Optional cheap-vs-strong routing for simple turns.
|
||||
# Keeps the primary model for complex work, but can route short/simple
|
||||
# messages to a cheaper model across providers.
|
||||
#
|
||||
# smart_model_routing:
|
||||
# enabled: true
|
||||
# max_simple_chars: 160
|
||||
# max_simple_words: 28
|
||||
# cheap_model:
|
||||
# provider: openrouter
|
||||
# model: google/gemini-2.5-flash
|
||||
"""
|
||||
|
||||
|
||||
@@ -1020,6 +1039,19 @@ _COMMENTED_SECTIONS = """
|
||||
# fallback_model:
|
||||
# provider: openrouter
|
||||
# model: anthropic/claude-sonnet-4
|
||||
#
|
||||
# ── Smart Model Routing ────────────────────────────────────────────────
|
||||
# Optional cheap-vs-strong routing for simple turns.
|
||||
# Keeps the primary model for complex work, but can route short/simple
|
||||
# messages to a cheaper model across providers.
|
||||
#
|
||||
# smart_model_routing:
|
||||
# enabled: true
|
||||
# max_simple_chars: 160
|
||||
# max_simple_words: 28
|
||||
# cheap_model:
|
||||
# provider: openrouter
|
||||
# model: google/gemini-2.5-flash
|
||||
"""
|
||||
|
||||
|
||||
|
||||
61
tests/agent/test_smart_model_routing.py
Normal file
61
tests/agent/test_smart_model_routing.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from agent.smart_model_routing import choose_cheap_model_route
|
||||
|
||||
|
||||
_BASE_CONFIG = {
|
||||
"enabled": True,
|
||||
"cheap_model": {
|
||||
"provider": "openrouter",
|
||||
"model": "google/gemini-2.5-flash",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_returns_none_when_disabled():
|
||||
cfg = {**_BASE_CONFIG, "enabled": False}
|
||||
assert choose_cheap_model_route("what time is it in tokyo?", cfg) is None
|
||||
|
||||
|
||||
def test_routes_short_simple_prompt():
|
||||
result = choose_cheap_model_route("what time is it in tokyo?", _BASE_CONFIG)
|
||||
assert result is not None
|
||||
assert result["provider"] == "openrouter"
|
||||
assert result["model"] == "google/gemini-2.5-flash"
|
||||
assert result["routing_reason"] == "simple_turn"
|
||||
|
||||
|
||||
def test_skips_long_prompt():
|
||||
prompt = "please summarize this carefully " * 20
|
||||
assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
|
||||
|
||||
|
||||
def test_skips_code_like_prompt():
|
||||
prompt = "debug this traceback: ```python\nraise ValueError('bad')\n```"
|
||||
assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
|
||||
|
||||
|
||||
def test_skips_tool_heavy_prompt_keywords():
|
||||
prompt = "implement a patch for this docker error"
|
||||
assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
|
||||
|
||||
|
||||
def test_resolve_turn_route_falls_back_to_primary_when_route_runtime_cannot_be_resolved(monkeypatch):
|
||||
from agent.smart_model_routing import resolve_turn_route
|
||||
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.runtime_provider.resolve_runtime_provider",
|
||||
lambda **kwargs: (_ for _ in ()).throw(RuntimeError("bad route")),
|
||||
)
|
||||
result = resolve_turn_route(
|
||||
"what time is it in tokyo?",
|
||||
_BASE_CONFIG,
|
||||
{
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"provider": "openrouter",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"api_mode": "chat_completions",
|
||||
"api_key": "sk-primary",
|
||||
},
|
||||
)
|
||||
assert result["model"] == "anthropic/claude-sonnet-4"
|
||||
assert result["runtime"]["provider"] == "openrouter"
|
||||
assert result["label"] is None
|
||||
@@ -162,6 +162,57 @@ def test_runtime_resolution_rebuilds_agent_on_routing_change(monkeypatch):
|
||||
assert shell.api_mode == "codex_responses"
|
||||
|
||||
|
||||
def test_cli_turn_routing_uses_primary_when_disabled(monkeypatch):
|
||||
cli = _import_cli()
|
||||
shell = cli.HermesCLI(model="gpt-5", compact=True, max_turns=1)
|
||||
shell.provider = "openrouter"
|
||||
shell.api_mode = "chat_completions"
|
||||
shell.base_url = "https://openrouter.ai/api/v1"
|
||||
shell.api_key = "sk-primary"
|
||||
shell._smart_model_routing = {"enabled": False}
|
||||
|
||||
result = shell._resolve_turn_agent_config("what time is it in tokyo?")
|
||||
|
||||
assert result["model"] == "gpt-5"
|
||||
assert result["runtime"]["provider"] == "openrouter"
|
||||
assert result["label"] is None
|
||||
|
||||
|
||||
def test_cli_turn_routing_uses_cheap_model_when_simple(monkeypatch):
|
||||
cli = _import_cli()
|
||||
|
||||
def _runtime_resolve(**kwargs):
|
||||
assert kwargs["requested"] == "zai"
|
||||
return {
|
||||
"provider": "zai",
|
||||
"api_mode": "chat_completions",
|
||||
"base_url": "https://open.z.ai/api/v1",
|
||||
"api_key": "cheap-key",
|
||||
"source": "env/config",
|
||||
}
|
||||
|
||||
monkeypatch.setattr("hermes_cli.runtime_provider.resolve_runtime_provider", _runtime_resolve)
|
||||
|
||||
shell = cli.HermesCLI(model="anthropic/claude-sonnet-4", compact=True, max_turns=1)
|
||||
shell.provider = "openrouter"
|
||||
shell.api_mode = "chat_completions"
|
||||
shell.base_url = "https://openrouter.ai/api/v1"
|
||||
shell.api_key = "primary-key"
|
||||
shell._smart_model_routing = {
|
||||
"enabled": True,
|
||||
"cheap_model": {"provider": "zai", "model": "glm-5-air"},
|
||||
"max_simple_chars": 160,
|
||||
"max_simple_words": 28,
|
||||
}
|
||||
|
||||
result = shell._resolve_turn_agent_config("what time is it in tokyo?")
|
||||
|
||||
assert result["model"] == "glm-5-air"
|
||||
assert result["runtime"]["provider"] == "zai"
|
||||
assert result["runtime"]["api_key"] == "cheap-key"
|
||||
assert result["label"] is not None
|
||||
|
||||
|
||||
def test_cli_prefers_config_provider_over_stale_env_override(monkeypatch):
|
||||
cli = _import_cli()
|
||||
|
||||
|
||||
@@ -441,6 +441,39 @@ Supported providers: `openrouter`, `nous`, `openai-codex`, `anthropic`, `zai`, `
|
||||
Fallback is configured exclusively through `config.yaml` — there are no environment variables for it. For full details on when it triggers, supported providers, and how it interacts with auxiliary tasks and delegation, see [Fallback Providers](/docs/user-guide/features/fallback-providers).
|
||||
:::
|
||||
|
||||
## Smart Model Routing
|
||||
|
||||
Optional cheap-vs-strong routing lets Hermes keep your main model for complex work while sending very short/simple turns to a cheaper model.
|
||||
|
||||
```yaml
|
||||
smart_model_routing:
|
||||
enabled: true
|
||||
max_simple_chars: 160
|
||||
max_simple_words: 28
|
||||
cheap_model:
|
||||
provider: openrouter
|
||||
model: google/gemini-2.5-flash
|
||||
# base_url: http://localhost:8000/v1 # optional custom endpoint
|
||||
# api_key_env: MY_CUSTOM_KEY # optional env var name for that endpoint's API key
|
||||
```
|
||||
|
||||
How it works:
|
||||
- If a turn is short, single-line, and does not look code/tool/debug heavy, Hermes may route it to `cheap_model`
|
||||
- If the turn looks complex, Hermes stays on your primary model/provider
|
||||
- If the cheap route cannot be resolved cleanly, Hermes falls back to the primary model automatically
|
||||
|
||||
This is intentionally conservative. It is meant for quick, low-stakes turns like:
|
||||
- short factual questions
|
||||
- quick rewrites
|
||||
- lightweight summaries
|
||||
|
||||
It will avoid routing prompts that look like:
|
||||
- coding/debugging work
|
||||
- tool-heavy requests
|
||||
- long or multi-line analysis asks
|
||||
|
||||
Use this when you want lower latency or cost without fully changing your default model.
|
||||
|
||||
## Terminal Backend Configuration
|
||||
|
||||
Configure which environment the agent uses for terminal commands:
|
||||
|
||||
Reference in New Issue
Block a user