mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 20:29:00 +08:00
Compare commits
2 Commits
main
...
bb/model-r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69f4291892 | ||
|
|
57177544ff |
@@ -368,6 +368,71 @@ def _get_continuation_prompt(is_partial_stub: bool, dropped_tools: Optional[List
|
||||
)
|
||||
|
||||
|
||||
def _maybe_apply_session_routing(agent, user_message, conversation_history) -> None:
    """Pick a tier-appropriate model for a brand-new session, at most once.

    The helper is a no-op when ``agent._smart_routing_applied`` is already
    truthy, and it only marks the flag (without routing) when
    ``conversation_history`` is non-empty, i.e. the session is being resumed.
    Otherwise it asks ``agent.model_router`` for a decision and, if one is
    returned, applies it through ``agent.switch_model``.

    NOTE(review): the original author states that ``switch_model`` resets the
    cached system prompt so it is rebuilt for the new model — that is not
    visible from this function; confirm against ``switch_model``.

    Every failure is swallowed (logged) and leaves the agent on its current
    model.

    Args:
        agent: Object exposing ``model``, ``provider``, ``switch_model`` and
            (optionally) ``quiet_mode`` / ``_safe_print``.
        user_message: The opening request, forwarded to the router.
        conversation_history: Prior messages; any truthy value disables
            routing for this agent.
    """
    if getattr(agent, "_smart_routing_applied", False):
        return

    if conversation_history:
        # Resumed conversation: never swap the model underneath existing
        # history, but remember that the routing question has been settled.
        agent._smart_routing_applied = True
        return

    decision = None
    routing_cfg = {}
    try:
        from agent import model_router

        routing_cfg = model_router.get_routing_config()
        wants_routing = routing_cfg.get("enabled") and routing_cfg.get(
            "apply_to_sessions", True
        )
        if wants_routing:
            decision = model_router.route(
                user_message,
                current_model=getattr(agent, "model", "") or "",
                current_provider=getattr(agent, "provider", "") or "",
            )
    except Exception as exc:  # noqa: BLE001
        logger.debug("session routing: classification failed: %s", exc)
        agent._smart_routing_applied = True
        return

    # Settled one way or the other — this agent is never classified again.
    agent._smart_routing_applied = True
    if decision is None:
        return

    try:
        agent.switch_model(
            new_model=decision.model,
            new_provider=decision.provider,
            api_key=decision.api_key or "",
            base_url=decision.base_url or "",
            api_mode=decision.api_mode or "",
        )
    except Exception as exc:  # noqa: BLE001
        logger.warning("session routing: switch_model failed (%s) — staying", exc)
        return

    logger.info(
        "session routing: tier=%s → %s (%s)",
        decision.tier,
        decision.model,
        decision.provider,
    )

    should_announce = routing_cfg.get("announce", True) and not getattr(
        agent, "quiet_mode", False
    )
    if not should_announce:
        return
    try:
        agent._safe_print(
            f"\n🧭 Auto-routed to {decision.model} ({decision.tier} tier)"
        )
    except Exception:  # noqa: BLE001
        # The announcement is purely cosmetic; a broken printer must not
        # undo a successful switch.
        pass
|
||||
|
||||
|
||||
def run_conversation(
|
||||
agent,
|
||||
user_message: str,
|
||||
@@ -396,6 +461,12 @@ def run_conversation(
|
||||
Returns:
|
||||
Dict: Complete conversation result with final response and message history
|
||||
"""
|
||||
# ── Smart model routing (session start, cache-safe) ──
|
||||
# Runs BEFORE build_turn_context so the (model-specific) system prompt is
|
||||
# built for the routed model. No-op unless smart_model_routing.enabled and
|
||||
# this is the first turn of a fresh session. See _maybe_apply_session_routing.
|
||||
_maybe_apply_session_routing(agent, user_message, conversation_history)
|
||||
|
||||
# ── Per-turn setup (the prologue) ──
|
||||
# All once-per-turn setup — stdio guarding, retry-counter resets, user
|
||||
# message sanitization, todo/nudge hydration, system-prompt restore-or-
|
||||
|
||||
329
agent/model_router.py
Normal file
329
agent/model_router.py
Normal file
@@ -0,0 +1,329 @@
|
||||
"""Smart model routing — the cheap "picker" behind ``smart_model_routing``.
|
||||
|
||||
A lightweight classifier labels an incoming request's complexity tier
|
||||
(``light`` / ``standard`` / ``heavy``) and maps it to a tier-appropriate
|
||||
model. This mirrors the Cursor "Auto" idea — right-size the model to the
|
||||
task — while respecting Hermes' sacred per-conversation prompt cache.
|
||||
|
||||
**Nous Portal only.** Routing is a Nous Portal feature: every tier resolves
|
||||
to a model served by the Nous Portal (``provider: nous``), and the router
|
||||
only engages when the active model is itself on Nous Portal. If the current
|
||||
model is on any other provider the router is a strict no-op, so it never
|
||||
silently switches a non-Nous user onto Nous. The Portal already fronts the
|
||||
frontier models across vendors (``anthropic/…``, ``openai/…``,
|
||||
``google/…``, ``x-ai/…``), so a single Nous credential covers every tier.
|
||||
|
||||
The router is consulted ONLY at points where there is no cached prefix to
|
||||
invalidate:
|
||||
|
||||
* at the start of a *fresh* session, before the first API call
|
||||
(:func:`run_conversation` gates on empty ``conversation_history``), and
|
||||
* at each ``delegate_task`` boundary, where subagents get fresh context.
|
||||
|
||||
It never swaps the main model mid-conversation — that is ``/model``'s job
|
||||
and it deliberately resets the cache.
|
||||
|
||||
Everything here fails open: a broken/slow/misconfigured classifier must
|
||||
never wedge a turn. On any failure the caller stays on the current model.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Ordered cheapest/smallest → most capable. Order is load-bearing: the
|
||||
# ``min_tier`` floor and tier comparisons rely on it.
|
||||
TIERS: Tuple[str, ...] = ("light", "standard", "heavy")
|
||||
|
||||
# Smart routing is a Nous Portal feature — every tier resolves through this
|
||||
# provider, and routing only engages when the active model is on it too.
|
||||
NOUS_PROVIDER = "nous"
|
||||
|
||||
|
||||
def _is_nous_provider(provider: str) -> bool:
    """Report whether ``provider`` refers to the Nous Portal.

    The comparison ignores surrounding whitespace and letter case; ``None``
    and the empty string are treated as "not Nous".
    """
    normalized = (provider or "").strip().lower()
    return normalized == NOUS_PROVIDER
|
||||
|
||||
# System prompt sent to the classifier model. One entry per output line; the
# entries are joined with "\n" (no trailing newline).
_CLASSIFIER_SYSTEM_PROMPT = "\n".join(
    (
        "You are a routing classifier for an autonomous AI coding agent. "
        "Read the user's request and label how much model capability it "
        "needs, as exactly one of these tiers:",
        "- light: trivial or quick — simple questions, tiny edits, lookups, "
        "formatting, one-line answers.",
        "- standard: ordinary coding and analysis — implement a function, "
        "explain code, write a normal test, routine debugging.",
        "- heavy: hard or sprawling — multi-file refactors, "
        "architecture/design, subtle debugging, deep multi-step reasoning, "
        "security-sensitive work.",
        "Bias toward the HIGHER tier when unsure; quality matters more than "
        "saving a little money. Respond with ONLY the single tier word, "
        "nothing else.",
    )
)

# Upper bound on how much of the opening request is shown to the classifier.
# Requests can be huge (pasted logs, files); the first ~4k characters are
# taken to carry the intent.
_MAX_CLASSIFY_CHARS = 4000
|
||||
|
||||
|
||||
@dataclass
class RoutingDecision:
    """The outcome of routing: a concrete model plus how to reach it.

    Instances are produced by ``route`` and consumed by callers that switch
    models; the session-start helper forwards ``model``, ``provider``,
    ``api_key``, ``base_url`` and ``api_mode`` to ``switch_model``.
    """

    # Complexity tier the request was assigned ("light"/"standard"/"heavy").
    tier: str
    # Provider the target model runs on.
    provider: str
    # Target model id.
    model: str
    # Resolved endpoint and credentials; any of these may be None.
    base_url: Optional[str]
    api_key: Optional[str]
    api_mode: Optional[str]
    # Short diagnostic describing how the tier was chosen.
    reason: str = ""
|
||||
|
||||
|
||||
def get_routing_config(config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Fetch the ``smart_model_routing`` section of the configuration.

    Args:
        config: A full config mapping. When ``None`` the config is loaded via
            ``hermes_cli.config.load_config``.

    Returns:
        The section itself when it is a dict; otherwise an empty dict (also
        when loading fails or ``config`` is not a dict). Never ``None``.
    """
    if config is None:
        try:
            from hermes_cli.config import load_config

            config = load_config()
        except Exception as exc:  # noqa: BLE001
            logger.debug("model_router: load_config failed: %s", exc)
            return {}

    if not isinstance(config, dict):
        return {}

    section = config.get("smart_model_routing")
    if isinstance(section, dict):
        return section
    return {}
|
||||
|
||||
|
||||
def is_enabled(config: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the routing section's ``enabled`` value is truthy."""
    routing_cfg = get_routing_config(config)
    return bool(routing_cfg.get("enabled"))
|
||||
|
||||
|
||||
def _tier_index(tier: str) -> int:
    """Return the position of ``tier`` in ``TIERS``.

    Unknown tier names are ranked as ``"standard"``.
    """
    known = tier if tier in TIERS else "standard"
    return TIERS.index(known)
|
||||
|
||||
|
||||
def _apply_min_tier_floor(tier: str, routing_cfg: Dict[str, Any]) -> str:
    """Raise ``tier`` to the configured ``min_tier`` when it ranks below it.

    A missing, empty, or unrecognised ``min_tier`` means "no floor" and the
    tier is returned unchanged.
    """
    floor = str(routing_cfg.get("min_tier") or "").strip().lower()
    if floor not in TIERS:
        return tier
    if _tier_index(floor) > _tier_index(tier):
        return floor
    return tier
|
||||
|
||||
|
||||
def _parse_tier(raw: str, default_tier: str) -> str:
    """Extract a tier word from a classifier response, failing open.

    The response is lower-cased and searched for the *earliest* whole-word
    occurrence of any name in ``TIERS``. Whole-word matching keeps words such
    as "highlight" or "lightweight" from being read as the ``light`` tier,
    and taking the earliest match means "heavy, not light" resolves to
    ``heavy`` rather than to whichever tier happens to come first in
    ``TIERS``.

    Args:
        raw: The classifier's text output; ``None``/empty is tolerated.
        default_tier: Returned when no tier word is found.

    Returns:
        The matched tier name, or ``default_tier``.
    """
    import re  # function-scope import keeps this block self-contained

    text = (raw or "").strip().lower()
    if not text:
        return default_tier

    # Alternation of the known tier names, anchored on word boundaries.
    pattern = r"\b(?:" + "|".join(re.escape(name) for name in TIERS) + r")\b"
    found = re.search(pattern, text)
    return found.group(0) if found else default_tier
|
||||
|
||||
|
||||
def classify_complexity(
    message: str,
    *,
    routing_cfg: Optional[Dict[str, Any]] = None,
    timeout: float = 20.0,
) -> Tuple[str, str]:
    """Ask the auxiliary classifier which complexity tier ``message`` needs.

    Args:
        message: The request text; only the first ``_MAX_CLASSIFY_CHARS``
            characters (after stripping) are sent.
        routing_cfg: The ``smart_model_routing`` section; loaded via
            ``get_routing_config`` when ``None``.
        timeout: Passed through to the chat-completion call.

    Returns:
        ``(tier, reason)``. The tier is always a member of ``TIERS``: when the
        message is blank, the auxiliary client is unavailable, or the call
        fails, the configured ``default_tier`` (itself defaulting to
        ``"standard"``) is returned with a diagnostic reason.
    """
    if routing_cfg is None:
        routing_cfg = get_routing_config()

    fallback = str(routing_cfg.get("default_tier") or "standard").strip().lower()
    if fallback not in TIERS:
        fallback = "standard"

    stripped = (message or "").strip()
    if not stripped:
        return fallback, "empty message"

    try:
        from agent.auxiliary_client import (
            get_auxiliary_extra_body,
            get_text_auxiliary_client,
        )
    except Exception as exc:  # noqa: BLE001
        logger.debug("model_router: auxiliary client import failed: %s", exc)
        return fallback, "auxiliary client unavailable"

    try:
        client, model = get_text_auxiliary_client("routing_classifier")
    except Exception as exc:  # noqa: BLE001
        logger.debug("model_router: get_text_auxiliary_client failed: %s", exc)
        return fallback, "auxiliary client unavailable"

    if client is None or not model:
        return fallback, "no auxiliary client configured"

    prompt_messages = [
        {"role": "system", "content": _CLASSIFIER_SYSTEM_PROMPT},
        {"role": "user", "content": stripped[:_MAX_CLASSIFY_CHARS]},
    ]
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=prompt_messages,
            temperature=0,
            max_tokens=16,
            timeout=timeout,
            extra_body=get_auxiliary_extra_body() or None,
        )
    except Exception as exc:  # noqa: BLE001
        error_name = type(exc).__name__
        logger.info(
            "model_router: classifier call failed (%s) — using default tier %r",
            error_name,
            fallback,
        )
        return fallback, f"classifier error: {error_name}"

    # A malformed response object is treated like an empty answer.
    try:
        raw = resp.choices[0].message.content or ""
    except Exception:  # noqa: BLE001
        raw = ""

    tier = _parse_tier(raw, fallback)
    logger.info("model_router: classified tier=%s (raw=%r)", tier, (raw or "")[:40])
    return tier, "classified"
|
||||
|
||||
|
||||
def _tier_model(tier: str, routing_cfg: Dict[str, Any]) -> str:
    """Look up the model id configured for ``tier``.

    ``routing_cfg["tiers"]`` maps tier names to either a model id string or a
    ``{"model": "..."}`` dict; for the dict form only the ``model`` key is
    read, so any other key (such as ``provider``) has no effect.

    Returns:
        The stripped model id, or ``""`` when ``tiers`` is not a dict or the
        tier has no (or an empty) entry.
    """
    tier_map = routing_cfg.get("tiers")
    if not isinstance(tier_map, dict):
        return ""

    target = tier_map.get(tier)
    if isinstance(target, dict):
        target = target.get("model")

    if not target:
        return ""
    return str(target).strip()
|
||||
|
||||
|
||||
def _resolve_tier_credentials(provider: str, model: str) -> Optional[Dict[str, Any]]:
    """Resolve endpoint and credentials for running ``model`` on ``provider``.

    Delegates to ``hermes_cli.runtime_provider.resolve_runtime_provider``.

    Returns:
        A dict with ``provider``, ``model``, ``base_url``, ``api_key`` and
        ``api_mode`` keys, or ``None`` when resolution raises or yields no API
        key (both cases are logged as warnings).
    """
    try:
        from hermes_cli.runtime_provider import resolve_runtime_provider

        runtime = resolve_runtime_provider(requested=provider, target_model=model)
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "model_router: cannot resolve tier provider %r (model %r): %s — "
            "staying on current model",
            provider,
            model,
            exc,
        )
        return None

    key = runtime.get("api_key", "")
    if not key:
        logger.warning(
            "model_router: tier provider %r resolved but has no API key — "
            "staying on current model",
            provider,
        )
        return None

    # Prefer what the resolver reports for the provider, but keep the model
    # that was asked for whenever one was given.
    resolved_provider = runtime.get("provider") or provider
    resolved_model = model or runtime.get("model") or ""
    return {
        "provider": resolved_provider,
        "model": resolved_model,
        "base_url": runtime.get("base_url"),
        "api_key": key,
        "api_mode": runtime.get("api_mode"),
    }
|
||||
|
||||
|
||||
def _classifier_timeout(config: Optional[Dict[str, Any]]) -> float:
    """Return ``auxiliary.routing_classifier.timeout`` in seconds (default 20).

    When ``config`` is ``None`` the config is loaded here, so a caller that
    passes no config still gets the user's configured timeout. Anything
    unreadable, non-numeric, or not strictly positive falls back to 20s.
    """
    default = 20.0
    if config is None:
        try:
            from hermes_cli.config import load_config

            config = load_config()
        except Exception as exc:  # noqa: BLE001
            logger.debug("model_router: load_config failed: %s", exc)
            return default
    try:
        value = float(
            config.get("auxiliary", {})
            .get("routing_classifier", {})
            .get("timeout", default)
        )
    except Exception:  # noqa: BLE001
        return default
    # ``value > 0`` is also False for NaN.
    return value if value > 0 else default


def route(
    message: str,
    *,
    current_model: str,
    current_provider: str,
    config: Optional[Dict[str, Any]] = None,
    timeout: Optional[float] = None,
) -> Optional[RoutingDecision]:
    """Decide which model ``message`` should run on.

    Args:
        message: The request text to classify.
        current_model: Model id the caller is currently using.
        current_provider: Provider the caller is currently using; routing
            only engages when this is the Nous provider.
        config: Full config mapping; loaded on demand when ``None``.
        timeout: Classifier timeout in seconds. When ``None`` it is read from
            ``auxiliary.routing_classifier.timeout`` (default 20).

    Returns:
        A :class:`RoutingDecision` when the request should run on a different
        model than ``current_model``; ``None`` to stay put (routing disabled,
        current provider is not Nous, the tier has no configured model, the
        tier's model is already active, or credentials cannot be resolved).
    """
    routing_cfg = get_routing_config(config)
    if not routing_cfg.get("enabled"):
        return None

    # Nous Portal only: never route a session/parent that isn't already on
    # Nous, so the feature can't silently move a user onto another provider.
    if not _is_nous_provider(current_provider):
        logger.debug(
            "model_router: current provider %r is not Nous Portal — staying",
            current_provider,
        )
        return None

    if timeout is None:
        # Previously read from ``config or {}``, which ignored the configured
        # timeout whenever the caller passed no config.
        timeout = _classifier_timeout(config)

    tier, reason = classify_complexity(message, routing_cfg=routing_cfg, timeout=timeout)
    tier = _apply_min_tier_floor(tier, routing_cfg)

    model = _tier_model(tier, routing_cfg)
    if not model:
        # Tier intentionally maps to "stay on the current/parent model".
        logger.debug("model_router: tier %s has no target — staying", tier)
        return None

    # The provider is already known to be Nous (gated above), so a model
    # match alone means the right tier is active — make no change.
    if model == (current_model or "").strip():
        logger.debug("model_router: tier %s already active (%s) — no-op", tier, model)
        return None

    creds = _resolve_tier_credentials(NOUS_PROVIDER, model)
    if creds is None:
        return None

    return RoutingDecision(
        tier=tier,
        provider=creds["provider"],
        model=creds["model"],
        base_url=creds["base_url"],
        api_key=creds["api_key"],
        api_mode=creds["api_mode"],
        reason=reason,
    )
|
||||
@@ -1341,6 +1341,20 @@ DEFAULT_CONFIG = {
|
||||
"timeout": 600,
|
||||
"extra_body": {},
|
||||
},
|
||||
# Routing classifier — the cheap "picker" that smart_model_routing
|
||||
# consults to label an incoming request's complexity tier (light /
|
||||
# standard / heavy). Runs on the Nous Portal (smart routing is
|
||||
# Nous-only). Point this at a small, fast Portal model: it runs once
|
||||
# per fresh session and per delegated subtask, so an expensive model
|
||||
# here defeats the purpose. "auto" = use the main chat model.
|
||||
"routing_classifier": {
|
||||
"provider": "nous",
|
||||
"model": "google/gemini-3.5-flash",
|
||||
"base_url": "",
|
||||
"api_key": "",
|
||||
"timeout": 20,
|
||||
"extra_body": {},
|
||||
},
|
||||
},
|
||||
|
||||
"display": {
|
||||
@@ -1718,6 +1732,46 @@ DEFAULT_CONFIG = {
|
||||
"subagent_auto_approve": False,
|
||||
},
|
||||
|
||||
# Smart model routing — a cheap "picker" classifies an incoming request's
|
||||
# complexity tier and routes it to a tier-appropriate model. Mirrors the
|
||||
# Cursor "Auto" idea (right-size the model to the task) while respecting
|
||||
# Hermes' sacred prompt-cache: routing only ever happens at points where
|
||||
# there is no cached prefix to invalidate — at the START of a fresh
|
||||
# session (before the first API call) and at each delegate_task boundary
|
||||
# (subagents get fresh context). It never swaps the main model mid-
|
||||
# conversation (that is what `/model` is for, and it resets the cache).
|
||||
#
|
||||
# Nous Portal only: every tier runs on the Nous Portal, and routing only
|
||||
# engages when the active model is itself on Nous Portal (otherwise it is
|
||||
# a strict no-op — it never moves a non-Nous user onto Nous). The Portal
|
||||
# fronts the frontier models across vendors, so one credential covers
|
||||
# every tier.
|
||||
#
|
||||
# Off by default. The classifier runs via auxiliary.routing_classifier —
|
||||
# point that at a cheap, fast Portal model (see its comment above).
|
||||
"smart_model_routing": {
|
||||
"enabled": False, # master switch
|
||||
"apply_to_sessions": True, # route at the start of a fresh session
|
||||
"apply_to_delegation": True, # route delegated subtasks by their goal
|
||||
# Which Nous Portal model each complexity tier runs on. Leave a tier
|
||||
# empty to "stay on the current/parent model" — the natural baseline
|
||||
# for `standard`. Credentials resolve automatically from the Nous
|
||||
# provider, exactly like delegation.model.
|
||||
"tiers": {
|
||||
"light": "google/gemini-3.5-flash", # fast + cheap
|
||||
"standard": "", # empty = main model
|
||||
"heavy": "anthropic/claude-opus-4.8", # frontier
|
||||
},
|
||||
# Tier used when the classifier is unreachable or returns garbage.
|
||||
# Fail-open: a broken picker must never wedge a turn.
|
||||
"default_tier": "standard",
|
||||
# Quality-first guardrail: never route below this tier. Set to
|
||||
# "standard" to forbid the "light" tier entirely. Empty = no floor.
|
||||
"min_tier": "",
|
||||
# Surface the routing decision to the user (tier + chosen model).
|
||||
"announce": True,
|
||||
},
|
||||
|
||||
# Ephemeral prefill messages file — JSON list of {role, content} dicts
|
||||
# injected at the start of every API call for few-shot priming.
|
||||
# Never saved to sessions, logs, or trajectories.
|
||||
|
||||
329
tests/agent/test_model_router.py
Normal file
329
tests/agent/test_model_router.py
Normal file
@@ -0,0 +1,329 @@
|
||||
"""Tests for smart model routing (agent/model_router.py + wiring).
|
||||
|
||||
All tests are hermetic — the classifier and credential resolution are
|
||||
stubbed, so nothing hits the network. The invariants under test:
|
||||
|
||||
* routing is a strict no-op when disabled (the default),
|
||||
* routing is Nous-only — a non-Nous session is never touched, and every
|
||||
tier resolves through the Nous provider,
|
||||
* a tier that maps to the current model never triggers a switch
|
||||
(the cache-safety guarantee),
|
||||
* the min_tier floor is honored,
|
||||
* classification fails open to the default tier,
|
||||
* explicit delegation/model pins beat routing,
|
||||
* the session-start helper fires at most once and skips resumed sessions.
|
||||
"""
|
||||
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
from agent import model_router
|
||||
from agent.model_router import RoutingDecision
|
||||
|
||||
|
||||
def _cfg(**routing):
    """Build a full config dict with routing enabled, applying overrides.

    Keyword arguments replace the matching top-level keys of the
    ``smart_model_routing`` section (a ``tiers`` override replaces the whole
    tier mapping).
    """
    # Tiers are Nous Portal model ids (smart routing is Nous-only).
    defaults = {
        "enabled": True,
        "apply_to_sessions": True,
        "apply_to_delegation": True,
        "tiers": {
            "light": "google/gemini-3.5-flash",
            "standard": "",
            "heavy": "anthropic/claude-opus-4.8",
        },
        "default_tier": "standard",
        "min_tier": "",
        "announce": True,
    }
    return {"smart_model_routing": {**defaults, **routing}}
|
||||
|
||||
|
||||
# ── pure helpers ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_parse_tier_exact_and_embedded():
|
||||
assert model_router._parse_tier("heavy", "standard") == "heavy"
|
||||
assert model_router._parse_tier(" Light\n", "standard") == "light"
|
||||
assert model_router._parse_tier("I think this is standard work", "heavy") == "standard"
|
||||
|
||||
|
||||
def test_parse_tier_fails_open_to_default():
|
||||
assert model_router._parse_tier("", "standard") == "standard"
|
||||
assert model_router._parse_tier("banana", "heavy") == "heavy"
|
||||
|
||||
|
||||
def test_min_tier_floor_bumps_up():
|
||||
cfg = _cfg(min_tier="standard")["smart_model_routing"]
|
||||
assert model_router._apply_min_tier_floor("light", cfg) == "standard"
|
||||
assert model_router._apply_min_tier_floor("heavy", cfg) == "heavy"
|
||||
|
||||
|
||||
def test_min_tier_floor_ignores_invalid():
|
||||
cfg = _cfg(min_tier="bogus")["smart_model_routing"]
|
||||
assert model_router._apply_min_tier_floor("light", cfg) == "light"
|
||||
|
||||
|
||||
def test_tier_model_reads_config():
|
||||
cfg = _cfg()["smart_model_routing"]
|
||||
assert model_router._tier_model("light", cfg) == "google/gemini-3.5-flash"
|
||||
assert model_router._tier_model("standard", cfg) == ""
|
||||
|
||||
|
||||
def test_tier_model_accepts_legacy_dict_and_ignores_provider():
|
||||
cfg = _cfg(
|
||||
tiers={"heavy": {"provider": "anthropic", "model": "anthropic/claude-opus-4.8"}}
|
||||
)["smart_model_routing"]
|
||||
assert model_router._tier_model("heavy", cfg) == "anthropic/claude-opus-4.8"
|
||||
|
||||
|
||||
def test_is_nous_provider():
|
||||
assert model_router._is_nous_provider("nous")
|
||||
assert model_router._is_nous_provider(" Nous ")
|
||||
assert not model_router._is_nous_provider("openrouter")
|
||||
assert not model_router._is_nous_provider("")
|
||||
|
||||
|
||||
# ── route() behavior ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_route_disabled_is_noop():
|
||||
decision = model_router.route(
|
||||
"anything",
|
||||
current_model="openai/gpt-5.5",
|
||||
current_provider="nous",
|
||||
config=_cfg(enabled=False),
|
||||
)
|
||||
assert decision is None
|
||||
|
||||
|
||||
def test_route_noop_when_not_on_nous(monkeypatch):
|
||||
# Nous-only: an enabled router never touches a non-Nous session.
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("heavy", "x"))
|
||||
decision = model_router.route(
|
||||
"hard refactor",
|
||||
current_model="gpt-5.4",
|
||||
current_provider="openrouter",
|
||||
config=_cfg(),
|
||||
)
|
||||
assert decision is None
|
||||
|
||||
|
||||
def test_route_tier_with_no_target_stays(monkeypatch):
|
||||
# standard tier maps to empty → stay on current model.
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("standard", "x"))
|
||||
decision = model_router.route(
|
||||
"normal task",
|
||||
current_model="openai/gpt-5.5",
|
||||
current_provider="nous",
|
||||
config=_cfg(),
|
||||
)
|
||||
assert decision is None
|
||||
|
||||
|
||||
def test_route_noop_when_tier_matches_current(monkeypatch):
|
||||
# heavy tier resolves to the model we're already on → no switch (cache-safe).
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("heavy", "x"))
|
||||
decision = model_router.route(
|
||||
"hard refactor",
|
||||
current_model="anthropic/claude-opus-4.8",
|
||||
current_provider="nous",
|
||||
config=_cfg(),
|
||||
)
|
||||
assert decision is None
|
||||
|
||||
|
||||
def test_route_returns_decision_on_tier_change(monkeypatch):
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("heavy", "x"))
|
||||
monkeypatch.setattr(
|
||||
model_router,
|
||||
"_resolve_tier_credentials",
|
||||
lambda p, m: {"provider": "nous", "model": m,
|
||||
"base_url": "https://inference-api.nousresearch.com/v1",
|
||||
"api_key": "sk", "api_mode": None},
|
||||
)
|
||||
decision = model_router.route(
|
||||
"hard refactor",
|
||||
current_model="openai/gpt-5.5",
|
||||
current_provider="nous",
|
||||
config=_cfg(),
|
||||
)
|
||||
assert isinstance(decision, RoutingDecision)
|
||||
assert decision.tier == "heavy"
|
||||
assert decision.model == "anthropic/claude-opus-4.8"
|
||||
assert decision.provider == "nous"
|
||||
|
||||
|
||||
def test_route_resolves_tier_against_nous(monkeypatch):
|
||||
# The tier model is always resolved through the Nous provider.
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("light", "x"))
|
||||
captured = {}
|
||||
|
||||
def _fake_resolve(provider, model):
|
||||
captured["provider"] = provider
|
||||
captured["model"] = model
|
||||
return {"provider": provider, "model": model, "base_url": None,
|
||||
"api_key": "sk", "api_mode": None}
|
||||
|
||||
monkeypatch.setattr(model_router, "_resolve_tier_credentials", _fake_resolve)
|
||||
decision = model_router.route(
|
||||
"tiny edit",
|
||||
current_model="openai/gpt-5.5",
|
||||
current_provider="nous",
|
||||
config=_cfg(),
|
||||
)
|
||||
assert decision is not None
|
||||
assert captured["provider"] == model_router.NOUS_PROVIDER
|
||||
assert captured["model"] == "google/gemini-3.5-flash"
|
||||
|
||||
|
||||
def test_route_fails_open_when_credentials_unresolved(monkeypatch):
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("light", "x"))
|
||||
monkeypatch.setattr(model_router, "_resolve_tier_credentials", lambda p, m: None)
|
||||
decision = model_router.route(
|
||||
"tiny edit",
|
||||
current_model="openai/gpt-5.5",
|
||||
current_provider="nous",
|
||||
config=_cfg(),
|
||||
)
|
||||
assert decision is None
|
||||
|
||||
|
||||
def test_route_honors_min_tier(monkeypatch):
|
||||
# classifier says light, but min_tier=heavy forces heavy.
|
||||
monkeypatch.setattr(model_router, "classify_complexity", lambda *a, **k: ("light", "x"))
|
||||
captured = {}
|
||||
|
||||
def _fake_resolve(provider, model):
|
||||
captured["provider"] = provider
|
||||
captured["model"] = model
|
||||
return {"provider": provider, "model": model, "base_url": None,
|
||||
"api_key": "sk", "api_mode": None}
|
||||
|
||||
monkeypatch.setattr(model_router, "_resolve_tier_credentials", _fake_resolve)
|
||||
decision = model_router.route(
|
||||
"tiny edit",
|
||||
current_model="openai/gpt-5.5",
|
||||
current_provider="nous",
|
||||
config=_cfg(min_tier="heavy"),
|
||||
)
|
||||
assert decision is not None
|
||||
assert decision.tier == "heavy"
|
||||
assert captured["model"] == "anthropic/claude-opus-4.8"
|
||||
|
||||
|
||||
# ── classify_complexity fail-open ─────────────────────────────────────────
|
||||
|
||||
|
||||
def test_classify_fails_open_without_aux_client(monkeypatch):
|
||||
import agent.auxiliary_client as aux
|
||||
|
||||
monkeypatch.setattr(aux, "get_text_auxiliary_client", lambda task: (None, None))
|
||||
tier, reason = model_router.classify_complexity(
|
||||
"do something", routing_cfg=_cfg()["smart_model_routing"]
|
||||
)
|
||||
assert tier == "standard"
|
||||
assert "no auxiliary client" in reason
|
||||
|
||||
|
||||
def test_classify_empty_message_returns_default():
    """A whitespace-only message short-circuits to the configured default."""
    routing_cfg = _cfg(default_tier="heavy")["smart_model_routing"]
    tier, reason = model_router.classify_complexity("   ", routing_cfg=routing_cfg)
    assert tier == "heavy"
    # Pin the diagnostic too, so this test cannot pass via some other
    # fail-open path (e.g. a missing auxiliary client).
    assert reason == "empty message"
|
||||
|
||||
|
||||
# ── session-start wiring (_maybe_apply_session_routing) ───────────────────
|
||||
|
||||
|
||||
class _FakeAgent:
    """Minimal agent stand-in that records ``switch_model`` calls."""

    def __init__(self):
        self.model, self.provider = "openai/gpt-5.5", "nous"
        # Quiet mode suppresses the routing announcement in the helper.
        self.quiet_mode = True
        # Keyword arguments of the last switch_model call, if any.
        self.switched = None
        self._smart_routing_applied = False

    def switch_model(self, **overrides):
        """Record the call and adopt the requested model and provider."""
        self.switched = overrides
        self.model = overrides["new_model"]
        self.provider = overrides["new_provider"]
|
||||
|
||||
|
||||
def test_session_routing_skips_resumed_session(monkeypatch):
|
||||
from agent import conversation_loop
|
||||
|
||||
agent = _FakeAgent()
|
||||
# Non-empty history → must not classify or switch, but must mark applied.
|
||||
conversation_loop._maybe_apply_session_routing(agent, "hi", [{"role": "user", "content": "x"}])
|
||||
assert agent.switched is None
|
||||
assert agent._smart_routing_applied is True
|
||||
|
||||
|
||||
def test_session_routing_applies_once_and_switches(monkeypatch):
|
||||
from agent import conversation_loop
|
||||
|
||||
agent = _FakeAgent()
|
||||
monkeypatch.setattr(model_router, "get_routing_config", lambda config=None: _cfg()["smart_model_routing"])
|
||||
monkeypatch.setattr(
|
||||
model_router,
|
||||
"route",
|
||||
lambda *a, **k: RoutingDecision(
|
||||
tier="heavy", provider="nous", model="anthropic/claude-opus-4.8",
|
||||
base_url=None, api_key="sk", api_mode=None, reason="classified",
|
||||
),
|
||||
)
|
||||
conversation_loop._maybe_apply_session_routing(agent, "hard task", None)
|
||||
assert agent.switched is not None
|
||||
assert agent.model == "anthropic/claude-opus-4.8"
|
||||
assert agent._smart_routing_applied is True
|
||||
|
||||
# Second call must be a no-op (flag already set).
|
||||
agent.switched = None
|
||||
conversation_loop._maybe_apply_session_routing(agent, "another", None)
|
||||
assert agent.switched is None
|
||||
|
||||
|
||||
# ── delegation wiring (_route_task_creds) ─────────────────────────────────
|
||||
|
||||
|
||||
def test_delegation_routing_respects_explicit_model():
|
||||
from tools import delegate_tool
|
||||
|
||||
base = {"model": "pinned/model", "provider": "nous", "base_url": None,
|
||||
"api_key": None, "api_mode": None}
|
||||
parent = types.SimpleNamespace(model="openai/gpt-5.5", provider="nous")
|
||||
out = delegate_tool._route_task_creds(base, "anything", parent)
|
||||
assert out is base # unchanged — explicit delegation.model wins
|
||||
|
||||
|
||||
def test_delegation_routing_sets_model_when_unpinned(monkeypatch):
    """With no pinned model, the routed decision supplies model and provider."""
    from tools import delegate_tool

    def _fake_routing_config(config=None):
        # Serve the routing section of the shared test config.
        return _cfg()["smart_model_routing"]

    def _fake_route(*args, **kwargs):
        # Always classify as "light", whatever the goal is.
        return RoutingDecision(
            tier="light",
            provider="nous",
            model="google/gemini-3.5-flash",
            base_url=None,
            api_key="sk",
            api_mode=None,
            reason="classified",
        )

    monkeypatch.setattr(model_router, "get_routing_config", _fake_routing_config)
    monkeypatch.setattr(model_router, "route", _fake_route)

    # Every credential field starts out unset (None).
    unpinned_creds = dict.fromkeys(
        ("model", "provider", "base_url", "api_key", "api_mode")
    )
    parent_agent = types.SimpleNamespace(model="openai/gpt-5.5", provider="nous")

    routed = delegate_tool._route_task_creds(unpinned_creds, "tiny task", parent_agent)

    assert routed["model"] == "google/gemini-3.5-flash"
    assert routed["provider"] == "nous"
|
||||
|
||||
|
||||
def test_delegation_routing_noop_returns_base(monkeypatch):
    """When the router yields no decision, the base credentials are returned as-is."""
    from tools import delegate_tool

    def _fake_routing_config(config=None):
        # Serve the routing section of the shared test config.
        return _cfg()["smart_model_routing"]

    def _route_nothing(*args, **kwargs):
        # Simulate the router declining to pick a model.
        return None

    monkeypatch.setattr(model_router, "get_routing_config", _fake_routing_config)
    monkeypatch.setattr(model_router, "route", _route_nothing)

    unpinned_creds = dict.fromkeys(
        ("model", "provider", "base_url", "api_key", "api_mode")
    )
    parent_agent = types.SimpleNamespace(model="openai/gpt-5.5", provider="nous")

    result = delegate_tool._route_task_creds(unpinned_creds, "task", parent_agent)

    # Same object back: nothing was copied or modified.
    assert result is unpinned_creds
|
||||
@@ -2111,19 +2111,23 @@ def delegate_task(
|
||||
# Per-task role beats top-level; normalise again so unknown
|
||||
# per-task values warn and degrade to leaf uniformly.
|
||||
effective_role = _normalize_role(t.get("role") or top_role)
|
||||
# Smart model routing: pick a tier-appropriate model per task by
|
||||
# its goal (no-op unless smart_model_routing.enabled and delegation
|
||||
# didn't pin a model). Cache-safe — children start fresh.
|
||||
task_creds = _route_task_creds(creds, str(t.get("goal") or ""), parent_agent)
|
||||
child = _build_child_agent(
|
||||
task_index=i,
|
||||
goal=t["goal"],
|
||||
context=t.get("context"),
|
||||
toolsets=t.get("toolsets") or toolsets,
|
||||
model=creds["model"],
|
||||
model=task_creds["model"],
|
||||
max_iterations=effective_max_iter,
|
||||
task_count=n_tasks,
|
||||
parent_agent=parent_agent,
|
||||
override_provider=creds["provider"],
|
||||
override_base_url=creds["base_url"],
|
||||
override_api_key=creds["api_key"],
|
||||
override_api_mode=creds["api_mode"],
|
||||
override_provider=task_creds["provider"],
|
||||
override_base_url=task_creds["base_url"],
|
||||
override_api_key=task_creds["api_key"],
|
||||
override_api_mode=task_creds["api_mode"],
|
||||
override_acp_command=t.get("acp_command")
|
||||
or acp_command
|
||||
or creds.get("command"),
|
||||
@@ -2569,6 +2573,54 @@ def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _route_task_creds(base_creds: dict, goal: str, parent_agent) -> dict:
|
||||
"""Apply smart_model_routing to one delegated task's credentials.
|
||||
|
||||
Cache-safe by construction: subagents start from a fresh context, so
|
||||
picking a per-task model never invalidates any cached prefix. Only acts
|
||||
when delegation didn't already pin a model (explicit ``delegation.model``
|
||||
wins), ``smart_model_routing`` is enabled with ``apply_to_delegation``,
|
||||
and the parent is on the Nous Portal (routing is Nous-only). Fail-open:
|
||||
returns ``base_creds`` unchanged on any miss or error, so the child
|
||||
inherits the parent model exactly as before.
|
||||
"""
|
||||
if base_creds.get("model"):
|
||||
return base_creds # explicit delegation model wins over routing
|
||||
try:
|
||||
from agent import model_router
|
||||
|
||||
rcfg = model_router.get_routing_config()
|
||||
if not rcfg.get("enabled") or not rcfg.get("apply_to_delegation", True):
|
||||
return base_creds
|
||||
decision = model_router.route(
|
||||
goal or "",
|
||||
current_model=getattr(parent_agent, "model", "") or "",
|
||||
current_provider=getattr(parent_agent, "provider", "") or "",
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.debug("delegation routing: classification failed: %s", exc)
|
||||
return base_creds
|
||||
|
||||
if decision is None:
|
||||
return base_creds # no-op / stays on parent model
|
||||
|
||||
logger.info(
|
||||
"delegation routing: tier=%s → %s (%s)",
|
||||
decision.tier, decision.model, decision.provider,
|
||||
)
|
||||
routed = dict(base_creds)
|
||||
routed.update(
|
||||
{
|
||||
"model": decision.model,
|
||||
"provider": decision.provider,
|
||||
"base_url": decision.base_url,
|
||||
"api_key": decision.api_key,
|
||||
"api_mode": decision.api_mode,
|
||||
}
|
||||
)
|
||||
return routed
|
||||
|
||||
|
||||
def _load_config() -> dict:
|
||||
"""Load delegation config from CLI_CONFIG or persistent config.
|
||||
|
||||
|
||||
114
website/docs/user-guide/features/smart-model-routing.md
Normal file
114
website/docs/user-guide/features/smart-model-routing.md
Normal file
@@ -0,0 +1,114 @@
|
||||
---
title: Smart Model Routing
description: Auto-pick a tier-appropriate model per request without breaking your prompt cache.
sidebar_label: Smart Model Routing
sidebar_position: 9
---
|
||||
|
||||
# Smart Model Routing
|
||||
|
||||
Smart model routing is Hermes' take on a Cursor-style **"Auto"** model picker:
|
||||
a cheap classifier reads an incoming request, labels how much capability it
|
||||
needs (`light` / `standard` / `heavy`), and runs it on a model you've mapped to
|
||||
that tier. Hard tasks get a frontier model; trivial ones get something small
|
||||
and fast.
|
||||
|
||||
It is a **Nous Portal feature**: every tier runs on the
|
||||
[Nous Portal](https://portal.nousresearch.com), which fronts the frontier
|
||||
models across vendors (`anthropic/…`, `openai/…`, `google/…`, `x-ai/…`) behind
|
||||
a single credential — so one Portal key covers every tier. Routing only
|
||||
engages when your active model is itself on the Nous Portal; if you're on any
|
||||
other provider it stays out of the way entirely (it never moves you onto Nous).
|
||||
|
||||
It is **off by default**, and when on it is **prompt-cache-safe by design**.
|
||||
|
||||
## When does it route?
|
||||
|
||||
This is the part that makes it different from a naive "switch the model every
|
||||
turn" router. Hermes' per-conversation prompt caching is
|
||||
sacred: swapping the main model mid-conversation throws away the cached prefix
|
||||
and re-pays full input price on the new model — which, in a long thread, can
|
||||
cost *more* than it saves. So routing only ever happens where there is **no
|
||||
cached prefix to invalidate**:
|
||||
|
||||
| Where | What it does | Cache impact |
|-------|--------------|--------------|
| **Session start** | Classifies the first message of a *fresh* session and picks the model **before the first API call**. | None — nothing is cached yet. |
| **Delegation** | Classifies each `delegate_task` subtask's goal and picks the subagent's model. | None — subagents start from fresh context. |
|
||||
|
||||
It does **not** swap your main model mid-conversation. That remains the job of
|
||||
the explicit [`/model`](../../reference/slash-commands.md) command (which
|
||||
deliberately resets the cache). Resumed sessions are never re-routed.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
You need the **Nous Portal** configured as a provider (`hermes auth add nous`
|
||||
or `hermes model` → Nous Portal) and your main model running on it. Routing is
|
||||
Nous-only — it stays inert on every other provider.
|
||||
|
||||
## Enabling it
|
||||
|
||||
Add a `smart_model_routing` block to `~/.hermes/config.yaml`. Tiers are just
|
||||
Nous Portal model ids:
|
||||
|
||||
```yaml
smart_model_routing:
  enabled: true
  apply_to_sessions: true      # route at the start of a fresh session
  apply_to_delegation: true    # route delegated subtasks by their goal
  tiers:
    light: google/gemini-3.5-flash     # fast + cheap
    standard: ""                       # empty = stay on your main model
    heavy: anthropic/claude-opus-4.8   # frontier
  default_tier: standard       # used when the classifier can't be reached
  min_tier: ""                 # set to "standard" to forbid the light tier
  announce: true               # print the routing decision

# Point the picker at a small, fast Portal model — it runs once per fresh
# session and per delegated subtask, so an expensive classifier defeats the
# purpose.
auxiliary:
  routing_classifier:
    provider: nous
    model: google/gemini-3.5-flash
```
|
||||
|
||||
### Tiers
|
||||
|
||||
There are three ordered tiers — `light`, `standard`, `heavy`. Each maps to a
|
||||
**Nous Portal model id**; the provider is always Nous, so you only specify the
|
||||
model. Credentials (`base_url`, `api_key`, `api_mode`) resolve automatically
|
||||
from your Nous credential, exactly like [`delegation.model`](./delegation.md).
|
||||
Leave a tier empty to mean **"stay on the current/parent model"** — that's the
|
||||
natural baseline for `standard`.
|
||||
|
||||
### The classifier
|
||||
|
||||
The picker runs through the `auxiliary.routing_classifier` task (see
|
||||
[Auxiliary Models](../configuration.md#auxiliary-models)). It sends the request
|
||||
to the configured Portal model and asks for a one-word tier label. It is
|
||||
**fail-open**: if the classifier is unreachable, slow, or returns garbage,
|
||||
Hermes falls back to `default_tier` and never wedges your turn.
|
||||
|
||||
## Tuning
|
||||
|
||||
- **`min_tier`** is a quality-first guardrail. Set it to `standard` to forbid
  the `light` tier entirely, so the router can upgrade but never downgrade below
  your floor. Empty means no floor.
- **`default_tier`** is where requests land when classification fails. Keep it
  at `standard` (or higher) so a flaky classifier degrades toward quality.
- The classifier is told to **bias toward the higher tier when unsure** — the
  most common complaint about auto-routers is picking a weak model for a hard
  task, so the default leans conservative.
|
||||
|
||||
## Relationship to other model controls
|
||||
|
||||
| Feature | What it controls |
|---------|------------------|
| `smart_model_routing` | Auto-picks a tier model at session start / delegation. |
| [`/model`](../../reference/slash-commands.md) | Manual, explicit switch for the current session (always wins; resets cache). |
| [`fallback_providers`](./fallback-providers.md) | Failover when a model **errors** (rate limit, outage) — not task-based. |
| [`delegation.model`](./delegation.md) | Pins a fixed model for all subagents. An explicit pin **beats** routing. |
|
||||
|
||||
An explicit `delegation.model` always wins over delegation routing, and an
|
||||
explicit `/model` always wins over session routing.
|
||||
Reference in New Issue
Block a user