mirror of https://github.com/NousResearch/hermes-agent.git
synced 2026-05-05 10:17:17 +08:00

Compare commits
3 Commits: opencode-p… → hermes/her…

| Author | SHA1 | Date |
|---|---|---|
| | 2ef06a04bb | |
| | d2305cee21 | |
| | c31be913e1 | |
@@ -5,7 +5,7 @@ Instructions for AI coding assistants and developers working on the hermes-agent

## Development Environment

```bash
source .venv/bin/activate # ALWAYS activate before running Python
source venv/bin/activate # ALWAYS activate before running Python
```

## Project Structure
@@ -23,6 +23,7 @@ hermes-agent/
│   ├── prompt_caching.py      # Anthropic prompt caching
│   ├── auxiliary_client.py    # Auxiliary LLM client (vision, summarization)
│   ├── model_metadata.py      # Model context lengths, token estimation
│   ├── models_dev.py          # models.dev registry integration (provider-aware context)
│   ├── display.py             # KawaiiSpinner, tool preview formatting
│   ├── skill_commands.py      # Skill slash commands (shared CLI/gateway)
│   └── trajectory.py          # Trajectory saving helpers
@@ -377,7 +378,7 @@ The `_isolate_hermes_home` autouse fixture in `tests/conftest.py` redirects `HER

## Testing

```bash
source .venv/bin/activate
source venv/bin/activate
python -m pytest tests/ -q # Full suite (~3000 tests, ~3 min)
python -m pytest tests/test_model_tools.py -q # Toolset resolution
python -m pytest tests/test_cli_init.py -q # CLI config loading
@@ -146,8 +146,8 @@ git clone https://github.com/NousResearch/hermes-agent.git
cd hermes-agent
git submodule update --init mini-swe-agent # required terminal backend
curl -LsSf https://astral.sh/uv/install.sh | sh
uv venv .venv --python 3.11
source .venv/bin/activate
uv venv venv --python 3.11
source venv/bin/activate
uv pip install -e ".[all,dev]"
uv pip install -e "./mini-swe-agent"
python -m pytest tests/ -q
@@ -47,10 +47,12 @@ class ContextCompressor:
        base_url: str = "",
        api_key: str = "",
        config_context_length: int | None = None,
        provider: str = "",
    ):
        self.model = model
        self.base_url = base_url
        self.api_key = api_key
        self.provider = provider
        self.threshold_percent = threshold_percent
        self.protect_first_n = protect_first_n
        self.protect_last_n = protect_last_n

@@ -60,6 +62,7 @@ class ContextCompressor:
        self.context_length = get_model_context_length(
            model, base_url=base_url, api_key=api_key,
            config_context_length=config_context_length,
            provider=provider,
        )
        self.threshold_tokens = int(self.context_length * threshold_percent)
        self.compression_count = 0
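
A quick sanity check of the threshold arithmetic above, with hypothetical numbers (the real values come from the detected context length and the configured threshold):

```python
# Hypothetical values, purely illustrative of the formula above.
context_length = 200_000        # e.g. a 200K-token window
threshold_percent = 0.85        # compact once 85% of the window is used
threshold_tokens = int(context_length * threshold_percent)
assert threshold_tokens == 170_000
```
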
@@ -612,3 +612,95 @@ def write_tty(text: str) -> None:
    except OSError:
        sys.stdout.write(text)
        sys.stdout.flush()


# =========================================================================
# Context pressure display (CLI user-facing warnings)
# =========================================================================

# ANSI color codes for context pressure tiers
_CYAN = "\033[36m"
_YELLOW = "\033[33m"
_BOLD = "\033[1m"
_DIM_ANSI = "\033[2m"

# Bar characters
_BAR_FILLED = "▰"
_BAR_EMPTY = "▱"
_BAR_WIDTH = 20


def format_context_pressure(
    compaction_progress: float,
    threshold_tokens: int,
    threshold_percent: float,
    compression_enabled: bool = True,
) -> str:
    """Build a formatted context pressure line for CLI display.

    The bar and percentage show progress toward the compaction threshold,
    NOT the raw context window. 100% = compaction fires.

    Uses ANSI colors:
    - cyan at ~60% to compaction = informational
    - bold yellow at ~85% to compaction = warning

    Args:
        compaction_progress: How close to compaction (0.0–1.0, 1.0 = fires).
        threshold_tokens: Compaction threshold in tokens.
        threshold_percent: Compaction threshold as a fraction of context window.
        compression_enabled: Whether auto-compression is active.
    """
    pct_int = int(compaction_progress * 100)
    filled = min(int(compaction_progress * _BAR_WIDTH), _BAR_WIDTH)
    bar = _BAR_FILLED * filled + _BAR_EMPTY * (_BAR_WIDTH - filled)

    threshold_k = f"{threshold_tokens // 1000}k" if threshold_tokens >= 1000 else str(threshold_tokens)
    threshold_pct_int = int(threshold_percent * 100)

    # Tier styling
    if compaction_progress >= 0.85:
        color = f"{_BOLD}{_YELLOW}"
        icon = "⚠"
        if compression_enabled:
            hint = "compaction imminent"
        else:
            hint = "no auto-compaction"
    else:
        color = _CYAN
        icon = "◐"
        hint = "approaching compaction"

    return (
        f" {color}{icon} context {bar} {pct_int}% to compaction{_ANSI_RESET}"
        f" {_DIM_ANSI}{threshold_k} threshold ({threshold_pct_int}%) · {hint}{_ANSI_RESET}"
    )
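
A sketch of what this formatter produces, with hypothetical inputs (the real caller derives them from the live conversation):

```python
line = format_context_pressure(
    compaction_progress=0.72,   # 72% of the way to compaction
    threshold_tokens=170_000,
    threshold_percent=0.85,
)
# Ignoring the ANSI escapes, the line reads roughly:
#   ◐ context ▰▰▰▰▰▰▰▰▰▰▰▰▰▰▱▱▱▱▱▱ 72% to compaction 170k threshold (85%) · approaching compaction
```
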


def format_context_pressure_gateway(
    compaction_progress: float,
    threshold_percent: float,
    compression_enabled: bool = True,
) -> str:
    """Build a plain-text context pressure notification for messaging platforms.

    No ANSI — just Unicode and plain text suitable for Telegram/Discord/etc.
    The percentage shows progress toward the compaction threshold.
    """
    pct_int = int(compaction_progress * 100)
    filled = min(int(compaction_progress * _BAR_WIDTH), _BAR_WIDTH)
    bar = _BAR_FILLED * filled + _BAR_EMPTY * (_BAR_WIDTH - filled)

    threshold_pct_int = int(threshold_percent * 100)

    if compaction_progress >= 0.85:
        icon = "⚠️"
        if compression_enabled:
            hint = f"Context compaction is imminent (threshold: {threshold_pct_int}% of window)."
        else:
            hint = "Auto-compaction is disabled — context may be truncated."
    else:
        icon = "ℹ️"
        hint = f"Compaction threshold is at {threshold_pct_int}% of context window."

    return f"{icon} Context: {bar} {pct_int}% to compaction\n{hint}"

@@ -19,6 +19,34 @@ from hermes_constants import OPENROUTER_MODELS_URL

logger = logging.getLogger(__name__)

# Provider names that can appear as a "provider:" prefix before a model ID.
# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
# are preserved so the full model name reaches cache lookups and server queries.
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
    "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
    "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
    "custom", "local",
    # Common aliases
    "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
    "github-models", "kimi", "moonshot", "claude", "deep-seek",
    "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
})


def _strip_provider_prefix(model: str) -> str:
    """Strip a recognised provider prefix from a model string.

    ``"local:my-model"`` → ``"my-model"``
    ``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix)
    """
    if ":" not in model or model.startswith("http"):
        return model
    prefix = model.split(":", 1)[0].strip().lower()
    if prefix in _PROVIDER_PREFIXES:
        return model.split(":", 1)[1]
    return model
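
The stripping behaviour, restated as quick checks that follow directly from the code above:

```python
assert _strip_provider_prefix("local:my-model") == "my-model"
assert _strip_provider_prefix("openrouter:anthropic/claude-sonnet-4.5") == "anthropic/claude-sonnet-4.5"
assert _strip_provider_prefix("qwen3.5:27b") == "qwen3.5:27b"   # Ollama tag, not a provider prefix
```
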

_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
_model_metadata_cache_time: float = 0
_MODEL_CACHE_TTL = 3600
@@ -27,104 +55,52 @@ _endpoint_model_metadata_cache_time: Dict[str, float] = {}
_ENDPOINT_MODEL_CACHE_TTL = 300

# Descending tiers for context length probing when the model is unknown.
# We start high and step down on context-length errors until one works.
# We start at 128K (a safe default for most modern models) and step down
# on context-length errors until one works.
CONTEXT_PROBE_TIERS = [
    2_000_000,
    1_000_000,
    512_000,
    200_000,
    128_000,
    64_000,
    32_000,
    16_000,
    8_000,
]
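
The probe-down behaviour described in the comment is, roughly, the loop below. This is a minimal sketch rather than the agent's actual retry code; `call_model` and `is_context_length_error` are hypothetical stand-ins.

```python
def probe_context_length(call_model, is_context_length_error, messages) -> int:
    """Illustrative only: walk the tiers from largest to smallest until one fits."""
    for tier in CONTEXT_PROBE_TIERS:
        try:
            call_model(messages, max_context=tier)   # hypothetical request helper
            return tier                              # this tier was accepted
        except Exception as exc:
            if not is_context_length_error(exc):     # hypothetical error classifier
                raise                                # unrelated failure, re-raise
    return CONTEXT_PROBE_TIERS[-1]                   # nothing fit: assume the smallest tier
```
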

# Default context length when no detection method succeeds.
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]

# Thin fallback defaults — only broad model family patterns.
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
# all miss. Replaced the previous 80+ entry dict.
# For provider-specific context lengths, models.dev is the primary source.
DEFAULT_CONTEXT_LENGTHS = {
|
||||
"anthropic/claude-opus-4": 200000,
|
||||
"anthropic/claude-opus-4.5": 200000,
|
||||
"anthropic/claude-opus-4.6": 200000,
|
||||
"anthropic/claude-sonnet-4": 200000,
|
||||
"anthropic/claude-sonnet-4-20250514": 200000,
|
||||
"anthropic/claude-sonnet-4.5": 200000,
|
||||
"anthropic/claude-sonnet-4.6": 200000,
|
||||
"anthropic/claude-haiku-4.5": 200000,
|
||||
# Bare Anthropic model IDs (for native API provider)
|
||||
"claude-opus-4-6": 200000,
|
||||
"claude-sonnet-4-6": 200000,
|
||||
"claude-opus-4-5-20251101": 200000,
|
||||
"claude-sonnet-4-5-20250929": 200000,
|
||||
"claude-opus-4-1-20250805": 200000,
|
||||
"claude-opus-4-20250514": 200000,
|
||||
"claude-sonnet-4-20250514": 200000,
|
||||
"claude-haiku-4-5-20251001": 200000,
|
||||
"openai/gpt-5": 128000,
|
||||
"openai/gpt-4.1": 1047576,
|
||||
"openai/gpt-4.1-mini": 1047576,
|
||||
"openai/gpt-4o": 128000,
|
||||
"openai/gpt-4-turbo": 128000,
|
||||
"openai/gpt-4o-mini": 128000,
|
||||
"google/gemini-3-pro-preview": 1048576,
|
||||
"google/gemini-3-flash": 1048576,
|
||||
"google/gemini-2.5-flash": 1048576,
|
||||
"google/gemini-2.0-flash": 1048576,
|
||||
"google/gemini-2.5-pro": 1048576,
|
||||
"deepseek/deepseek-v3.2": 65536,
|
||||
"meta-llama/llama-3.3-70b-instruct": 131072,
|
||||
"deepseek/deepseek-chat-v3": 65536,
|
||||
"qwen/qwen-2.5-72b-instruct": 32768,
|
||||
"glm-4.7": 202752,
|
||||
"glm-5": 202752,
|
||||
"glm-4.5": 131072,
|
||||
"glm-4.5-flash": 131072,
|
||||
"kimi-for-coding": 262144,
|
||||
"kimi-k2.5": 262144,
|
||||
"kimi-k2-thinking": 262144,
|
||||
"kimi-k2-thinking-turbo": 262144,
|
||||
"kimi-k2-turbo-preview": 262144,
|
||||
"kimi-k2-0905-preview": 131072,
|
||||
"MiniMax-M2.7": 204800,
|
||||
"MiniMax-M2.7-highspeed": 204800,
|
||||
"MiniMax-M2.5": 204800,
|
||||
"MiniMax-M2.5-highspeed": 204800,
|
||||
"MiniMax-M2.1": 204800,
|
||||
# OpenCode Zen models
|
||||
"gpt-5.4-pro": 128000,
|
||||
"gpt-5.4": 128000,
|
||||
"gpt-5.3-codex": 128000,
|
||||
"gpt-5.3-codex-spark": 128000,
|
||||
"gpt-5.2": 128000,
|
||||
"gpt-5.2-codex": 128000,
|
||||
"gpt-5.1": 128000,
|
||||
"gpt-5.1-codex": 128000,
|
||||
"gpt-5.1-codex-max": 128000,
|
||||
"gpt-5.1-codex-mini": 128000,
|
||||
# Anthropic Claude 4.6 (1M context) — bare IDs only to avoid
|
||||
# fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
|
||||
# substring of "anthropic/claude-sonnet-4.6").
|
||||
# OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
|
||||
"claude-opus-4-6": 1000000,
|
||||
"claude-sonnet-4-6": 1000000,
|
||||
"claude-opus-4.6": 1000000,
|
||||
"claude-sonnet-4.6": 1000000,
|
||||
# Catch-all for older Claude models (must sort after specific entries)
|
||||
"claude": 200000,
|
||||
# OpenAI
|
||||
"gpt-4.1": 1047576,
|
||||
"gpt-5": 128000,
|
||||
"gpt-5-codex": 128000,
|
||||
"gpt-5-nano": 128000,
|
||||
# Bare model IDs without provider prefix (avoid duplicates with entries above)
|
||||
"claude-opus-4-5": 200000,
|
||||
"claude-opus-4-1": 200000,
|
||||
"claude-sonnet-4-5": 200000,
|
||||
"claude-sonnet-4": 200000,
|
||||
"claude-haiku-4-5": 200000,
|
||||
"claude-3-5-haiku": 200000,
|
||||
"gemini-3.1-pro": 1048576,
|
||||
"gemini-3-pro": 1048576,
|
||||
"gemini-3-flash": 1048576,
|
||||
"minimax-m2.5": 204800,
|
||||
"minimax-m2.5-free": 204800,
|
||||
"minimax-m2.1": 204800,
|
||||
"glm-4.6": 202752,
|
||||
"kimi-k2": 262144,
|
||||
"qwen3-coder": 32768,
|
||||
"big-pickle": 128000,
|
||||
# Alibaba Cloud / DashScope Qwen models
|
||||
"qwen3.5-plus": 131072,
|
||||
"qwen3-max": 131072,
|
||||
"qwen3-coder-plus": 131072,
|
||||
"qwen3-coder-next": 131072,
|
||||
"qwen-plus-latest": 131072,
|
||||
"qwen3.5-flash": 131072,
|
||||
"qwen-vl-max": 32768,
|
||||
"gpt-4": 128000,
|
||||
# Google
|
||||
"gemini": 1048576,
|
||||
# DeepSeek
|
||||
"deepseek": 128000,
|
||||
# Meta
|
||||
"llama": 131072,
|
||||
# Qwen
|
||||
"qwen": 131072,
|
||||
# MiniMax
|
||||
"minimax": 204800,
|
||||
# GLM
|
||||
"glm": 202752,
|
||||
# Kimi
|
||||
"kimi": 262144,
|
||||
}

_CONTEXT_LENGTH_KEYS = (

@@ -146,6 +122,9 @@ _MAX_COMPLETION_KEYS = (
    "max_tokens",
)

# Local server hostnames / address patterns
_LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")


def _normalize_base_url(base_url: str) -> str:
    return (base_url or "").strip().rstrip("/")
@@ -178,6 +157,99 @@ def _is_known_provider_base_url(base_url: str) -> bool:
    return any(known_host in host for known_host in known_hosts)


def is_local_endpoint(base_url: str) -> bool:
    """Return True if base_url points to a local machine (localhost / RFC-1918 / WSL)."""
    normalized = _normalize_base_url(base_url)
    if not normalized:
        return False
    url = normalized if "://" in normalized else f"http://{normalized}"
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except Exception:
        return False
    if host in _LOCAL_HOSTS:
        return True
    # RFC-1918 private ranges and link-local
    import ipaddress
    try:
        addr = ipaddress.ip_address(host)
        return addr.is_private or addr.is_loopback or addr.is_link_local
    except ValueError:
        pass
    # Bare IP that looks like a private range (e.g. 172.26.x.x for WSL)
    parts = host.split(".")
    if len(parts) == 4:
        try:
            first, second = int(parts[0]), int(parts[1])
            if first == 10:
                return True
            if first == 172 and 16 <= second <= 31:
                return True
            if first == 192 and second == 168:
                return True
        except ValueError:
            pass
    return False
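
A few illustrative inputs (the addresses are examples, not defaults):

```python
assert is_local_endpoint("http://localhost:11434/v1")
assert is_local_endpoint("192.168.1.50:8080")            # scheme optional, RFC-1918 range
assert is_local_endpoint("http://172.26.3.4:1234/v1")    # WSL-style private address
assert not is_local_endpoint("https://api.openai.com/v1")
```
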


def detect_local_server_type(base_url: str) -> Optional[str]:
    """Detect which local server is running at base_url by probing known endpoints.

    Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
    """
    import httpx

    normalized = _normalize_base_url(base_url)
    server_url = normalized
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]

    try:
        with httpx.Client(timeout=2.0) as client:
            # LM Studio exposes /api/v1/models — check first (most specific)
            try:
                r = client.get(f"{server_url}/api/v1/models")
                if r.status_code == 200:
                    return "lm-studio"
            except Exception:
                pass
            # Ollama exposes /api/tags and responds with {"models": [...]}
            # LM Studio returns {"error": "Unexpected endpoint"} with status 200
            # on this path, so we must verify the response contains "models".
            try:
                r = client.get(f"{server_url}/api/tags")
                if r.status_code == 200:
                    try:
                        data = r.json()
                        if "models" in data:
                            return "ollama"
                    except Exception:
                        pass
            except Exception:
                pass
            # llama.cpp exposes /props
            try:
                r = client.get(f"{server_url}/props")
                if r.status_code == 200 and "default_generation_settings" in r.text:
                    return "llamacpp"
            except Exception:
                pass
            # vLLM: /version
            try:
                r = client.get(f"{server_url}/version")
                if r.status_code == 200:
                    data = r.json()
                    if "version" in data:
                        return "vllm"
            except Exception:
                pass
    except Exception:
        pass

    return None
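
Typical usage, assuming the common default ports for these servers; the result depends entirely on what actually answers the probes:

```python
detect_local_server_type("http://localhost:11434/v1")  # "ollama" if /api/tags returns {"models": [...]}
detect_local_server_type("http://localhost:1234/v1")   # "lm-studio" if /api/v1/models responds
detect_local_server_type("http://localhost:8000/v1")   # "vllm" if /version reports a version
```
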


def _iter_nested_dicts(value: Any):
    if isinstance(value, dict):
        yield value
@@ -383,7 +455,7 @@ def _get_context_cache_path() -> Path:


def _load_context_cache() -> Dict[str, int]:
    """Load the model+provider → context_length cache from disk."""
    """Load the model+provider -> context_length cache from disk."""
    path = _get_context_cache_path()
    if not path.exists():
        return {}

@@ -412,7 +484,7 @@ def save_context_length(model: str, base_url: str, length: int) -> None:
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as f:
            yaml.dump({"context_lengths": cache}, f, default_flow_style=False)
        logger.info("Cached context length %s → %s tokens", key, f"{length:,}")
        logger.info("Cached context length %s -> %s tokens", key, f"{length:,}")
    except Exception as e:
        logger.debug("Failed to save context length cache: %s", e)
@@ -460,26 +532,219 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    return None


def _model_id_matches(candidate_id: str, lookup_model: str) -> bool:
    """Return True if *candidate_id* (from server) matches *lookup_model* (configured).

    Supports two forms:
    - Exact match: "nvidia-nemotron-super-49b-v1" == "nvidia-nemotron-super-49b-v1"
    - Slug match: "nvidia/nvidia-nemotron-super-49b-v1" matches "nvidia-nemotron-super-49b-v1"
      (the part after the last "/" equals lookup_model)

    This covers LM Studio's native API which stores models as "publisher/slug"
    while users typically configure only the slug after the "local:" prefix.
    """
    if candidate_id == lookup_model:
        return True
    # Slug match: basename of candidate equals the lookup name
    if "/" in candidate_id and candidate_id.rsplit("/", 1)[1] == lookup_model:
        return True
    return False
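
The two match forms, restated as quick checks mirroring the docstring's examples:

```python
assert _model_id_matches("nvidia-nemotron-super-49b-v1", "nvidia-nemotron-super-49b-v1")         # exact
assert _model_id_matches("nvidia/nvidia-nemotron-super-49b-v1", "nvidia-nemotron-super-49b-v1")  # slug
assert not _model_id_matches("publisher/other-model", "nvidia-nemotron-super-49b-v1")
```
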
|
||||
|
||||
|
||||
def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
||||
"""Query a local server for the model's context length."""
|
||||
import httpx
|
||||
|
||||
# Strip recognised provider prefix (e.g., "local:model-name" → "model-name").
|
||||
# Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved.
|
||||
model = _strip_provider_prefix(model)
|
||||
|
||||
# Strip /v1 suffix to get the server root
|
||||
server_url = base_url.rstrip("/")
|
||||
if server_url.endswith("/v1"):
|
||||
server_url = server_url[:-3]
|
||||
|
||||
try:
|
||||
server_type = detect_local_server_type(base_url)
|
||||
except Exception:
|
||||
server_type = None
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=3.0) as client:
|
||||
# Ollama: /api/show returns model details with context info
|
||||
if server_type == "ollama":
|
||||
resp = client.post(f"{server_url}/api/show", json={"name": model})
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# Check model_info for context length
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
# Check parameters string for num_ctx
|
||||
params = data.get("parameters", "")
|
||||
if "num_ctx" in params:
|
||||
for line in params.split("\n"):
|
||||
if "num_ctx" in line:
|
||||
parts = line.strip().split()
|
||||
if len(parts) >= 2:
|
||||
try:
|
||||
return int(parts[-1])
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# LM Studio native API: /api/v1/models returns max_context_length.
|
||||
# This is more reliable than the OpenAI-compat /v1/models which
|
||||
# doesn't include context window information for LM Studio servers.
|
||||
# Use _model_id_matches for fuzzy matching: LM Studio stores models as
|
||||
# "publisher/slug" but users configure only "slug" after "local:" prefix.
|
||||
if server_type == "lm-studio":
|
||||
resp = client.get(f"{server_url}/api/v1/models")
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
for m in data.get("models", []):
|
||||
if _model_id_matches(m.get("key", ""), model) or _model_id_matches(m.get("id", ""), model):
|
||||
# Prefer loaded instance context (actual runtime value)
|
||||
for inst in m.get("loaded_instances", []):
|
||||
cfg = inst.get("config", {})
|
||||
ctx = cfg.get("context_length")
|
||||
if ctx and isinstance(ctx, (int, float)):
|
||||
return int(ctx)
|
||||
# Fall back to max_context_length (theoretical model max)
|
||||
ctx = m.get("max_context_length") or m.get("context_length")
|
||||
if ctx and isinstance(ctx, (int, float)):
|
||||
return int(ctx)
|
||||
|
||||
# LM Studio / vLLM / llama.cpp: try /v1/models/{model}
|
||||
resp = client.get(f"{server_url}/v1/models/{model}")
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# vLLM returns max_model_len
|
||||
ctx = data.get("max_model_len") or data.get("context_length") or data.get("max_tokens")
|
||||
if ctx and isinstance(ctx, (int, float)):
|
||||
return int(ctx)
|
||||
|
||||
# Try /v1/models and find the model in the list.
|
||||
# Use _model_id_matches to handle "publisher/slug" vs bare "slug".
|
||||
resp = client.get(f"{server_url}/v1/models")
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
models_list = data.get("data", [])
|
||||
for m in models_list:
|
||||
if _model_id_matches(m.get("id", ""), model):
|
||||
ctx = m.get("max_model_len") or m.get("context_length") or m.get("max_tokens")
|
||||
if ctx and isinstance(ctx, (int, float)):
|
||||
return int(ctx)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _normalize_model_version(model: str) -> str:
|
||||
"""Normalize version separators for matching.
|
||||
|
||||
Nous uses dashes: claude-opus-4-6, claude-sonnet-4-5
|
||||
OpenRouter uses dots: claude-opus-4.6, claude-sonnet-4.5
|
||||
Normalize both to dashes for comparison.
|
||||
"""
|
||||
return model.replace(".", "-")
|
||||
|
||||
|
||||
def _query_anthropic_context_length(model: str, base_url: str, api_key: str) -> Optional[int]:
|
||||
"""Query Anthropic's /v1/models endpoint for context length.
|
||||
|
||||
Only works with regular ANTHROPIC_API_KEY (sk-ant-api*).
|
||||
OAuth tokens (sk-ant-oat*) from Claude Code return 401.
|
||||
"""
|
||||
if not api_key or api_key.startswith("sk-ant-oat"):
|
||||
return None # OAuth tokens can't access /v1/models
|
||||
try:
|
||||
base = base_url.rstrip("/")
|
||||
if base.endswith("/v1"):
|
||||
base = base[:-3]
|
||||
url = f"{base}/v1/models?limit=1000"
|
||||
headers = {
|
||||
"x-api-key": api_key,
|
||||
"anthropic-version": "2023-06-01",
|
||||
}
|
||||
resp = requests.get(url, headers=headers, timeout=10)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
data = resp.json()
|
||||
for m in data.get("data", []):
|
||||
if m.get("id") == model:
|
||||
ctx = m.get("max_input_tokens")
|
||||
if isinstance(ctx, int) and ctx > 0:
|
||||
return ctx
|
||||
except Exception as e:
|
||||
logger.debug("Anthropic /v1/models query failed: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_nous_context_length(model: str) -> Optional[int]:
|
||||
"""Resolve Nous Portal model context length via OpenRouter metadata.
|
||||
|
||||
Nous model IDs are bare (e.g. 'claude-opus-4-6') while OpenRouter uses
|
||||
prefixed IDs (e.g. 'anthropic/claude-opus-4.6'). Try suffix matching
|
||||
with version normalization (dot↔dash).
|
||||
"""
|
||||
metadata = fetch_model_metadata() # OpenRouter cache
|
||||
# Exact match first
|
||||
if model in metadata:
|
||||
return metadata[model].get("context_length")
|
||||
|
||||
normalized = _normalize_model_version(model).lower()
|
||||
|
||||
for or_id, entry in metadata.items():
|
||||
bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
|
||||
if bare.lower() == model.lower() or _normalize_model_version(bare).lower() == normalized:
|
||||
return entry.get("context_length")
|
||||
|
||||
# Partial prefix match for cases like gemini-3-flash → gemini-3-flash-preview
|
||||
# Require match to be at a word boundary (followed by -, :, or end of string)
|
||||
model_lower = model.lower()
|
||||
for or_id, entry in metadata.items():
|
||||
bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
|
||||
for candidate, query in [(bare.lower(), model_lower), (_normalize_model_version(bare).lower(), normalized)]:
|
||||
if candidate.startswith(query) and (
|
||||
len(candidate) == len(query) or candidate[len(query)] in "-:."
|
||||
):
|
||||
return entry.get("context_length")
|
||||
|
||||
return None


def get_model_context_length(
    model: str,
    base_url: str = "",
    api_key: str = "",
    config_context_length: int | None = None,
    provider: str = "",
) -> int:
    """Get the context length for a model.

    Resolution order:
    0. Explicit config override (model.context_length in config.yaml)
    0. Explicit config override (model.context_length or custom_providers per-model)
    1. Persistent cache (previously discovered via probing)
    2. Active endpoint metadata (/models for explicit custom endpoints)
    3. OpenRouter API metadata
    4. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match for hosted routes only)
    5. First probe tier (2M) — will be narrowed on first context error
    3. Local server query (for local endpoints)
    4. Anthropic /v1/models API (API-key users only, not OAuth)
    5. OpenRouter live API metadata
    6. Nous suffix-match via OpenRouter cache
    7. models.dev registry lookup (provider-aware)
    8. Thin hardcoded defaults (broad family patterns)
    9. Default fallback (128K)
    """
    # 0. Explicit config override — user knows best
    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
        return config_context_length

    # Normalise provider-prefixed model names (e.g. "local:model-name" →
    # "model-name") so cache lookups and server queries use the bare ID that
    # local servers actually know about. Ollama "model:tag" colons are preserved.
    model = _strip_provider_prefix(model)

    # 1. Check persistent cache (model+provider)
    if base_url:
        cached = get_cached_context_length(model, base_url)
@@ -505,30 +770,63 @@ def get_model_context_length(
|
||||
if isinstance(context_length, int):
|
||||
return context_length
|
||||
if not _is_known_provider_base_url(base_url):
|
||||
# Explicit third-party endpoints should not borrow fuzzy global
|
||||
# defaults from unrelated providers with similarly named models.
|
||||
# 3. Try querying local server directly
|
||||
if is_local_endpoint(base_url):
|
||||
local_ctx = _query_local_context_length(model, base_url)
|
||||
if local_ctx and local_ctx > 0:
|
||||
save_context_length(model, base_url, local_ctx)
|
||||
return local_ctx
|
||||
logger.info(
|
||||
"Could not detect context length for model %r at %s — "
|
||||
"defaulting to %s tokens (probe-down). Set model.context_length "
|
||||
"in config.yaml to override.",
|
||||
model, base_url, f"{CONTEXT_PROBE_TIERS[0]:,}",
|
||||
model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}",
|
||||
)
|
||||
return CONTEXT_PROBE_TIERS[0]
|
||||
return DEFAULT_FALLBACK_CONTEXT
|
||||
|
||||
# 3. OpenRouter API metadata
|
||||
# 4. Anthropic /v1/models API (only for regular API keys, not OAuth)
|
||||
if provider == "anthropic" or (
|
||||
base_url and "api.anthropic.com" in base_url
|
||||
):
|
||||
ctx = _query_anthropic_context_length(model, base_url or "https://api.anthropic.com", api_key)
|
||||
if ctx:
|
||||
return ctx
|
||||
|
||||
# 5. Provider-aware lookups (before generic OpenRouter cache)
|
||||
# These are provider-specific and take priority over the generic OR cache,
|
||||
# since the same model can have different context limits per provider
|
||||
# (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
|
||||
if provider == "nous":
|
||||
ctx = _resolve_nous_context_length(model)
|
||||
if ctx:
|
||||
return ctx
|
||||
elif provider:
|
||||
from agent.models_dev import lookup_models_dev_context
|
||||
ctx = lookup_models_dev_context(provider, model)
|
||||
if ctx:
|
||||
return ctx
|
||||
|
||||
# 6. OpenRouter live API metadata (provider-unaware fallback)
|
||||
metadata = fetch_model_metadata()
|
||||
if model in metadata:
|
||||
return metadata[model].get("context_length", 128000)
|
||||
|
||||
# 4. Hardcoded defaults (fuzzy match — longest key first for specificity)
|
||||
# 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
|
||||
for default_model, length in sorted(
|
||||
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
|
||||
):
|
||||
if default_model in model or model in default_model:
|
||||
return length
|
||||
|
||||
# 5. Unknown model — start at highest probe tier
|
||||
return CONTEXT_PROBE_TIERS[0]
|
||||
# 9. Query local server as last resort
|
||||
if base_url and is_local_endpoint(base_url):
|
||||
local_ctx = _query_local_context_length(model, base_url)
|
||||
if local_ctx and local_ctx > 0:
|
||||
save_context_length(model, base_url, local_ctx)
|
||||
return local_ctx
|
||||
|
||||
# 10. Default fallback — 128K
|
||||
return DEFAULT_FALLBACK_CONTEXT
|
||||
|
||||
|
||||
def estimate_tokens_rough(text: str) -> int:
|
||||
|
||||
agent/models_dev.py (new file, 170 lines)
@@ -0,0 +1,170 @@
"""Models.dev registry integration for provider-aware context length detection.

Fetches model metadata from https://models.dev/api.json — a community-maintained
database of 3800+ models across 100+ providers, including per-provider context
windows, pricing, and capabilities.

Data is cached in memory (1hr TTL) and on disk (~/.hermes/models_dev_cache.json)
to avoid cold-start network latency.
"""

import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MODELS_DEV_URL = "https://models.dev/api.json"
|
||||
_MODELS_DEV_CACHE_TTL = 3600 # 1 hour in-memory
|
||||
|
||||
# In-memory cache
|
||||
_models_dev_cache: Dict[str, Any] = {}
|
||||
_models_dev_cache_time: float = 0
|
||||
|
||||
# Provider ID mapping: Hermes provider names → models.dev provider IDs
|
||||
PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
|
||||
"openrouter": "openrouter",
|
||||
"anthropic": "anthropic",
|
||||
"zai": "zai",
|
||||
"kimi-coding": "kimi-for-coding",
|
||||
"minimax": "minimax",
|
||||
"minimax-cn": "minimax-cn",
|
||||
"deepseek": "deepseek",
|
||||
"alibaba": "alibaba",
|
||||
"copilot": "github-copilot",
|
||||
"ai-gateway": "vercel",
|
||||
"opencode-zen": "opencode",
|
||||
"opencode-go": "opencode-go",
|
||||
"kilocode": "kilo",
|
||||
}
|
||||
|
||||
|
||||
def _get_cache_path() -> Path:
|
||||
"""Return path to disk cache file."""
|
||||
env_val = os.environ.get("HERMES_HOME", "")
|
||||
hermes_home = Path(env_val) if env_val else Path.home() / ".hermes"
|
||||
return hermes_home / "models_dev_cache.json"
|
||||
|
||||
|
||||
def _load_disk_cache() -> Dict[str, Any]:
|
||||
"""Load models.dev data from disk cache."""
|
||||
try:
|
||||
cache_path = _get_cache_path()
|
||||
if cache_path.exists():
|
||||
with open(cache_path, encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
logger.debug("Failed to load models.dev disk cache: %s", e)
|
||||
return {}
|
||||
|
||||
|
||||
def _save_disk_cache(data: Dict[str, Any]) -> None:
|
||||
"""Save models.dev data to disk cache."""
|
||||
try:
|
||||
cache_path = _get_cache_path()
|
||||
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(cache_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, separators=(",", ":"))
|
||||
except Exception as e:
|
||||
logger.debug("Failed to save models.dev disk cache: %s", e)
|
||||
|
||||
|
||||
def fetch_models_dev(force_refresh: bool = False) -> Dict[str, Any]:
|
||||
"""Fetch models.dev registry. In-memory cache (1hr) + disk fallback.
|
||||
|
||||
Returns the full registry dict keyed by provider ID, or empty dict on failure.
|
||||
"""
|
||||
global _models_dev_cache, _models_dev_cache_time
|
||||
|
||||
# Check in-memory cache
|
||||
if (
|
||||
not force_refresh
|
||||
and _models_dev_cache
|
||||
and (time.time() - _models_dev_cache_time) < _MODELS_DEV_CACHE_TTL
|
||||
):
|
||||
return _models_dev_cache
|
||||
|
||||
# Try network fetch
|
||||
try:
|
||||
response = requests.get(MODELS_DEV_URL, timeout=15)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
if isinstance(data, dict) and len(data) > 0:
|
||||
_models_dev_cache = data
|
||||
_models_dev_cache_time = time.time()
|
||||
_save_disk_cache(data)
|
||||
logger.debug(
|
||||
"Fetched models.dev registry: %d providers, %d total models",
|
||||
len(data),
|
||||
sum(len(p.get("models", {})) for p in data.values() if isinstance(p, dict)),
|
||||
)
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.debug("Failed to fetch models.dev: %s", e)
|
||||
|
||||
# Fall back to disk cache
|
||||
if not _models_dev_cache:
|
||||
_models_dev_cache = _load_disk_cache()
|
||||
if _models_dev_cache:
|
||||
_models_dev_cache_time = time.time()
|
||||
logger.debug("Loaded models.dev from disk cache (%d providers)", len(_models_dev_cache))
|
||||
|
||||
return _models_dev_cache
|
||||
|
||||
|
||||
def lookup_models_dev_context(provider: str, model: str) -> Optional[int]:
|
||||
"""Look up context_length for a provider+model combo in models.dev.
|
||||
|
||||
Returns the context window in tokens, or None if not found.
|
||||
Handles case-insensitive matching and filters out context=0 entries.
|
||||
"""
|
||||
mdev_provider_id = PROVIDER_TO_MODELS_DEV.get(provider)
|
||||
if not mdev_provider_id:
|
||||
return None
|
||||
|
||||
data = fetch_models_dev()
|
||||
provider_data = data.get(mdev_provider_id)
|
||||
if not isinstance(provider_data, dict):
|
||||
return None
|
||||
|
||||
models = provider_data.get("models", {})
|
||||
if not isinstance(models, dict):
|
||||
return None
|
||||
|
||||
# Exact match
|
||||
entry = models.get(model)
|
||||
if entry:
|
||||
ctx = _extract_context(entry)
|
||||
if ctx:
|
||||
return ctx
|
||||
|
||||
# Case-insensitive match
|
||||
model_lower = model.lower()
|
||||
for mid, mdata in models.items():
|
||||
if mid.lower() == model_lower:
|
||||
ctx = _extract_context(mdata)
|
||||
if ctx:
|
||||
return ctx
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_context(entry: Dict[str, Any]) -> Optional[int]:
|
||||
"""Extract context_length from a models.dev model entry.
|
||||
|
||||
Returns None for invalid/zero values (some audio/image models have context=0).
|
||||
"""
|
||||
if not isinstance(entry, dict):
|
||||
return None
|
||||
limit = entry.get("limit")
|
||||
if not isinstance(limit, dict):
|
||||
return None
|
||||
ctx = limit.get("context")
|
||||
if isinstance(ctx, (int, float)) and ctx > 0:
|
||||
return int(ctx)
|
||||
return None
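
For orientation, the entry shape `_extract_context` reads (field names as used above; the numeric value is invented):

```python
assert _extract_context({"limit": {"context": 200000}}) == 200000   # invented value
assert _extract_context({"limit": {"context": 0}}) is None          # zero-context entries are filtered
assert _extract_context({"name": "some-model"}) is None             # no "limit" block at all
```
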

@@ -206,11 +206,11 @@ PLATFORM_HINTS = {
        "contextually appropriate."
    ),
    "cron": (
        "You are running as a scheduled cron job. Your final response is automatically "
        "delivered to the job's configured destination, so do not use send_message to "
        "send to that same target again. If you want the user to receive something in "
        "the scheduled destination, put it directly in your final response. Use "
        "send_message only for additional or different targets."
        "You are running as a scheduled cron job. There is no user present — you "
        "cannot ask questions, request clarification, or wait for follow-up. Execute "
        "the task fully and autonomously, making reasonable decisions where needed. "
        "Your final response is automatically delivered to the job's configured "
        "destination — put the primary content directly in your response."
    ),
    "cli": (
        "You are a CLI AI Agent. Try not to use markdown but simple text "

cli.py (66 lines changed)
@@ -973,6 +973,8 @@ def save_config_value(key_path: str, value: any) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# HermesCLI Class
|
||||
# ============================================================================
|
||||
@@ -1509,10 +1511,22 @@ class HermesCLI:
|
||||
partial lines and emits complete lines via _cprint to work
|
||||
reliably with prompt_toolkit's patch_stdout.
|
||||
|
||||
A ``None`` sentinel signals that an intermediate turn (with tool
|
||||
calls) just completed — close any open response/reasoning box
|
||||
and reset so the final response gets a clean frame.
|
||||
|
||||
Reasoning/thinking blocks (<REASONING_SCRATCHPAD>, <think>, etc.)
|
||||
are suppressed during streaming since they'd display raw XML tags.
|
||||
The agent strips them from the final response anyway.
|
||||
"""
|
||||
if text is None:
|
||||
# Intermediate turn ended — discard any open box and reset
|
||||
self._close_reasoning_box()
|
||||
if self._stream_box_opened:
|
||||
w = shutil.get_terminal_size().columns
|
||||
_cprint(f"{_GOLD}╰{'─' * (w - 2)}╯{_RST}")
|
||||
self._reset_stream_state()
|
||||
return
|
||||
if not text:
|
||||
return
|
||||
|
||||
@@ -1522,9 +1536,11 @@ class HermesCLI:
|
||||
# Track whether we're inside a reasoning/thinking block.
|
||||
# These tags are model-generated (system prompt tells the model
|
||||
# to use them) and get stripped from final_response. We must
|
||||
# suppress them during streaming too.
|
||||
_OPEN_TAGS = ("<REASONING_SCRATCHPAD>", "<think>", "<reasoning>", "<THINKING>")
|
||||
_CLOSE_TAGS = ("</REASONING_SCRATCHPAD>", "</think>", "</reasoning>", "</THINKING>")
|
||||
# suppress them during streaming too — unless show_reasoning is
|
||||
# enabled, in which case we route the inner content to the
|
||||
# reasoning display box instead of discarding it.
|
||||
_OPEN_TAGS = ("<REASONING_SCRATCHPAD>", "<think>", "<reasoning>", "<THINKING>", "<thinking>")
|
||||
_CLOSE_TAGS = ("</REASONING_SCRATCHPAD>", "</think>", "</reasoning>", "</THINKING>", "</thinking>")
|
||||
|
||||
# Append to a pre-filter buffer first
|
||||
self._stream_prefilt = getattr(self, "_stream_prefilt", "") + text
|
||||
@@ -1564,6 +1580,12 @@ class HermesCLI:
|
||||
idx = self._stream_prefilt.find(tag)
|
||||
if idx != -1:
|
||||
self._in_reasoning_block = False
|
||||
# When show_reasoning is on, route inner content to
|
||||
# the reasoning display box instead of discarding.
|
||||
if self.show_reasoning:
|
||||
inner = self._stream_prefilt[:idx]
|
||||
if inner:
|
||||
self._stream_reasoning_delta(inner)
|
||||
after = self._stream_prefilt[idx + len(tag):]
|
||||
self._stream_prefilt = ""
|
||||
# Process remaining text after close tag through full
|
||||
@@ -1571,10 +1593,15 @@ class HermesCLI:
|
||||
if after:
|
||||
self._stream_delta(after)
|
||||
return
|
||||
# Still inside reasoning block — keep only the tail that could
|
||||
# be a partial close tag prefix (save memory on long blocks).
|
||||
# When show_reasoning is on, stream reasoning content live
|
||||
# instead of silently accumulating. Keep only the tail that
|
||||
# could be a partial close tag prefix.
|
||||
max_tag_len = max(len(t) for t in _CLOSE_TAGS)
|
||||
if len(self._stream_prefilt) > max_tag_len:
|
||||
if self.show_reasoning:
|
||||
# Route the safe prefix to reasoning display
|
||||
safe_reasoning = self._stream_prefilt[:-max_tag_len]
|
||||
self._stream_reasoning_delta(safe_reasoning)
|
||||
self._stream_prefilt = self._stream_prefilt[-max_tag_len:]
|
||||
return
|
||||
|
||||
@@ -2731,6 +2758,7 @@ class HermesCLI:
|
||||
if self.agent:
|
||||
self.agent.session_id = self.session_id
|
||||
self.agent.session_start = self.session_start
|
||||
self.agent.reset_session_state()
|
||||
if hasattr(self.agent, "_last_flushed_db_idx"):
|
||||
self.agent._last_flushed_db_idx = 0
|
||||
if hasattr(self.agent, "_todo_store"):
|
||||
@@ -2890,6 +2918,14 @@ class HermesCLI:
|
||||
for mid, desc in curated:
|
||||
current_marker = " ← current" if (is_active and mid == self.model) else ""
|
||||
print(f" {mid}{current_marker}")
|
||||
elif p["id"] == "custom":
|
||||
from hermes_cli.models import _get_custom_base_url
|
||||
custom_url = _get_custom_base_url() or os.getenv("OPENAI_BASE_URL", "")
|
||||
if custom_url:
|
||||
print(f" endpoint: {custom_url}")
|
||||
if is_active:
|
||||
print(f" model: {self.model} ← current")
|
||||
print(f" (use /model custom:<model-name>)")
|
||||
else:
|
||||
print(f" (use /model {p['id']}:<model-name>)")
|
||||
print()
|
||||
@@ -3493,8 +3529,17 @@ class HermesCLI:
|
||||
# Parse provider:model syntax (e.g. "openrouter:anthropic/claude-sonnet-4.5")
|
||||
current_provider = self.provider or self.requested_provider or "openrouter"
|
||||
target_provider, new_model = parse_model_input(raw_input, current_provider)
|
||||
# Auto-detect provider when no explicit provider:model syntax was used
|
||||
if target_provider == current_provider:
|
||||
# Auto-detect provider when no explicit provider:model syntax was used.
|
||||
# Skip auto-detection for custom providers — the model name might
|
||||
# coincidentally match a known provider's catalog, but the user
|
||||
# intends to use it on their custom endpoint. Require explicit
|
||||
# provider:model syntax (e.g. /model openai-codex:gpt-5.2-codex)
|
||||
# to switch away from a custom endpoint.
|
||||
_base = self.base_url or ""
|
||||
is_custom = current_provider == "custom" or (
|
||||
"localhost" in _base or "127.0.0.1" in _base
|
||||
)
|
||||
if target_provider == current_provider and not is_custom:
|
||||
from hermes_cli.models import detect_provider_for_model
|
||||
detected = detect_provider_for_model(new_model, current_provider)
|
||||
if detected:
|
||||
@@ -3562,6 +3607,13 @@ class HermesCLI:
|
||||
if message:
|
||||
print(f" Reason: {message}")
|
||||
print(" Note: Model will revert on restart. Use a verified model to save to config.")
|
||||
|
||||
# Helpful hint when staying on a custom endpoint
|
||||
if is_custom and not provider_changed:
|
||||
endpoint = self.base_url or "custom endpoint"
|
||||
print(f" Endpoint: {endpoint}")
|
||||
print(f" Tip: To switch providers, use /model provider:model")
|
||||
print(f" e.g. /model openai-codex:gpt-5.2-codex")
|
||||
else:
|
||||
self._show_model_and_providers()
|
||||
elif canonical == "provider":
|
||||
|
||||
@@ -391,7 +391,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
providers_ignored=pr.get("ignore"),
|
||||
providers_order=pr.get("order"),
|
||||
provider_sort=pr.get("sort"),
|
||||
disabled_toolsets=["cronjob"],
|
||||
disabled_toolsets=["cronjob", "messaging", "clarify"],
|
||||
quiet_mode=True,
|
||||
platform="cron",
|
||||
session_id=f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}",
|
||||
|
||||
@@ -179,6 +179,11 @@ class SignalAdapter(BasePlatformAdapter):
|
||||
# Normalize account for self-message filtering
|
||||
self._account_normalized = self.account.strip()
|
||||
|
||||
# Track recently sent message timestamps to prevent echo-back loops
|
||||
# in Note to Self / self-chat mode (mirrors WhatsApp recentlySentIds)
|
||||
self._recent_sent_timestamps: set = set()
|
||||
self._max_recent_timestamps = 50
|
||||
|
||||
logger.info("Signal adapter initialized: url=%s account=%s groups=%s",
|
||||
self.http_url, _redact_phone(self.account),
|
||||
"enabled" if self.group_allow_from else "disabled")
|
||||
@@ -353,10 +358,26 @@ class SignalAdapter(BasePlatformAdapter):
|
||||
# Unwrap nested envelope if present
|
||||
envelope_data = envelope.get("envelope", envelope)
|
||||
|
||||
# Filter syncMessage envelopes (sent transcripts, read receipts, etc.)
|
||||
# signal-cli may set syncMessage to null vs omitting it, so check key existence
|
||||
# Handle syncMessage: extract "Note to Self" messages (sent to own account)
|
||||
# while still filtering other sync events (read receipts, typing, etc.)
|
||||
is_note_to_self = False
|
||||
if "syncMessage" in envelope_data:
|
||||
return
|
||||
sync_msg = envelope_data.get("syncMessage")
|
||||
if sync_msg and isinstance(sync_msg, dict):
|
||||
sent_msg = sync_msg.get("sentMessage")
|
||||
if sent_msg and isinstance(sent_msg, dict):
|
||||
dest = sent_msg.get("destinationNumber") or sent_msg.get("destination")
|
||||
sent_ts = sent_msg.get("timestamp")
|
||||
if dest == self._account_normalized:
|
||||
# Check if this is an echo of our own outbound reply
|
||||
if sent_ts and sent_ts in self._recent_sent_timestamps:
|
||||
self._recent_sent_timestamps.discard(sent_ts)
|
||||
return
|
||||
# Genuine user Note to Self — promote to dataMessage
|
||||
is_note_to_self = True
|
||||
envelope_data = {**envelope_data, "dataMessage": sent_msg}
|
||||
if not is_note_to_self:
|
||||
return
|
||||
|
||||
# Extract sender info
|
||||
sender = (
|
||||
@@ -371,8 +392,8 @@ class SignalAdapter(BasePlatformAdapter):
|
||||
logger.debug("Signal: ignoring envelope with no sender")
|
||||
return
|
||||
|
||||
# Self-message filtering — prevent reply loops
|
||||
if self._account_normalized and sender == self._account_normalized:
|
||||
# Self-message filtering — prevent reply loops (but allow Note to Self)
|
||||
if self._account_normalized and sender == self._account_normalized and not is_note_to_self:
|
||||
return
|
||||
|
||||
# Filter stories
|
||||
@@ -577,9 +598,18 @@ class SignalAdapter(BasePlatformAdapter):
|
||||
result = await self._rpc("send", params)
|
||||
|
||||
if result is not None:
|
||||
self._track_sent_timestamp(result)
|
||||
return SendResult(success=True)
|
||||
return SendResult(success=False, error="RPC send failed")
|
||||
|
||||
def _track_sent_timestamp(self, rpc_result) -> None:
|
||||
"""Record outbound message timestamp for echo-back filtering."""
|
||||
ts = rpc_result.get("timestamp") if isinstance(rpc_result, dict) else None
|
||||
if ts:
|
||||
self._recent_sent_timestamps.add(ts)
|
||||
if len(self._recent_sent_timestamps) > self._max_recent_timestamps:
|
||||
self._recent_sent_timestamps.pop()
|
||||
|
||||
async def send_typing(self, chat_id: str, metadata=None) -> None:
|
||||
"""Send a typing indicator."""
|
||||
params: Dict[str, Any] = {
|
||||
@@ -635,6 +665,7 @@ class SignalAdapter(BasePlatformAdapter):
|
||||
|
||||
result = await self._rpc("send", params)
|
||||
if result is not None:
|
||||
self._track_sent_timestamp(result)
|
||||
return SendResult(success=True)
|
||||
return SendResult(success=False, error="RPC send with attachment failed")
|
||||
|
||||
@@ -665,6 +696,7 @@ class SignalAdapter(BasePlatformAdapter):
|
||||
|
||||
result = await self._rpc("send", params)
|
||||
if result is not None:
|
||||
self._track_sent_timestamp(result)
|
||||
return SendResult(success=True)
|
||||
return SendResult(success=False, error="RPC send document failed")
|
||||
|
||||
|
||||
@@ -222,6 +222,12 @@ from gateway.platforms.base import BasePlatformAdapter, MessageEvent, MessageTyp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sentinel placed into _running_agents immediately when a session starts
|
||||
# processing, *before* any await. Prevents a second message for the same
|
||||
# session from bypassing the "already running" guard during the async gap
|
||||
# between the guard check and actual agent creation.
|
||||
_AGENT_PENDING_SENTINEL = object()
|
||||
|
||||
|
||||
def _resolve_runtime_agent_kwargs() -> dict:
|
||||
"""Resolve provider credentials for gateway-created AIAgent instances."""
|
||||
@@ -1050,6 +1056,8 @@ class GatewayRunner:
|
||||
self._running = False
|
||||
|
||||
for session_key, agent in list(self._running_agents.items()):
|
||||
if agent is _AGENT_PENDING_SENTINEL:
|
||||
continue
|
||||
try:
|
||||
agent.interrupt("Gateway shutting down")
|
||||
logger.debug("Interrupted running agent for session %s during shutdown", session_key[:20])
|
||||
@@ -1346,7 +1354,18 @@ class GatewayRunner:
|
||||
adapter._pending_messages[_quick_key] = event
|
||||
return None
|
||||
|
||||
running_agent = self._running_agents[_quick_key]
|
||||
running_agent = self._running_agents.get(_quick_key)
|
||||
if running_agent is _AGENT_PENDING_SENTINEL:
|
||||
# Agent is being set up but not ready yet.
|
||||
if event.get_command() == "stop":
|
||||
# Nothing to interrupt — agent hasn't started yet.
|
||||
return "⏳ The agent is still starting up — nothing to stop yet."
|
||||
# Queue the message so it will be picked up after the
|
||||
# agent starts.
|
||||
adapter = self.adapters.get(source.platform)
|
||||
if adapter:
|
||||
adapter._pending_messages[_quick_key] = event
|
||||
return None
|
||||
logger.debug("PRIORITY interrupt for session %s", _quick_key[:20])
|
||||
running_agent.interrupt(event.text)
|
||||
if _quick_key in self._pending_messages:
|
||||
@@ -1354,7 +1373,7 @@ class GatewayRunner:
|
||||
else:
|
||||
self._pending_messages[_quick_key] = event.text
|
||||
return None
|
||||
|
||||
|
||||
# Check for commands
|
||||
command = event.get_command()
|
||||
|
||||
@@ -1527,7 +1546,29 @@ class GatewayRunner:
|
||||
# Pending exec approvals are handled by /approve and /deny commands above.
|
||||
# No bare text matching — "yes" in normal conversation must not trigger
|
||||
# execution of a dangerous command.
|
||||
|
||||
|
||||
# ── Claim this session before any await ───────────────────────
|
||||
# Between here and _run_agent registering the real AIAgent, there
|
||||
# are numerous await points (hooks, vision enrichment, STT,
|
||||
# session hygiene compression). Without this sentinel a second
|
||||
# message arriving during any of those yields would pass the
|
||||
# "already running" guard and spin up a duplicate agent for the
|
||||
# same session — corrupting the transcript.
|
||||
self._running_agents[_quick_key] = _AGENT_PENDING_SENTINEL
|
||||
|
||||
try:
|
||||
return await self._handle_message_with_agent(event, source, _quick_key)
|
||||
finally:
|
||||
# If _run_agent replaced the sentinel with a real agent and
|
||||
# then cleaned it up, this is a no-op. If we exited early
|
||||
# (exception, command fallthrough, etc.) the sentinel must
|
||||
# not linger or the session would be permanently locked out.
|
||||
if self._running_agents.get(_quick_key) is _AGENT_PENDING_SENTINEL:
|
||||
del self._running_agents[_quick_key]
|
||||
|
||||
async def _handle_message_with_agent(self, event, source, _quick_key: str):
|
||||
"""Inner handler that runs under the _running_agents sentinel guard."""
|
||||
|
||||
# Get or create session
|
||||
session_entry = self.session_store.get_or_create_session(source)
|
||||
session_key = session_entry.session_key
|
||||
@@ -2291,8 +2332,10 @@ class GatewayRunner:
|
||||
session_entry = self.session_store.get_or_create_session(source)
|
||||
session_key = session_entry.session_key
|
||||
|
||||
if session_key in self._running_agents:
|
||||
agent = self._running_agents[session_key]
|
||||
agent = self._running_agents.get(session_key)
|
||||
if agent is _AGENT_PENDING_SENTINEL:
|
||||
return "⏳ The agent is still starting up — nothing to stop yet."
|
||||
if agent:
|
||||
agent.interrupt()
|
||||
return "⚡ Stopping the current task... The agent will finish its current step and respond."
|
||||
else:
|
||||
@@ -2380,8 +2423,14 @@ class GatewayRunner:
|
||||
lines = [
|
||||
f"🤖 **Current model:** `{current}`",
|
||||
f"**Provider:** {provider_label}",
|
||||
"",
|
||||
]
|
||||
# Show custom endpoint URL when using a custom provider
|
||||
if current_provider == "custom":
|
||||
from hermes_cli.models import _get_custom_base_url
|
||||
custom_url = _get_custom_base_url() or os.getenv("OPENAI_BASE_URL", "")
|
||||
if custom_url:
|
||||
lines.append(f"**Endpoint:** `{custom_url}`")
|
||||
lines.append("")
|
||||
curated = curated_models_for_provider(current_provider)
|
||||
if curated:
|
||||
lines.append(f"**Available models ({provider_label}):**")
|
||||
@@ -2391,7 +2440,7 @@ class GatewayRunner:
|
||||
lines.append(f"• `{mid}`{label}{marker}")
|
||||
lines.append("")
|
||||
lines.append("To change: `/model model-name`")
|
||||
lines.append("Switch provider: `/model provider:model-name`")
|
||||
lines.append("Switch provider: `/model provider-name` or `/model provider:model-name`")
|
||||
return "\n".join(lines)
|
||||
|
||||
# Parse provider:model syntax
|
||||
@@ -4479,6 +4528,26 @@ class GatewayRunner:
|
||||
except Exception as _e:
|
||||
logger.debug("agent:step hook error: %s", _e)
|
||||
|
||||
# Bridge sync status_callback → async adapter.send for context pressure
|
||||
_status_adapter = self.adapters.get(source.platform)
|
||||
_status_chat_id = source.chat_id
|
||||
_status_thread_metadata = {"thread_id": source.thread_id} if source.thread_id else None
|
||||
|
||||
def _status_callback_sync(event_type: str, message: str) -> None:
|
||||
if not _status_adapter:
|
||||
return
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
message,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.debug("status_callback error (%s): %s", event_type, _e)
|
||||
|
||||
def run_sync():
|
||||
# Pass session_key to process registry via env var so background
|
||||
# processes can be mapped back to this gateway session
|
||||
@@ -4571,6 +4640,7 @@ class GatewayRunner:
|
||||
tool_progress_callback=progress_callback if tool_progress_enabled else None,
|
||||
step_callback=_step_callback_sync if _hooks_ref.loaded_hooks else None,
|
||||
stream_delta_callback=_stream_delta_cb,
|
||||
status_callback=_status_callback_sync,
|
||||
platform=platform_key,
|
||||
honcho_session_key=session_key,
|
||||
honcho_manager=honcho_manager,
|
||||
|
||||
@@ -145,7 +145,7 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
|
||||
id="minimax",
|
||||
name="MiniMax",
|
||||
auth_type="api_key",
|
||||
inference_base_url="https://api.minimax.io/v1",
|
||||
inference_base_url="https://api.minimax.io/anthropic",
|
||||
api_key_env_vars=("MINIMAX_API_KEY",),
|
||||
base_url_env_var="MINIMAX_BASE_URL",
|
||||
),
|
||||
@@ -168,7 +168,7 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
|
||||
id="minimax-cn",
|
||||
name="MiniMax (China)",
|
||||
auth_type="api_key",
|
||||
inference_base_url="https://api.minimaxi.com/v1",
|
||||
inference_base_url="https://api.minimaxi.com/anthropic",
|
||||
api_key_env_vars=("MINIMAX_CN_API_KEY",),
|
||||
base_url_env_var="MINIMAX_CN_BASE_URL",
|
||||
),
|
||||
|
||||
@@ -670,6 +670,11 @@ OPTIONAL_ENV_VARS = {
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"HONCHO_BASE_URL": {
|
||||
"description": "Base URL for self-hosted Honcho instances (no API key needed)",
|
||||
"prompt": "Honcho base URL (e.g. http://localhost:8000)",
|
||||
"category": "tool",
|
||||
},
|
||||
|
||||
# ── Messaging platforms ──
|
||||
"TELEGRAM_BOT_TOKEN": {
|
||||
|
||||
@@ -1137,10 +1137,21 @@ def _model_flow_custom(config):
|
||||
base_url = input(f"API base URL [{current_url or 'e.g. https://api.example.com/v1'}]: ").strip()
|
||||
api_key = input(f"API key [{current_key[:8] + '...' if current_key else 'optional'}]: ").strip()
|
||||
model_name = input("Model name (e.g. gpt-4, llama-3-70b): ").strip()
|
||||
context_length_str = input("Context length in tokens [leave blank for auto-detect]: ").strip()
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
print("\nCancelled.")
|
||||
return
|
||||
|
||||
context_length = None
|
||||
if context_length_str:
|
||||
try:
|
||||
context_length = int(context_length_str.replace(",", "").replace("k", "000").replace("K", "000"))
|
||||
if context_length <= 0:
|
||||
context_length = None
|
||||
except ValueError:
|
||||
print(f"Invalid context length: {context_length_str} — will auto-detect.")
|
||||
context_length = None
|
||||
|
||||
if not base_url and not current_url:
|
||||
print("No URL provided. Cancelled.")
|
||||
return
|
||||
@@ -1203,14 +1214,14 @@ def _model_flow_custom(config):
|
||||
print("Endpoint saved. Use `/model` in chat or `hermes model` to set a model.")
|
||||
|
||||
# Auto-save to custom_providers so it appears in the menu next time
|
||||
_save_custom_provider(effective_url, effective_key, model_name or "")
|
||||
_save_custom_provider(effective_url, effective_key, model_name or "", context_length=context_length)
|
||||
|
||||
|
||||
def _save_custom_provider(base_url, api_key="", model=""):
|
||||
def _save_custom_provider(base_url, api_key="", model="", context_length=None):
|
||||
"""Save a custom endpoint to custom_providers in config.yaml.
|
||||
|
||||
Deduplicates by base_url — if the URL already exists, updates the
|
||||
model name but doesn't add a duplicate entry.
|
||||
model name and context_length but doesn't add a duplicate entry.
|
||||
Auto-generates a display name from the URL hostname.
|
||||
"""
|
||||
from hermes_cli.config import load_config, save_config
|
||||
@@ -1220,14 +1231,24 @@ def _save_custom_provider(base_url, api_key="", model=""):
|
||||
if not isinstance(providers, list):
|
||||
providers = []
|
||||
|
||||
# Check if this URL is already saved — update model if so
|
||||
# Check if this URL is already saved — update model/context_length if so
|
||||
for entry in providers:
|
||||
if isinstance(entry, dict) and entry.get("base_url", "").rstrip("/") == base_url.rstrip("/"):
|
||||
changed = False
|
||||
if model and entry.get("model") != model:
|
||||
entry["model"] = model
|
||||
changed = True
|
||||
if model and context_length:
|
||||
models_cfg = entry.get("models", {})
|
||||
if not isinstance(models_cfg, dict):
|
||||
models_cfg = {}
|
||||
models_cfg[model] = {"context_length": context_length}
|
||||
entry["models"] = models_cfg
|
||||
changed = True
|
||||
if changed:
|
||||
cfg["custom_providers"] = providers
|
||||
save_config(cfg)
|
||||
return # already saved, updated model if needed
|
||||
return # already saved, updated if needed
|
||||
|
||||
# Auto-generate a name from the URL
|
||||
import re
|
||||
@@ -1249,6 +1270,8 @@ def _save_custom_provider(base_url, api_key="", model=""):
|
||||
entry["api_key"] = api_key
|
||||
if model:
|
||||
entry["model"] = model
|
||||
if model and context_length:
|
||||
entry["models"] = {model: {"context_length": context_length}}
|
||||
|
||||
providers.append(entry)
|
||||
cfg["custom_providers"] = providers
|
||||
@@ -3721,20 +3744,20 @@ For more help on a command:
|
||||
return
|
||||
has_titles = any(s.get("title") for s in sessions)
|
||||
if has_titles:
|
||||
print(f"{'Title':<22} {'Preview':<40} {'Last Active':<13} {'ID'}")
|
||||
print("─" * 100)
|
||||
print(f"{'Title':<32} {'Preview':<40} {'Last Active':<13} {'ID'}")
|
||||
print("─" * 110)
|
||||
else:
|
||||
print(f"{'Preview':<50} {'Last Active':<13} {'Src':<6} {'ID'}")
|
||||
print("─" * 90)
|
||||
print("─" * 95)
|
||||
for s in sessions:
|
||||
last_active = _relative_time(s.get("last_active"))
|
||||
preview = s.get("preview", "")[:38] if has_titles else s.get("preview", "")[:48]
|
||||
if has_titles:
|
||||
title = (s.get("title") or "—")[:20]
|
||||
sid = s["id"][:20]
|
||||
print(f"{title:<22} {preview:<40} {last_active:<13} {sid}")
|
||||
title = (s.get("title") or "—")[:30]
|
||||
sid = s["id"]
|
||||
print(f"{title:<32} {preview:<40} {last_active:<13} {sid}")
|
||||
else:
|
||||
sid = s["id"][:20]
|
||||
sid = s["id"]
|
||||
print(f"{preview:<50} {last_active:<13} {s['source']:<6} {sid}")
|
||||
|
||||
elif action == "export":
|
||||
|
||||
@@ -389,6 +389,7 @@ def detect_provider_for_model(
|
||||
Returns ``None`` when no confident match is found.
|
||||
|
||||
Priority:
|
||||
0. Bare provider name → switch to that provider's default model
|
||||
1. Direct provider with credentials (highest)
|
||||
2. Direct provider without credentials → remap to OpenRouter slug
|
||||
3. OpenRouter catalog match
|
||||
@@ -399,6 +400,21 @@ def detect_provider_for_model(
|
||||
|
||||
name_lower = name.lower()
|
||||
|
||||
# --- Step 0: bare provider name typed as model ---
|
||||
# If someone types `/model nous` or `/model anthropic`, treat it as a
|
||||
# provider switch and pick the first model from that provider's catalog.
|
||||
# Skip "custom" and "openrouter" — custom has no model catalog, and
|
||||
# openrouter requires an explicit model name to be useful.
|
||||
resolved_provider = _PROVIDER_ALIASES.get(name_lower, name_lower)
|
||||
if resolved_provider not in {"custom", "openrouter"}:
|
||||
default_models = _PROVIDER_MODELS.get(resolved_provider, [])
|
||||
if (
|
||||
resolved_provider in _PROVIDER_LABELS
|
||||
and default_models
|
||||
and resolved_provider != normalize_provider(current_provider)
|
||||
):
|
||||
return (resolved_provider, default_models[0])
|
||||
|
||||
# Aggregators list other providers' models — never auto-switch TO them
|
||||
_AGGREGATORS = {"nous", "openrouter"}
|
||||
|
||||
|
||||
@@ -24,6 +24,18 @@ def _normalize_custom_provider_name(value: str) -> str:
|
||||
return value.strip().lower().replace(" ", "-")
|
||||
|
||||
|
||||
def _detect_api_mode_for_url(base_url: str) -> Optional[str]:
|
||||
"""Auto-detect api_mode from the resolved base URL.
|
||||
|
||||
Direct api.openai.com endpoints need the Responses API for GPT-5.x
|
||||
tool calls with reasoning (chat/completions returns 400).
|
||||
"""
|
||||
normalized = (base_url or "").strip().lower().rstrip("/")
|
||||
if "api.openai.com" in normalized and "openrouter" not in normalized:
|
||||
return "codex_responses"
|
||||
return None
|
||||
|
||||
|
||||
def _auto_detect_local_model(base_url: str) -> str:
|
||||
"""Query a local server for its model name when only one model is loaded."""
|
||||
if not base_url:
|
||||
@@ -185,7 +197,9 @@ def _resolve_named_custom_runtime(
|
||||
|
||||
return {
|
||||
"provider": "openrouter",
|
||||
"api_mode": custom_provider.get("api_mode", "chat_completions"),
|
||||
"api_mode": custom_provider.get("api_mode")
|
||||
or _detect_api_mode_for_url(base_url)
|
||||
or "chat_completions",
|
||||
"base_url": base_url,
|
||||
"api_key": api_key,
|
||||
"source": f"custom_provider:{custom_provider.get('name', requested_provider)}",
|
||||
@@ -263,7 +277,9 @@ def _resolve_openrouter_runtime(
|
||||
|
||||
return {
|
||||
"provider": "openrouter",
|
||||
"api_mode": _parse_api_mode(model_cfg.get("api_mode")) or "chat_completions",
|
||||
"api_mode": _parse_api_mode(model_cfg.get("api_mode"))
|
||||
or _detect_api_mode_for_url(base_url)
|
||||
or "chat_completions",
|
||||
"base_url": base_url,
|
||||
"api_key": api_key,
|
||||
"source": source,
|
||||
@@ -387,6 +403,12 @@ def resolve_runtime_provider(
|
||||
# (e.g. https://api.minimax.io/anthropic, https://dashscope.../anthropic)
|
||||
elif base_url.rstrip("/").endswith("/anthropic"):
|
||||
api_mode = "anthropic_messages"
|
||||
# MiniMax providers always use Anthropic Messages API.
|
||||
# Auto-correct stale /v1 URLs (from old .env or config) to /anthropic.
|
||||
elif provider in ("minimax", "minimax-cn"):
|
||||
api_mode = "anthropic_messages"
|
||||
if base_url.rstrip("/").endswith("/v1"):
|
||||
base_url = base_url.rstrip("/")[:-3] + "/anthropic"
|
||||
return {
|
||||
"provider": provider,
|
||||
"api_mode": api_mode,
|
||||
|
||||
@@ -1045,93 +1045,17 @@ def setup_model_provider(config: dict):
|
||||
print()
|
||||
print_header("Custom OpenAI-Compatible Endpoint")
|
||||
print_info("Works with any API that follows OpenAI's chat completions spec")
|
||||
print()
|
||||
|
||||
current_url = get_env_value("OPENAI_BASE_URL") or ""
|
||||
current_key = get_env_value("OPENAI_API_KEY")
|
||||
_raw_model = config.get("model", "")
|
||||
current_model = (
|
||||
_raw_model.get("default", "")
|
||||
if isinstance(_raw_model, dict)
|
||||
else (_raw_model or "")
|
||||
)
|
||||
|
||||
if current_url:
|
||||
print_info(f" Current URL: {current_url}")
|
||||
if current_key:
|
||||
print_info(f" Current key: {current_key[:8]}... (configured)")
|
||||
|
||||
base_url = prompt(
|
||||
" API base URL (e.g., https://api.example.com/v1)", current_url
|
||||
).strip()
|
||||
api_key = prompt(" API key", password=True)
|
||||
model_name = prompt(" Model name (e.g., gpt-4, claude-3-opus)", current_model)
|
||||
|
||||
if base_url:
|
||||
from hermes_cli.models import probe_api_models
|
||||
|
||||
probe = probe_api_models(api_key, base_url)
|
||||
if probe.get("used_fallback") and probe.get("resolved_base_url"):
|
||||
print_warning(
|
||||
f"Endpoint verification worked at {probe['resolved_base_url']}/models, "
|
||||
f"not the exact URL you entered. Saving the working base URL instead."
|
||||
)
|
||||
base_url = probe["resolved_base_url"]
|
||||
elif probe.get("models") is not None:
|
||||
print_success(
|
||||
f"Verified endpoint via {probe.get('probed_url')} "
|
||||
f"({len(probe.get('models') or [])} model(s) visible)"
|
||||
)
|
||||
else:
|
||||
print_warning(
|
||||
f"Could not verify this endpoint via {probe.get('probed_url')}. "
|
||||
f"Hermes will still save it."
|
||||
)
|
||||
if probe.get("suggested_base_url"):
|
||||
print_info(
|
||||
f" If this server expects /v1, try base URL: {probe['suggested_base_url']}"
|
||||
)
|
||||
|
||||
save_env_value("OPENAI_BASE_URL", base_url)
|
||||
if api_key:
|
||||
save_env_value("OPENAI_API_KEY", api_key)
|
||||
if model_name:
|
||||
_set_default_model(config, model_name)
|
||||
|
||||
try:
|
||||
from hermes_cli.auth import deactivate_provider
|
||||
|
||||
deactivate_provider()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Save provider and base_url to config.yaml so the gateway and CLI
|
||||
# both resolve the correct provider without relying on env-var heuristics.
|
||||
if base_url:
|
||||
import yaml
|
||||
|
||||
config_path = (
|
||||
Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
|
||||
/ "config.yaml"
|
||||
)
|
||||
try:
|
||||
disk_cfg = {}
|
||||
if config_path.exists():
|
||||
disk_cfg = yaml.safe_load(config_path.read_text()) or {}
|
||||
model_section = disk_cfg.get("model", {})
|
||||
if isinstance(model_section, str):
|
||||
model_section = {"default": model_section}
|
||||
model_section["provider"] = "custom"
|
||||
model_section["base_url"] = base_url.rstrip("/")
|
||||
if model_name:
|
||||
model_section["default"] = model_name
|
||||
disk_cfg["model"] = model_section
|
||||
config_path.write_text(yaml.safe_dump(disk_cfg, sort_keys=False))
|
||||
except Exception as e:
|
||||
logger.debug("Could not save provider to config.yaml: %s", e)
|
||||
|
||||
_set_model_provider(config, "custom", base_url)
|
||||
|
||||
print_success("Custom endpoint configured")
|
||||
# Reuse the shared custom endpoint flow from `hermes model`.
|
||||
# This handles: URL/key/model/context-length prompts, endpoint probing,
|
||||
# env saving, config.yaml updates, and custom_providers persistence.
|
||||
from hermes_cli.main import _model_flow_custom
|
||||
_model_flow_custom(config)
|
||||
# _model_flow_custom handles model selection, config, env vars,
|
||||
# and custom_providers. Keep selected_provider = "custom" so
|
||||
# the model selection step below is skipped (line 1631 check)
|
||||
# but vision and TTS setup still run.
|
||||
|
||||
elif provider_idx == 4: # Z.AI / GLM
|
||||
selected_provider = "zai"
|
||||
|
||||
@@ -117,11 +117,13 @@ class HonchoClientConfig:
|
||||
def from_env(cls, workspace_id: str = "hermes") -> HonchoClientConfig:
|
||||
"""Create config from environment variables (fallback)."""
|
||||
api_key = os.environ.get("HONCHO_API_KEY")
|
||||
base_url = os.environ.get("HONCHO_BASE_URL", "").strip() or None
|
||||
return cls(
|
||||
workspace_id=workspace_id,
|
||||
api_key=api_key,
|
||||
environment=os.environ.get("HONCHO_ENVIRONMENT", "production"),
|
||||
enabled=bool(api_key),
|
||||
base_url=base_url,
|
||||
enabled=bool(api_key or base_url),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@@ -171,8 +173,14 @@ class HonchoClientConfig:
|
||||
or raw.get("environment", "production")
|
||||
)
|
||||
|
||||
# Auto-enable when API key is present (unless explicitly disabled)
|
||||
# Host-level enabled wins, then root-level, then auto-enable if key exists.
|
||||
base_url = (
|
||||
raw.get("baseUrl")
|
||||
or os.environ.get("HONCHO_BASE_URL", "").strip()
|
||||
or None
|
||||
)
|
||||
|
||||
# Auto-enable when API key or base_url is present (unless explicitly disabled)
|
||||
# Host-level enabled wins, then root-level, then auto-enable if key/url exists.
|
||||
host_enabled = host_block.get("enabled")
|
||||
root_enabled = raw.get("enabled")
|
||||
if host_enabled is not None:
|
||||
@@ -180,8 +188,8 @@ class HonchoClientConfig:
|
||||
elif root_enabled is not None:
|
||||
enabled = root_enabled
|
||||
else:
|
||||
# Not explicitly set anywhere -> auto-enable if API key exists
|
||||
enabled = bool(api_key)
|
||||
# Not explicitly set anywhere -> auto-enable if API key or base_url exists
|
||||
enabled = bool(api_key or base_url)
|
||||
|
||||
# write_frequency: accept int or string
|
||||
raw_wf = (
|
||||
@@ -214,6 +222,7 @@ class HonchoClientConfig:
|
||||
workspace_id=workspace,
|
||||
api_key=api_key,
|
||||
environment=environment,
|
||||
base_url=base_url,
|
||||
peer_name=host_block.get("peerName") or raw.get("peerName"),
|
||||
ai_peer=ai_peer,
|
||||
linked_hosts=linked_hosts,
|
||||
@@ -348,11 +357,12 @@ def get_honcho_client(config: HonchoClientConfig | None = None) -> Honcho:
|
||||
if config is None:
|
||||
config = HonchoClientConfig.from_global_config()
|
||||
|
||||
if not config.api_key:
|
||||
if not config.api_key and not config.base_url:
|
||||
raise ValueError(
|
||||
"Honcho API key not found. "
|
||||
"Get your API key at https://app.honcho.dev, "
|
||||
"then run 'hermes honcho setup' or set HONCHO_API_KEY."
|
||||
"then run 'hermes honcho setup' or set HONCHO_API_KEY. "
|
||||
"For local instances, set HONCHO_BASE_URL instead."
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
3
optional-skills/mcp/DESCRIPTION.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# MCP
|
||||
|
||||
Skills for building, testing, and deploying MCP (Model Context Protocol) servers.
|
||||
299
optional-skills/mcp/fastmcp/SKILL.md
Normal file
@@ -0,0 +1,299 @@
|
||||
---
|
||||
name: fastmcp
|
||||
description: Build, test, inspect, install, and deploy MCP servers with FastMCP in Python. Use when creating a new MCP server, wrapping an API or database as MCP tools, exposing resources or prompts, or preparing a FastMCP server for Claude Code, Cursor, or HTTP deployment.
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [MCP, FastMCP, Python, Tools, Resources, Prompts, Deployment]
|
||||
homepage: https://gofastmcp.com
|
||||
related_skills: [native-mcp, mcporter]
|
||||
prerequisites:
|
||||
commands: [python3]
|
||||
---
|
||||
|
||||
# FastMCP
|
||||
|
||||
Build MCP servers in Python with FastMCP, validate them locally, install them into MCP clients, and deploy them as HTTP endpoints.
|
||||
|
||||
## When to Use
|
||||
|
||||
Use this skill when the task is to:
|
||||
|
||||
- create a new MCP server in Python
|
||||
- wrap an API, database, CLI, or file-processing workflow as MCP tools
|
||||
- expose resources or prompts in addition to tools
|
||||
- smoke-test a server with the FastMCP CLI before wiring it into Hermes or another client
|
||||
- install a server into Claude Code, Claude Desktop, Cursor, or a similar MCP client
|
||||
- prepare a FastMCP server repo for HTTP deployment
|
||||
|
||||
Use `native-mcp` when the server already exists and only needs to be connected to Hermes. Use `mcporter` when the goal is ad-hoc CLI access to an existing MCP server instead of building one.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Install FastMCP in the working environment first:
|
||||
|
||||
```bash
|
||||
pip install fastmcp
|
||||
fastmcp version
|
||||
```
|
||||
|
||||
For the API template, install `httpx` if it is not already present:
|
||||
|
||||
```bash
|
||||
pip install httpx
|
||||
```
|
||||
|
||||
## Included Files
|
||||
|
||||
### Templates
|
||||
|
||||
- `templates/api_wrapper.py` - REST API wrapper with auth header support
|
||||
- `templates/database_server.py` - read-only SQLite query server
|
||||
- `templates/file_processor.py` - text-file inspection and search server
|
||||
|
||||
### Scripts
|
||||
|
||||
- `scripts/scaffold_fastmcp.py` - copy a starter template and replace the server name placeholder
|
||||
|
||||
### References
|
||||
|
||||
- `references/fastmcp-cli.md` - FastMCP CLI workflow, installation targets, and deployment checks
|
||||
|
||||
## Workflow
|
||||
|
||||
### 1. Pick the Smallest Viable Server Shape
|
||||
|
||||
Choose the narrowest useful surface area first:
|
||||
|
||||
- API wrapper: start with 1-3 high-value endpoints, not the whole API
|
||||
- database server: expose read-only introspection and a constrained query path
|
||||
- file processor: expose deterministic operations with explicit path arguments
|
||||
- prompts/resources: add only when the client needs reusable prompt templates or discoverable documents
|
||||
|
||||
Prefer a thin server with good names, docstrings, and schemas over a large server with vague tools.
|
||||
|
||||
### 2. Scaffold from a Template
|
||||
|
||||
Copy a template directly or use the scaffold helper:
|
||||
|
||||
```bash
|
||||
python ~/.hermes/skills/mcp/fastmcp/scripts/scaffold_fastmcp.py \
|
||||
--template api_wrapper \
|
||||
--name "Acme API" \
|
||||
--output ./acme_server.py
|
||||
```
|
||||
|
||||
Available templates:
|
||||
|
||||
```bash
|
||||
python ~/.hermes/skills/mcp/fastmcp/scripts/scaffold_fastmcp.py --list
|
||||
```
|
||||
|
||||
If copying manually, replace `__SERVER_NAME__` with a real server name.
|
||||
|
||||
### 3. Implement Tools First
|
||||
|
||||
Start with `@mcp.tool` functions before adding resources or prompts.
|
||||
|
||||
Rules for tool design (a minimal sketch follows the examples below):
|
||||
|
||||
- Give every tool a concrete verb-based name
|
||||
- Write docstrings as user-facing tool descriptions
|
||||
- Keep parameters explicit and typed
|
||||
- Return structured JSON-safe data where possible
|
||||
- Validate unsafe inputs early
|
||||
- Prefer read-only behavior by default for first versions
|
||||
|
||||
Good tool examples:
|
||||
|
||||
- `get_customer`
|
||||
- `search_tickets`
|
||||
- `describe_table`
|
||||
- `summarize_text_file`
|
||||
|
||||
Weak tool examples:
|
||||
|
||||
- `run`
|
||||
- `process`
|
||||
- `do_thing`
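
A minimal sketch of these rules, assuming a hypothetical `get_customer` lookup backed by an in-memory store (the store and field names are illustrative, not part of any template):

```python
from typing import Any

from fastmcp import FastMCP

mcp = FastMCP("Acme Support")

# Illustrative in-memory store; a real server would query an API or database.
_CUSTOMERS: dict[str, dict[str, Any]] = {
    "c-1001": {"name": "Ada", "plan": "pro"},
}


@mcp.tool
def get_customer(customer_id: str) -> dict[str, Any]:
    """Fetch one customer record by ID."""
    record = _CUSTOMERS.get(customer_id)
    if record is None:
        raise ValueError(f"Unknown customer_id: {customer_id}")
    return {"customer_id": customer_id, **record}
```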
|
||||
|
||||
### 4. Add Resources and Prompts Only When They Help
|
||||
|
||||
Add `@mcp.resource` when the client benefits from fetching stable read-only content such as schemas, policy docs, or generated reports.
|
||||
|
||||
Add `@mcp.prompt` when the server should provide a reusable prompt template for a known workflow.
|
||||
|
||||
Do not turn every document into a prompt (see the sketch after this list). Prefer:
|
||||
|
||||
- tools for actions
|
||||
- resources for data/document retrieval
|
||||
- prompts for reusable LLM instructions
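
A minimal sketch combining one resource and one prompt, using the decorator forms referenced above (the `schema://orders` URI, schema text, and prompt name are illustrative assumptions):

```python
from fastmcp import FastMCP

mcp = FastMCP("Docs Server")

# Hypothetical schema text standing in for whatever stable document the client needs.
_ORDERS_SCHEMA = "orders(id INTEGER, customer_id TEXT, total REAL)"


@mcp.resource("schema://orders")
def orders_schema() -> str:
    """Expose the orders table schema as a read-only resource."""
    return _ORDERS_SCHEMA


@mcp.prompt
def triage_ticket(ticket_text: str) -> str:
    """Reusable prompt template for triaging a support ticket."""
    return f"Classify the following ticket as bug, question, or feature request:\n\n{ticket_text}"
```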
|
||||
|
||||
### 5. Test the Server Before Integrating It Anywhere
|
||||
|
||||
Use the FastMCP CLI for local validation:
|
||||
|
||||
```bash
|
||||
fastmcp inspect acme_server.py:mcp
|
||||
fastmcp list acme_server.py --json
|
||||
fastmcp call acme_server.py search_resources query=router limit=5 --json
|
||||
```
|
||||
|
||||
For fast iterative debugging, run the server locally:
|
||||
|
||||
```bash
|
||||
fastmcp run acme_server.py:mcp
|
||||
```
|
||||
|
||||
To test HTTP transport locally:
|
||||
|
||||
```bash
|
||||
fastmcp run acme_server.py:mcp --transport http --host 127.0.0.1 --port 8000
|
||||
fastmcp list http://127.0.0.1:8000/mcp --json
|
||||
fastmcp call http://127.0.0.1:8000/mcp search_resources query=router --json
|
||||
```
|
||||
|
||||
Always run at least one real `fastmcp call` against each new tool before claiming the server works.
|
||||
|
||||
### 6. Install into a Client When Local Validation Passes
|
||||
|
||||
FastMCP can register the server with supported MCP clients:
|
||||
|
||||
```bash
|
||||
fastmcp install claude-code acme_server.py
|
||||
fastmcp install claude-desktop acme_server.py
|
||||
fastmcp install cursor acme_server.py -e .
|
||||
```
|
||||
|
||||
Use `fastmcp discover` to inspect named MCP servers already configured on the machine.
|
||||
|
||||
When the goal is Hermes integration, either:
|
||||
|
||||
- configure the server in `~/.hermes/config.yaml` using the `native-mcp` skill, or
|
||||
- keep using FastMCP CLI commands during development until the interface stabilizes
|
||||
|
||||
### 7. Deploy After the Local Contract Is Stable
|
||||
|
||||
For managed hosting, Prefect Horizon is the path FastMCP documents most directly. Before deployment:
|
||||
|
||||
```bash
|
||||
fastmcp inspect acme_server.py:mcp
|
||||
```
|
||||
|
||||
Make sure the repo contains:
|
||||
|
||||
- a Python file with the FastMCP server object
|
||||
- `requirements.txt` or `pyproject.toml`
|
||||
- any environment-variable documentation needed for deployment
|
||||
|
||||
For generic HTTP hosting, validate the HTTP transport locally first, then deploy on any Python-compatible platform that can expose the server port.
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### API Wrapper Pattern
|
||||
|
||||
Use when exposing a REST or HTTP API as MCP tools.
|
||||
|
||||
Recommended first slice:
|
||||
|
||||
- one read path
|
||||
- one list/search path
|
||||
- optional health check
|
||||
|
||||
Implementation notes:
|
||||
|
||||
- keep auth in environment variables, not hardcoded
|
||||
- centralize request logic in one helper
|
||||
- surface API errors with concise context
|
||||
- normalize inconsistent upstream payloads before returning them
|
||||
|
||||
Start from `templates/api_wrapper.py`.
|
||||
|
||||
### Database Pattern
|
||||
|
||||
Use when exposing safe query and inspection capabilities.
|
||||
|
||||
Recommended first slice:
|
||||
|
||||
- `list_tables`
|
||||
- `describe_table`
|
||||
- one constrained read query tool
|
||||
|
||||
Implementation notes:
|
||||
|
||||
- default to read-only DB access
|
||||
- reject non-`SELECT` SQL in early versions
|
||||
- limit row counts
|
||||
- return rows plus column names
|
||||
|
||||
Start from `templates/database_server.py`.
|
||||
|
||||
### File Processor Pattern
|
||||
|
||||
Use when the server needs to inspect or transform files on demand.
|
||||
|
||||
Recommended first slice:
|
||||
|
||||
- summarize file contents
|
||||
- search within files
|
||||
- extract deterministic metadata
|
||||
|
||||
Implementation notes:
|
||||
|
||||
- accept explicit file paths
|
||||
- check for missing files and encoding failures
|
||||
- cap previews and result counts
|
||||
- avoid shelling out unless a specific external tool is required
|
||||
|
||||
Start from `templates/file_processor.py`.
|
||||
|
||||
## Quality Bar
|
||||
|
||||
Before handing off a FastMCP server, verify all of the following:
|
||||
|
||||
- server imports cleanly
|
||||
- `fastmcp inspect <file.py:mcp>` succeeds
|
||||
- `fastmcp list <server spec> --json` succeeds
|
||||
- every new tool has at least one real `fastmcp call`
|
||||
- environment variables are documented
|
||||
- the tool surface is small enough to understand without guesswork
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### FastMCP command missing
|
||||
|
||||
Install the package in the active environment:
|
||||
|
||||
```bash
|
||||
pip install fastmcp
|
||||
fastmcp version
|
||||
```
|
||||
|
||||
### `fastmcp inspect` fails
|
||||
|
||||
Check that:
|
||||
|
||||
- the file imports without side effects that crash
|
||||
- the FastMCP instance is named correctly in `<file.py:object>`
|
||||
- optional dependencies from the template are installed
|
||||
|
||||
### Tool works in Python but not through CLI
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
fastmcp list server.py --json
|
||||
fastmcp call server.py your_tool_name --json
|
||||
```
|
||||
|
||||
This usually exposes naming mismatches, missing required arguments, or non-serializable return values.
|
||||
|
||||
### Hermes cannot see the deployed server
|
||||
|
||||
The server-building part may be correct while the Hermes config is not. Load the `native-mcp` skill and configure the server in `~/.hermes/config.yaml`, then restart Hermes.
|
||||
|
||||
## References
|
||||
|
||||
For CLI details, install targets, and deployment checks, read `references/fastmcp-cli.md`.
|
||||
110
optional-skills/mcp/fastmcp/references/fastmcp-cli.md
Normal file
@@ -0,0 +1,110 @@
|
||||
# FastMCP CLI Reference
|
||||
|
||||
Use this file when the task needs exact FastMCP CLI workflows rather than the higher-level guidance in `SKILL.md`.
|
||||
|
||||
## Install and Verify
|
||||
|
||||
```bash
|
||||
pip install fastmcp
|
||||
fastmcp version
|
||||
```
|
||||
|
||||
FastMCP documents `pip install fastmcp` and `fastmcp version` as the baseline installation and verification path.
|
||||
|
||||
## Run a Server
|
||||
|
||||
Run a server object from a Python file:
|
||||
|
||||
```bash
|
||||
fastmcp run server.py:mcp
|
||||
```
|
||||
|
||||
Run the same server over HTTP:
|
||||
|
||||
```bash
|
||||
fastmcp run server.py:mcp --transport http --host 127.0.0.1 --port 8000
|
||||
```
|
||||
|
||||
## Inspect a Server
|
||||
|
||||
Inspect what FastMCP will expose:
|
||||
|
||||
```bash
|
||||
fastmcp inspect server.py:mcp
|
||||
```
|
||||
|
||||
This is also the check FastMCP recommends before deploying to Prefect Horizon.
|
||||
|
||||
## List and Call Tools
|
||||
|
||||
List tools from a Python file:
|
||||
|
||||
```bash
|
||||
fastmcp list server.py --json
|
||||
```
|
||||
|
||||
List tools from an HTTP endpoint:
|
||||
|
||||
```bash
|
||||
fastmcp list http://127.0.0.1:8000/mcp --json
|
||||
```
|
||||
|
||||
Call a tool with key-value arguments:
|
||||
|
||||
```bash
|
||||
fastmcp call server.py search_resources query=router limit=5 --json
|
||||
```
|
||||
|
||||
Call a tool with a full JSON input payload:
|
||||
|
||||
```bash
|
||||
fastmcp call server.py create_item '{"name": "Widget", "tags": ["sale"]}' --json
|
||||
```
|
||||
|
||||
## Discover Named MCP Servers
|
||||
|
||||
Find named servers already configured in local MCP-aware tools:
|
||||
|
||||
```bash
|
||||
fastmcp discover
|
||||
```
|
||||
|
||||
FastMCP documents name-based resolution for Claude Desktop, Claude Code, Cursor, Gemini, Goose, and `./mcp.json`.
|
||||
|
||||
## Install into MCP Clients
|
||||
|
||||
Register a server with common clients:
|
||||
|
||||
```bash
|
||||
fastmcp install claude-code server.py
|
||||
fastmcp install claude-desktop server.py
|
||||
fastmcp install cursor server.py -e .
|
||||
```
|
||||
|
||||
FastMCP notes that client installs run in isolated environments, so declare dependencies explicitly when needed with flags such as `--with`, `--env-file`, or editable installs.
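
For example, a hypothetical server that needs `httpx` and reads secrets from a local `.env` (both names are placeholders):

```bash
fastmcp install claude-code server.py --with httpx --env-file .env
```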
|
||||
|
||||
## Deployment Checks
|
||||
|
||||
### Prefect Horizon
|
||||
|
||||
Before pushing to Horizon:
|
||||
|
||||
```bash
|
||||
fastmcp inspect server.py:mcp
|
||||
```
|
||||
|
||||
FastMCP’s Horizon docs expect:
|
||||
|
||||
- a GitHub repo
|
||||
- a Python file containing the FastMCP server object
|
||||
- dependencies declared in `requirements.txt` or `pyproject.toml`
|
||||
- an entrypoint like `main.py:mcp`
|
||||
|
||||
### Generic HTTP Hosting
|
||||
|
||||
Before shipping to any other host (a combined sketch follows this checklist):
|
||||
|
||||
1. Start the server locally with HTTP transport.
|
||||
2. Verify `fastmcp list` against the local `/mcp` URL.
|
||||
3. Verify at least one `fastmcp call`.
|
||||
4. Document required environment variables.
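
Combined, those checks can be run as one sketch (the tool name and port are placeholders):

```bash
fastmcp run server.py:mcp --transport http --host 127.0.0.1 --port 8000 &
sleep 2
fastmcp list http://127.0.0.1:8000/mcp --json
fastmcp call http://127.0.0.1:8000/mcp your_tool_name --json
```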
|
||||
56
optional-skills/mcp/fastmcp/scripts/scaffold_fastmcp.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Copy a FastMCP starter template into a working file."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
SKILL_DIR = SCRIPT_DIR.parent
|
||||
TEMPLATE_DIR = SKILL_DIR / "templates"
|
||||
PLACEHOLDER = "__SERVER_NAME__"
|
||||
|
||||
|
||||
def list_templates() -> list[str]:
|
||||
return sorted(path.stem for path in TEMPLATE_DIR.glob("*.py"))
|
||||
|
||||
|
||||
def render_template(template_name: str, server_name: str) -> str:
|
||||
template_path = TEMPLATE_DIR / f"{template_name}.py"
|
||||
if not template_path.exists():
|
||||
available = ", ".join(list_templates())
|
||||
raise SystemExit(f"Unknown template '{template_name}'. Available: {available}")
|
||||
return template_path.read_text(encoding="utf-8").replace(PLACEHOLDER, server_name)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--template", help="Template name without .py suffix")
|
||||
parser.add_argument("--name", help="FastMCP server display name")
|
||||
parser.add_argument("--output", help="Destination Python file path")
|
||||
parser.add_argument("--force", action="store_true", help="Overwrite an existing output file")
|
||||
parser.add_argument("--list", action="store_true", help="List available templates and exit")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.list:
|
||||
for name in list_templates():
|
||||
print(name)
|
||||
return 0
|
||||
|
||||
if not args.template or not args.name or not args.output:
|
||||
parser.error("--template, --name, and --output are required unless --list is used")
|
||||
|
||||
output_path = Path(args.output).expanduser()
|
||||
if output_path.exists() and not args.force:
|
||||
raise SystemExit(f"Refusing to overwrite existing file: {output_path}")
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text(render_template(args.template, args.name), encoding="utf-8")
|
||||
print(f"Wrote {output_path}")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
54
optional-skills/mcp/fastmcp/templates/api_wrapper.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from fastmcp import FastMCP
|
||||
|
||||
|
||||
mcp = FastMCP("__SERVER_NAME__")
|
||||
|
||||
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.example.com")
|
||||
API_TOKEN = os.getenv("API_TOKEN")
|
||||
REQUEST_TIMEOUT = float(os.getenv("API_TIMEOUT_SECONDS", "20"))
|
||||
|
||||
|
||||
def _headers() -> dict[str, str]:
|
||||
headers = {"Accept": "application/json"}
|
||||
if API_TOKEN:
|
||||
headers["Authorization"] = f"Bearer {API_TOKEN}"
|
||||
return headers
|
||||
|
||||
|
||||
def _request(method: str, path: str, *, params: dict[str, Any] | None = None) -> Any:
|
||||
url = f"{API_BASE_URL.rstrip('/')}/{path.lstrip('/')}"
|
||||
with httpx.Client(timeout=REQUEST_TIMEOUT, headers=_headers()) as client:
|
||||
response = client.request(method, url, params=params)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def health_check() -> dict[str, Any]:
|
||||
"""Check whether the upstream API is reachable."""
|
||||
payload = _request("GET", "/health")
|
||||
return {"base_url": API_BASE_URL, "result": payload}
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def get_resource(resource_id: str) -> dict[str, Any]:
|
||||
"""Fetch one resource by ID from the upstream API."""
|
||||
payload = _request("GET", f"/resources/{resource_id}")
|
||||
return {"resource_id": resource_id, "data": payload}
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def search_resources(query: str, limit: int = 10) -> dict[str, Any]:
|
||||
"""Search upstream resources by query string."""
|
||||
payload = _request("GET", "/resources", params={"q": query, "limit": limit})
|
||||
return {"query": query, "limit": limit, "results": payload}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mcp.run()
|
||||
77
optional-skills/mcp/fastmcp/templates/database_server.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from typing import Any
|
||||
|
||||
from fastmcp import FastMCP
|
||||
|
||||
|
||||
mcp = FastMCP("__SERVER_NAME__")
|
||||
|
||||
DATABASE_PATH = os.getenv("SQLITE_PATH", "./app.db")
|
||||
MAX_ROWS = int(os.getenv("SQLITE_MAX_ROWS", "200"))
|
||||
TABLE_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
||||
|
||||
|
||||
def _connect() -> sqlite3.Connection:
|
||||
return sqlite3.connect(f"file:{DATABASE_PATH}?mode=ro", uri=True)
|
||||
|
||||
|
||||
def _reject_mutation(sql: str) -> None:
|
||||
normalized = sql.strip().lower()
|
||||
if not normalized.startswith("select"):
|
||||
raise ValueError("Only SELECT queries are allowed")
|
||||
|
||||
|
||||
def _validate_table_name(table_name: str) -> str:
|
||||
if not TABLE_NAME_RE.fullmatch(table_name):
|
||||
raise ValueError("Invalid table name")
|
||||
return table_name
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def list_tables() -> list[str]:
|
||||
"""List user-defined SQLite tables."""
|
||||
with _connect() as conn:
|
||||
rows = conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
|
||||
).fetchall()
|
||||
return [row[0] for row in rows]
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def describe_table(table_name: str) -> list[dict[str, Any]]:
|
||||
"""Describe columns for a SQLite table."""
|
||||
safe_table_name = _validate_table_name(table_name)
|
||||
with _connect() as conn:
|
||||
rows = conn.execute(f"PRAGMA table_info({safe_table_name})").fetchall()
|
||||
return [
|
||||
{
|
||||
"cid": row[0],
|
||||
"name": row[1],
|
||||
"type": row[2],
|
||||
"notnull": bool(row[3]),
|
||||
"default": row[4],
|
||||
"pk": bool(row[5]),
|
||||
}
|
||||
for row in rows
|
||||
]
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def query(sql: str, limit: int = 50) -> dict[str, Any]:
|
||||
"""Run a read-only SELECT query and return rows plus column names."""
|
||||
_reject_mutation(sql)
|
||||
safe_limit = max(0, min(limit, MAX_ROWS))
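    # Wrap the caller's SQL in a subquery so the row cap applies even when the query has no LIMIT.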
|
||||
wrapped_sql = f"SELECT * FROM ({sql.strip().rstrip(';')}) LIMIT {safe_limit}"
|
||||
with _connect() as conn:
|
||||
cursor = conn.execute(wrapped_sql)
|
||||
columns = [column[0] for column in cursor.description or []]
|
||||
rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
|
||||
return {"limit": safe_limit, "columns": columns, "rows": rows}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mcp.run()
|
||||
55
optional-skills/mcp/fastmcp/templates/file_processor.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from fastmcp import FastMCP
|
||||
|
||||
|
||||
mcp = FastMCP("__SERVER_NAME__")
|
||||
|
||||
|
||||
def _read_text(path: str) -> str:
|
||||
file_path = Path(path).expanduser()
|
||||
try:
|
||||
return file_path.read_text(encoding="utf-8")
|
||||
except FileNotFoundError as exc:
|
||||
raise ValueError(f"File not found: {file_path}") from exc
|
||||
except UnicodeDecodeError as exc:
|
||||
raise ValueError(f"File is not valid UTF-8 text: {file_path}") from exc
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def summarize_text_file(path: str, preview_chars: int = 1200) -> dict[str, int | str]:
|
||||
"""Return basic metadata and a preview for a UTF-8 text file."""
|
||||
file_path = Path(path).expanduser()
|
||||
text = _read_text(path)
|
||||
return {
|
||||
"path": str(file_path),
|
||||
"characters": len(text),
|
||||
"lines": len(text.splitlines()),
|
||||
"preview": text[:preview_chars],
|
||||
}
|
||||
|
||||
|
||||
@mcp.tool
|
||||
def search_text_file(path: str, needle: str, max_matches: int = 20) -> dict[str, Any]:
|
||||
"""Find matching lines in a UTF-8 text file."""
|
||||
file_path = Path(path).expanduser()
|
||||
matches: list[dict[str, Any]] = []
|
||||
for line_number, line in enumerate(_read_text(path).splitlines(), start=1):
|
||||
if needle.lower() in line.lower():
|
||||
matches.append({"line_number": line_number, "line": line})
|
||||
if len(matches) >= max_matches:
|
||||
break
|
||||
return {"path": str(file_path), "needle": needle, "matches": matches}
|
||||
|
||||
|
||||
@mcp.resource("file://{path}")
|
||||
def read_file_resource(path: str) -> str:
|
||||
"""Expose a text file as a resource."""
|
||||
return _read_text(path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mcp.run()
|
||||
@@ -92,7 +92,7 @@ hermes-agent = "run_agent:main"
|
||||
hermes-acp = "acp_adapter.entry:main"
|
||||
|
||||
[tool.setuptools]
|
||||
py-modules = ["run_agent", "model_tools", "toolsets", "batch_runner", "trajectory_compressor", "toolset_distributions", "cli", "hermes_constants", "hermes_state", "hermes_time", "mini_swe_runner", "rl_cli", "utils"]
|
||||
py-modules = ["run_agent", "model_tools", "toolsets", "batch_runner", "trajectory_compressor", "toolset_distributions", "cli", "hermes_constants", "hermes_state", "hermes_time", "mini_swe_runner", "minisweagent_path", "rl_cli", "utils"]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["agent", "tools", "tools.*", "hermes_cli", "gateway", "gateway.*", "cron", "honcho_integration", "acp_adapter"]
|
||||
|
||||
229
run_agent.py
@@ -400,6 +400,7 @@ class AIAgent:
|
||||
clarify_callback: callable = None,
|
||||
step_callback: callable = None,
|
||||
stream_delta_callback: callable = None,
|
||||
status_callback: callable = None,
|
||||
max_tokens: int = None,
|
||||
reasoning_config: Dict[str, Any] = None,
|
||||
prefill_messages: List[Dict[str, Any]] = None,
|
||||
@@ -501,6 +502,12 @@ class AIAgent:
|
||||
else:
|
||||
self.api_mode = "chat_completions"
|
||||
|
||||
# Direct OpenAI sessions use the Responses API path. GPT-5.x tool
|
||||
# calls with reasoning are rejected on /v1/chat/completions, and
|
||||
# Hermes is a tool-using client by default.
|
||||
if self.api_mode == "chat_completions" and self._is_direct_openai_url():
|
||||
self.api_mode = "codex_responses"
|
||||
|
||||
# Pre-warm OpenRouter model metadata cache in a background thread.
|
||||
# fetch_model_metadata() is cached for 1 hour; this avoids a blocking
|
||||
# HTTP request on the first API response when pricing is estimated.
|
||||
@@ -516,8 +523,13 @@ class AIAgent:
|
||||
self.clarify_callback = clarify_callback
|
||||
self.step_callback = step_callback
|
||||
self.stream_delta_callback = stream_delta_callback
|
||||
self.status_callback = status_callback
|
||||
self._last_reported_tool = None # Track for "new tool" mode
|
||||
|
||||
# Tool execution state — allows _vprint during tool execution
|
||||
# even when stream consumers are registered (no tokens streaming then)
|
||||
self._executing_tools = False
|
||||
|
||||
# Interrupt mechanism for breaking out of tool loops
|
||||
self._interrupt_requested = False
|
||||
self._interrupt_message = None # Optional message that triggered interrupt
|
||||
@@ -561,6 +573,12 @@ class AIAgent:
|
||||
self._budget_warning_threshold = 0.9 # 90% — urgent, respond now
|
||||
self._budget_pressure_enabled = True
|
||||
|
||||
# Context pressure warnings: notify the USER (not the LLM) as context
|
||||
# fills up. Purely informational — displayed in CLI output and sent via
|
||||
# status_callback for gateway platforms. Does NOT inject into messages.
|
||||
self._context_50_warned = False
|
||||
self._context_70_warned = False
|
||||
|
||||
# Persistent error log -- always writes WARNING+ to ~/.hermes/logs/errors.log
|
||||
# so tool failures, API errors, etc. are inspectable after the fact.
|
||||
# In gateway mode, each incoming message creates a new AIAgent instance,
|
||||
@@ -981,6 +999,27 @@ class AIAgent:
|
||||
_config_context_length = int(_config_context_length)
|
||||
except (TypeError, ValueError):
|
||||
_config_context_length = None
|
||||
|
||||
# Check custom_providers per-model context_length
|
||||
if _config_context_length is None:
|
||||
_custom_providers = _agent_cfg.get("custom_providers")
|
||||
if isinstance(_custom_providers, list):
|
||||
for _cp_entry in _custom_providers:
|
||||
if not isinstance(_cp_entry, dict):
|
||||
continue
|
||||
_cp_url = (_cp_entry.get("base_url") or "").rstrip("/")
|
||||
if _cp_url and _cp_url == self.base_url.rstrip("/"):
|
||||
_cp_models = _cp_entry.get("models", {})
|
||||
if isinstance(_cp_models, dict):
|
||||
_cp_model_cfg = _cp_models.get(self.model, {})
|
||||
if isinstance(_cp_model_cfg, dict):
|
||||
_cp_ctx = _cp_model_cfg.get("context_length")
|
||||
if _cp_ctx is not None:
|
||||
try:
|
||||
_config_context_length = int(_cp_ctx)
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
break
|
||||
|
||||
self.context_compressor = ContextCompressor(
|
||||
model=self.model,
|
||||
@@ -993,6 +1032,7 @@ class AIAgent:
|
||||
base_url=self.base_url,
|
||||
api_key=getattr(self, "api_key", ""),
|
||||
config_context_length=_config_context_length,
|
||||
provider=self.provider,
|
||||
)
|
||||
self.compression_enabled = compression_enabled
|
||||
self._user_turn_count = 0
|
||||
@@ -1016,6 +1056,46 @@ class AIAgent:
|
||||
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {self.context_compressor.threshold_tokens:,})")
|
||||
else:
|
||||
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
|
||||
|
||||
def reset_session_state(self):
|
||||
"""Reset all session-scoped token counters to 0 for a fresh session.
|
||||
|
||||
This method encapsulates the reset logic for all session-level metrics
|
||||
including:
|
||||
- Token usage counters (input, output, total, prompt, completion)
|
||||
- Cache read/write tokens
|
||||
- API call count
|
||||
- Reasoning tokens
|
||||
- Estimated cost tracking
|
||||
- Context compressor internal counters
|
||||
|
||||
The method safely handles optional attributes (e.g., context compressor)
|
||||
using ``hasattr`` checks.
|
||||
|
||||
This keeps the counter reset logic DRY and maintainable in one place
|
||||
rather than scattering it across multiple methods.
|
||||
"""
|
||||
# Token usage counters
|
||||
self.session_total_tokens = 0
|
||||
self.session_input_tokens = 0
|
||||
self.session_output_tokens = 0
|
||||
self.session_prompt_tokens = 0
|
||||
self.session_completion_tokens = 0
|
||||
self.session_cache_read_tokens = 0
|
||||
self.session_cache_write_tokens = 0
|
||||
self.session_reasoning_tokens = 0
|
||||
self.session_api_calls = 0
|
||||
self.session_estimated_cost_usd = 0.0
|
||||
self.session_cost_status = "unknown"
|
||||
self.session_cost_source = "none"
|
||||
|
||||
# Context compressor internal counters (if present)
|
||||
if hasattr(self, "context_compressor") and self.context_compressor:
|
||||
self.context_compressor.last_prompt_tokens = 0
|
||||
self.context_compressor.last_completion_tokens = 0
|
||||
self.context_compressor.last_total_tokens = 0
|
||||
self.context_compressor.compression_count = 0
|
||||
self.context_compressor._context_probed = False
|
||||
|
||||
@staticmethod
|
||||
def _safe_print(*args, **kwargs):
|
||||
@@ -1031,15 +1111,24 @@ class AIAgent:
|
||||
pass
|
||||
|
||||
def _vprint(self, *args, force: bool = False, **kwargs):
|
||||
"""Verbose print — suppressed when streaming TTS is active.
|
||||
"""Verbose print — suppressed when actively streaming tokens.
|
||||
|
||||
Pass ``force=True`` for error/warning messages that should always be
|
||||
shown even during streaming playback (TTS or display).
|
||||
|
||||
During tool execution (``_executing_tools`` is True), printing is
|
||||
allowed even with stream consumers registered because no tokens
|
||||
are being streamed at that point.
|
||||
"""
|
||||
if not force and self._has_stream_consumers():
|
||||
if not force and self._has_stream_consumers() and not self._executing_tools:
|
||||
return
|
||||
self._safe_print(*args, **kwargs)
|
||||
|
||||
def _is_direct_openai_url(self, base_url: str = None) -> bool:
|
||||
"""Return True when a base URL targets OpenAI's native API."""
|
||||
url = (base_url or self._base_url_lower).lower()
|
||||
return "api.openai.com" in url and "openrouter" not in url
|
||||
|
||||
def _max_tokens_param(self, value: int) -> dict:
|
||||
"""Return the correct max tokens kwarg for the current provider.
|
||||
|
||||
@@ -1047,11 +1136,7 @@ class AIAgent:
|
||||
'max_completion_tokens'. OpenRouter, local models, and older
|
||||
OpenAI models use 'max_tokens'.
|
||||
"""
|
||||
_is_direct_openai = (
|
||||
"api.openai.com" in self._base_url_lower
|
||||
and "openrouter" not in self._base_url_lower
|
||||
)
|
||||
if _is_direct_openai:
|
||||
if self._is_direct_openai_url():
|
||||
return {"max_completion_tokens": value}
|
||||
return {"max_tokens": value}
|
||||
|
||||
@@ -1078,10 +1163,16 @@ class AIAgent:
|
||||
return bool(cleaned.strip())
|
||||
|
||||
def _strip_think_blocks(self, content: str) -> str:
|
||||
"""Remove <think>...</think> blocks from content, returning only visible text."""
|
||||
"""Remove reasoning/thinking blocks from content, returning only visible text."""
|
||||
if not content:
|
||||
return ""
|
||||
return re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
|
||||
# Strip all reasoning tag variants: <think>, <thinking>, <THINKING>,
|
||||
# <reasoning>, <REASONING_SCRATCHPAD>
|
||||
content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
|
||||
content = re.sub(r'<thinking>.*?</thinking>', '', content, flags=re.DOTALL | re.IGNORECASE)
|
||||
content = re.sub(r'<reasoning>.*?</reasoning>', '', content, flags=re.DOTALL)
|
||||
content = re.sub(r'<REASONING_SCRATCHPAD>.*?</REASONING_SCRATCHPAD>', '', content, flags=re.DOTALL)
|
||||
return content
|
||||
|
||||
def _looks_like_codex_intermediate_ack(
|
||||
self,
|
||||
@@ -3507,13 +3598,15 @@ class AIAgent:
|
||||
fb_provider)
|
||||
return False
|
||||
|
||||
# Determine api_mode from provider
|
||||
# Determine api_mode from provider / base URL
|
||||
fb_api_mode = "chat_completions"
|
||||
fb_base_url = str(fb_client.base_url)
|
||||
if fb_provider == "openai-codex":
|
||||
fb_api_mode = "codex_responses"
|
||||
elif fb_provider == "anthropic" or fb_base_url.rstrip("/").lower().endswith("/anthropic"):
|
||||
fb_api_mode = "anthropic_messages"
|
||||
elif self._is_direct_openai_url(fb_base_url):
|
||||
fb_api_mode = "codex_responses"
|
||||
|
||||
old_model = self.model
|
||||
self.model = fb_model
|
||||
@@ -4300,6 +4393,10 @@ class AIAgent:
|
||||
except Exception as e:
|
||||
logger.debug("Session DB compression split failed: %s", e)
|
||||
|
||||
# Reset context pressure warnings — usage drops after compaction
|
||||
self._context_50_warned = False
|
||||
self._context_70_warned = False
|
||||
|
||||
return compressed, new_system_prompt
|
||||
|
||||
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
|
||||
@@ -4311,14 +4408,19 @@ class AIAgent:
|
||||
"""
|
||||
tool_calls = assistant_message.tool_calls
|
||||
|
||||
if not _should_parallelize_tool_batch(tool_calls):
|
||||
return self._execute_tool_calls_sequential(
|
||||
# Allow _vprint during tool execution even with stream consumers
|
||||
self._executing_tools = True
|
||||
try:
|
||||
if not _should_parallelize_tool_batch(tool_calls):
|
||||
return self._execute_tool_calls_sequential(
|
||||
assistant_message, messages, effective_task_id, api_call_count
|
||||
)
|
||||
|
||||
return self._execute_tool_calls_concurrent(
|
||||
assistant_message, messages, effective_task_id, api_call_count
|
||||
)
|
||||
|
||||
return self._execute_tool_calls_concurrent(
|
||||
assistant_message, messages, effective_task_id, api_call_count
|
||||
)
|
||||
finally:
|
||||
self._executing_tools = False
|
||||
|
||||
def _invoke_tool(self, function_name: str, function_args: dict, effective_task_id: str) -> str:
|
||||
"""Invoke a single tool and return the result string. No display logic.
|
||||
@@ -4735,7 +4837,7 @@ class AIAgent:
|
||||
spinner.stop(cute_msg)
|
||||
elif self.quiet_mode:
|
||||
self._vprint(f" {cute_msg}")
|
||||
elif self.quiet_mode and not self._has_stream_consumers():
|
||||
elif self.quiet_mode:
|
||||
face = random.choice(KawaiiSpinner.KAWAII_WAITING)
|
||||
emoji = _get_tool_emoji(function_name)
|
||||
preview = _build_tool_preview(function_name, function_args) or function_name
|
||||
@@ -4875,6 +4977,45 @@ class AIAgent:
|
||||
)
|
||||
return None
|
||||
|
||||
def _emit_context_pressure(self, compaction_progress: float, compressor) -> None:
|
||||
"""Notify the user that context is approaching the compaction threshold.
|
||||
|
||||
Args:
|
||||
compaction_progress: How close to compaction (0.0–1.0, where 1.0 = fires).
|
||||
compressor: The ContextCompressor instance (for threshold/context info).
|
||||
|
||||
Purely user-facing — does NOT modify the message stream.
|
||||
For CLI: prints a formatted line with a progress bar.
|
||||
For gateway: fires status_callback so the platform can send a chat message.
|
||||
"""
|
||||
from agent.display import format_context_pressure, format_context_pressure_gateway
|
||||
|
||||
threshold_pct = compressor.threshold_tokens / compressor.context_length if compressor.context_length else 0.5
|
||||
|
||||
# CLI output — always shown (these are user-facing status notifications,
|
||||
# not verbose debug output, so they bypass quiet_mode).
|
||||
# Gateway users also get the callback below.
|
||||
if self.platform in (None, "cli"):
|
||||
line = format_context_pressure(
|
||||
compaction_progress=compaction_progress,
|
||||
threshold_tokens=compressor.threshold_tokens,
|
||||
threshold_percent=threshold_pct,
|
||||
compression_enabled=self.compression_enabled,
|
||||
)
|
||||
self._safe_print(line)
|
||||
|
||||
# Gateway / external consumers
|
||||
if self.status_callback:
|
||||
try:
|
||||
msg = format_context_pressure_gateway(
|
||||
compaction_progress=compaction_progress,
|
||||
threshold_percent=threshold_pct,
|
||||
compression_enabled=self.compression_enabled,
|
||||
)
|
||||
self.status_callback("context_pressure", msg)
|
||||
except Exception:
|
||||
logger.debug("status_callback error in context pressure", exc_info=True)
|
||||
|
||||
def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
|
||||
"""Request a summary when max iterations are reached. Returns the final response text."""
|
||||
print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Requesting summary...")
|
||||
@@ -5375,14 +5516,17 @@ class AIAgent:
|
||||
self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...")
|
||||
self._vprint(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)")
|
||||
self._vprint(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}")
|
||||
elif not self._has_stream_consumers():
|
||||
# Animated thinking spinner in quiet mode (skip during streaming)
|
||||
else:
|
||||
# Animated thinking spinner in quiet mode
|
||||
face = random.choice(KawaiiSpinner.KAWAII_THINKING)
|
||||
verb = random.choice(KawaiiSpinner.THINKING_VERBS)
|
||||
if self.thinking_callback:
|
||||
# CLI TUI mode: use prompt_toolkit widget instead of raw spinner
|
||||
# (works in both streaming and non-streaming modes)
|
||||
self.thinking_callback(f"{face} {verb}...")
|
||||
else:
|
||||
elif not self._has_stream_consumers():
|
||||
# Raw KawaiiSpinner only when no streaming consumers
|
||||
# (would conflict with streamed token output)
|
||||
spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star'])
|
||||
thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type)
|
||||
thinking_spinner.start()
|
||||
@@ -6408,7 +6552,13 @@ class AIAgent:
|
||||
)
|
||||
|
||||
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
|
||||
|
||||
|
||||
# Signal streaming display that this was an intermediate turn.
|
||||
# Content may have been streamed (opening the response box) before
|
||||
# tool_calls were detected. The None sentinel tells the CLI to
|
||||
# close any open box and reset so the final response gets a clean frame.
|
||||
self._fire_stream_delta(None)
|
||||
|
||||
# If this turn has both content AND tool_calls, capture the content
|
||||
# as a fallback final response. Common pattern: model delivers its
|
||||
# answer and calls memory/skill tools as a side-effect in the same
|
||||
@@ -6447,6 +6597,23 @@ class AIAgent:
|
||||
+ _compressor.last_completion_tokens
|
||||
+ _new_chars // 3 # conservative: JSON-heavy tool results ≈ 3 chars/token
|
||||
)
|
||||
|
||||
# ── Context pressure warnings (user-facing only) ──────────
|
||||
# Notify the user (NOT the LLM) as context approaches the
|
||||
# compaction threshold. Thresholds are relative to where
|
||||
# compaction fires, not the raw context window.
|
||||
# Does not inject into messages — just prints to CLI output
|
||||
# and fires status_callback for gateway platforms.
|
||||
if _compressor.threshold_tokens > 0:
|
||||
_compaction_progress = _estimated_next_prompt / _compressor.threshold_tokens
|
||||
if _compaction_progress >= 0.85 and not self._context_70_warned:
|
||||
self._context_70_warned = True
|
||||
self._context_50_warned = True # skip first tier if we jumped past it
|
||||
self._emit_context_pressure(_compaction_progress, _compressor)
|
||||
elif _compaction_progress >= 0.60 and not self._context_50_warned:
|
||||
self._context_50_warned = True
|
||||
self._emit_context_pressure(_compaction_progress, _compressor)
|
||||
|
||||
if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
|
||||
messages, active_system_prompt = self._compress_context(
|
||||
messages, system_message,
|
||||
@@ -6532,7 +6699,21 @@ class AIAgent:
|
||||
self._response_was_previewed = True
|
||||
break
|
||||
|
||||
# No fallback -- append the empty message as-is
|
||||
# No fallback -- if reasoning_text exists, the model put its
|
||||
# entire response inside <think> tags; use that as the content.
|
||||
if reasoning_text:
|
||||
self._vprint(f"{self.log_prefix}Using reasoning as response content (model wrapped entire response in think tags).", force=True)
|
||||
final_response = reasoning_text
|
||||
empty_msg = {
|
||||
"role": "assistant",
|
||||
"content": final_response,
|
||||
"reasoning": reasoning_text,
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
messages.append(empty_msg)
|
||||
break
|
||||
|
||||
# Truly empty -- no reasoning and no content
|
||||
empty_msg = {
|
||||
"role": "assistant",
|
||||
"content": final_response,
|
||||
@@ -6540,10 +6721,10 @@ class AIAgent:
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
messages.append(empty_msg)
|
||||
|
||||
|
||||
self._cleanup_task_resources(effective_task_id)
|
||||
self._persist_session(messages, conversation_history)
|
||||
|
||||
|
||||
return {
|
||||
"final_response": final_response or None,
|
||||
"messages": messages,
|
||||
|
||||
@@ -16,7 +16,7 @@ Use this skill when a user asks about configuring Hermes, enabling features, set
- API keys: `~/.hermes/.env`
- Skills: `~/.hermes/skills/`
- Hermes install: `~/.hermes/hermes-agent/`
- Venv: `~/.hermes/hermes-agent/.venv/` (or `venv/`)
- Venv: `~/.hermes/hermes-agent/venv/`

## CLI Overview

@@ -98,7 +98,7 @@ The interactive setup wizard walks through:
Run it from terminal:
```bash
cd ~/.hermes/hermes-agent
source .venv/bin/activate
source venv/bin/activate
python -m hermes_cli.main setup
```

@@ -140,7 +140,7 @@ Voice messages from Telegram/Discord/WhatsApp/Slack/Signal are auto-transcribed

```bash
cd ~/.hermes/hermes-agent
source .venv/bin/activate # or: source venv/bin/activate
source venv/bin/activate
pip install faster-whisper
```

@@ -189,7 +189,7 @@ Hermes can reply with voice when users send voice messages.

```bash
cd ~/.hermes/hermes-agent
source .venv/bin/activate
source venv/bin/activate
python -m hermes_cli.main tools
```

@@ -217,7 +217,7 @@ Use `/reset` in the chat to start a fresh session with the new toolset. Tool cha
Some tools need extra packages:

```bash
cd ~/.hermes/hermes-agent && source .venv/bin/activate
cd ~/.hermes/hermes-agent && source venv/bin/activate

pip install faster-whisper # Local STT (voice transcription)
pip install browserbase # Browser automation

@@ -12,7 +12,7 @@ training server.

```bash
cd ~/.hermes/hermes-agent
source .venv/bin/activate
source venv/bin/activate

python environments/your_env.py process \
--env.total_steps 1 \

@@ -22,6 +22,7 @@ from unittest.mock import patch, MagicMock
from agent.model_metadata import (
CONTEXT_PROBE_TIERS,
DEFAULT_CONTEXT_LENGTHS,
_strip_provider_prefix,
estimate_tokens_rough,
estimate_messages_tokens_rough,
get_model_context_length,
@@ -105,9 +106,14 @@ class TestEstimateMessagesTokensRough:
# =========================================================================

class TestDefaultContextLengths:
def test_claude_models_200k(self):
def test_claude_models_context_lengths(self):
for key, value in DEFAULT_CONTEXT_LENGTHS.items():
if "claude" in key:
if "claude" not in key:
continue
# Claude 4.6 models have 1M context
if "4.6" in key or "4-6" in key:
assert value == 1000000, f"{key} should be 1000000"
else:
assert value == 200000, f"{key} should be 200000"

def test_gpt4_models_128k_or_1m(self):
@@ -292,6 +298,49 @@ class TestGetModelContextLength:
assert result == 200000


# =========================================================================
# _strip_provider_prefix — Ollama model:tag vs provider:model
# =========================================================================

class TestStripProviderPrefix:
def test_known_provider_prefix_is_stripped(self):
assert _strip_provider_prefix("local:my-model") == "my-model"
assert _strip_provider_prefix("openrouter:anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4"
assert _strip_provider_prefix("anthropic:claude-sonnet-4") == "claude-sonnet-4"

def test_ollama_model_tag_preserved(self):
"""Ollama model:tag format must NOT be stripped."""
assert _strip_provider_prefix("qwen3.5:27b") == "qwen3.5:27b"
assert _strip_provider_prefix("llama3.3:70b") == "llama3.3:70b"
assert _strip_provider_prefix("gemma2:9b") == "gemma2:9b"
assert _strip_provider_prefix("codellama:13b-instruct-q4_0") == "codellama:13b-instruct-q4_0"

def test_http_urls_preserved(self):
assert _strip_provider_prefix("http://example.com") == "http://example.com"
assert _strip_provider_prefix("https://example.com") == "https://example.com"

def test_no_colon_returns_unchanged(self):
assert _strip_provider_prefix("gpt-4o") == "gpt-4o"
assert _strip_provider_prefix("anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4"

@patch("agent.model_metadata.fetch_model_metadata")
def test_ollama_model_tag_not_mangled_in_context_lookup(self, mock_fetch):
"""Ensure 'qwen3.5:27b' is NOT reduced to '27b' during context length lookup.

We mock a custom endpoint that knows 'qwen3.5:27b' — the full name
must reach the endpoint metadata lookup intact.
"""
mock_fetch.return_value = {}
with patch("agent.model_metadata.fetch_endpoint_model_metadata") as mock_ep, \
patch("agent.model_metadata._is_custom_endpoint", return_value=True):
mock_ep.return_value = {"qwen3.5:27b": {"context_length": 32768}}
result = get_model_context_length(
"qwen3.5:27b",
base_url="http://localhost:11434/v1",
)
assert result == 32768


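Taken together, these assertions pin down the disambiguation rule for `_strip_provider_prefix`: a colon only marks a provider prefix when the text before it is a known provider id; Ollama `model:tag` names and URLs pass through untouched. A sketch consistent with the tests above (the `KNOWN_PROVIDERS` set here is an illustrative subset, not the module's actual registry):

```python
KNOWN_PROVIDERS = {"local", "openrouter", "anthropic", "openai"}  # illustrative subset

def strip_provider_prefix(model: str) -> str:
    # URLs and names without a colon are returned unchanged.
    if "://" in model or ":" not in model:
        return model
    prefix, rest = model.split(":", 1)
    # Only strip when the prefix is a recognized provider id. "qwen3.5:27b"
    # keeps its Ollama tag because "qwen3.5" is not a provider.
    if prefix.lower() in KNOWN_PROVIDERS and rest:
        return rest
    return model
```
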
# =========================================================================
# fetch_model_metadata — caching, TTL, slugs, failures
# =========================================================================
@@ -423,35 +472,35 @@ class TestContextProbeTiers:
for i in range(len(CONTEXT_PROBE_TIERS) - 1):
assert CONTEXT_PROBE_TIERS[i] > CONTEXT_PROBE_TIERS[i + 1]

def test_first_tier_is_2m(self):
assert CONTEXT_PROBE_TIERS[0] == 2_000_000
def test_first_tier_is_128k(self):
assert CONTEXT_PROBE_TIERS[0] == 128_000

def test_last_tier_is_32k(self):
assert CONTEXT_PROBE_TIERS[-1] == 32_000
def test_last_tier_is_8k(self):
assert CONTEXT_PROBE_TIERS[-1] == 8_000


class TestGetNextProbeTier:
def test_from_2m(self):
assert get_next_probe_tier(2_000_000) == 1_000_000

def test_from_1m(self):
assert get_next_probe_tier(1_000_000) == 512_000

def test_from_128k(self):
assert get_next_probe_tier(128_000) == 64_000

def test_from_32k_returns_none(self):
assert get_next_probe_tier(32_000) is None
def test_from_64k(self):
assert get_next_probe_tier(64_000) == 32_000

def test_from_32k(self):
assert get_next_probe_tier(32_000) == 16_000

def test_from_8k_returns_none(self):
assert get_next_probe_tier(8_000) is None

def test_from_below_min_returns_none(self):
assert get_next_probe_tier(16_000) is None
assert get_next_probe_tier(4_000) is None

def test_from_arbitrary_value(self):
assert get_next_probe_tier(300_000) == 200_000
assert get_next_probe_tier(100_000) == 64_000

def test_above_max_tier(self):
"""Value above 2M should return 2M."""
assert get_next_probe_tier(5_000_000) == 2_000_000
"""Value above 128K should return 128K."""
assert get_next_probe_tier(500_000) == 128_000

def test_zero_returns_none(self):
assert get_next_probe_tier(0) is None

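The probe-tier tests describe a simple descent: given the current guess, return the largest tier strictly below it, or None once the bottom tier has been passed. The selection rule is independent of the tier values, which is what lets the tier list change without touching the logic. A sketch of that rule with the tiers passed in explicitly (the values in the usage lines are placeholders, not necessarily the current CONTEXT_PROBE_TIERS):

```python
def next_probe_tier(current: int, tiers: list[int]) -> int | None:
    """Largest tier strictly below `current`; None once below the smallest tier.

    Assumes `tiers` is sorted descending, as CONTEXT_PROBE_TIERS is.
    """
    for tier in tiers:
        if tier < current:
            return tier
    return None

next_probe_tier(500_000, [128_000, 64_000, 32_000, 16_000, 8_000])  # -> 128_000
next_probe_tier(8_000, [128_000, 64_000, 32_000, 16_000, 8_000])    # -> None
```
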
tests/agent/test_models_dev.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""Tests for agent.models_dev — models.dev registry integration."""
|
||||
import json
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
from agent.models_dev import (
|
||||
PROVIDER_TO_MODELS_DEV,
|
||||
_extract_context,
|
||||
fetch_models_dev,
|
||||
lookup_models_dev_context,
|
||||
)
|
||||
|
||||
|
||||
SAMPLE_REGISTRY = {
|
||||
"anthropic": {
|
||||
"id": "anthropic",
|
||||
"name": "Anthropic",
|
||||
"models": {
|
||||
"claude-opus-4-6": {
|
||||
"id": "claude-opus-4-6",
|
||||
"limit": {"context": 1000000, "output": 128000},
|
||||
},
|
||||
"claude-sonnet-4-6": {
|
||||
"id": "claude-sonnet-4-6",
|
||||
"limit": {"context": 1000000, "output": 64000},
|
||||
},
|
||||
"claude-sonnet-4-0": {
|
||||
"id": "claude-sonnet-4-0",
|
||||
"limit": {"context": 200000, "output": 64000},
|
||||
},
|
||||
},
|
||||
},
|
||||
"github-copilot": {
|
||||
"id": "github-copilot",
|
||||
"name": "GitHub Copilot",
|
||||
"models": {
|
||||
"claude-opus-4.6": {
|
||||
"id": "claude-opus-4.6",
|
||||
"limit": {"context": 128000, "output": 32000},
|
||||
},
|
||||
},
|
||||
},
|
||||
"kilo": {
|
||||
"id": "kilo",
|
||||
"name": "Kilo Gateway",
|
||||
"models": {
|
||||
"anthropic/claude-sonnet-4.6": {
|
||||
"id": "anthropic/claude-sonnet-4.6",
|
||||
"limit": {"context": 1000000, "output": 128000},
|
||||
},
|
||||
},
|
||||
},
|
||||
"deepseek": {
|
||||
"id": "deepseek",
|
||||
"name": "DeepSeek",
|
||||
"models": {
|
||||
"deepseek-chat": {
|
||||
"id": "deepseek-chat",
|
||||
"limit": {"context": 128000, "output": 8192},
|
||||
},
|
||||
},
|
||||
},
|
||||
"audio-only": {
|
||||
"id": "audio-only",
|
||||
"models": {
|
||||
"tts-model": {
|
||||
"id": "tts-model",
|
||||
"limit": {"context": 0, "output": 0},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class TestProviderMapping:
|
||||
def test_all_mapped_providers_are_strings(self):
|
||||
for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items():
|
||||
assert isinstance(hermes_id, str)
|
||||
assert isinstance(mdev_id, str)
|
||||
|
||||
def test_known_providers_mapped(self):
|
||||
assert PROVIDER_TO_MODELS_DEV["anthropic"] == "anthropic"
|
||||
assert PROVIDER_TO_MODELS_DEV["copilot"] == "github-copilot"
|
||||
assert PROVIDER_TO_MODELS_DEV["kilocode"] == "kilo"
|
||||
assert PROVIDER_TO_MODELS_DEV["ai-gateway"] == "vercel"
|
||||
|
||||
def test_unmapped_provider_not_in_dict(self):
|
||||
assert "nous" not in PROVIDER_TO_MODELS_DEV
|
||||
assert "openai-codex" not in PROVIDER_TO_MODELS_DEV
|
||||
|
||||
|
||||
class TestExtractContext:
|
||||
def test_valid_entry(self):
|
||||
assert _extract_context({"limit": {"context": 128000}}) == 128000
|
||||
|
||||
def test_zero_context_returns_none(self):
|
||||
assert _extract_context({"limit": {"context": 0}}) is None
|
||||
|
||||
def test_missing_limit_returns_none(self):
|
||||
assert _extract_context({"id": "test"}) is None
|
||||
|
||||
def test_missing_context_returns_none(self):
|
||||
assert _extract_context({"limit": {"output": 8192}}) is None
|
||||
|
||||
def test_non_dict_returns_none(self):
|
||||
assert _extract_context("not a dict") is None
|
||||
|
||||
def test_float_context_coerced_to_int(self):
|
||||
assert _extract_context({"limit": {"context": 131072.0}}) == 131072
|
||||
|
||||
|
||||
class TestLookupModelsDevContext:
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_exact_match(self, mock_fetch):
|
||||
mock_fetch.return_value = SAMPLE_REGISTRY
|
||||
assert lookup_models_dev_context("anthropic", "claude-opus-4-6") == 1000000
|
||||
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_case_insensitive_match(self, mock_fetch):
|
||||
mock_fetch.return_value = SAMPLE_REGISTRY
|
||||
assert lookup_models_dev_context("anthropic", "Claude-Opus-4-6") == 1000000
|
||||
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_provider_not_mapped(self, mock_fetch):
|
||||
mock_fetch.return_value = SAMPLE_REGISTRY
|
||||
assert lookup_models_dev_context("nous", "some-model") is None
|
||||
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_model_not_found(self, mock_fetch):
|
||||
mock_fetch.return_value = SAMPLE_REGISTRY
|
||||
assert lookup_models_dev_context("anthropic", "nonexistent-model") is None
|
||||
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_provider_aware_context(self, mock_fetch):
|
||||
"""Same model, different context per provider."""
|
||||
mock_fetch.return_value = SAMPLE_REGISTRY
|
||||
# Anthropic direct: 1M
|
||||
assert lookup_models_dev_context("anthropic", "claude-opus-4-6") == 1000000
|
||||
# GitHub Copilot: only 128K for same model
|
||||
assert lookup_models_dev_context("copilot", "claude-opus-4.6") == 128000
|
||||
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_zero_context_filtered(self, mock_fetch):
|
||||
mock_fetch.return_value = SAMPLE_REGISTRY
|
||||
# audio-only is not a mapped provider, but test the filtering directly
|
||||
data = SAMPLE_REGISTRY["audio-only"]["models"]["tts-model"]
|
||||
assert _extract_context(data) is None
|
||||
|
||||
@patch("agent.models_dev.fetch_models_dev")
|
||||
def test_empty_registry(self, mock_fetch):
|
||||
mock_fetch.return_value = {}
|
||||
assert lookup_models_dev_context("anthropic", "claude-opus-4-6") is None
|
||||
|
||||
|
||||
class TestFetchModelsDev:
|
||||
@patch("agent.models_dev.requests.get")
|
||||
def test_fetch_success(self, mock_get):
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 200
|
||||
mock_resp.json.return_value = SAMPLE_REGISTRY
|
||||
mock_resp.raise_for_status = MagicMock()
|
||||
mock_get.return_value = mock_resp
|
||||
|
||||
# Clear caches
|
||||
import agent.models_dev as md
|
||||
md._models_dev_cache = {}
|
||||
md._models_dev_cache_time = 0
|
||||
|
||||
with patch.object(md, "_save_disk_cache"):
|
||||
result = fetch_models_dev(force_refresh=True)
|
||||
|
||||
assert "anthropic" in result
|
||||
assert len(result) == len(SAMPLE_REGISTRY)
|
||||
|
||||
@patch("agent.models_dev.requests.get")
|
||||
def test_fetch_failure_returns_stale_cache(self, mock_get):
|
||||
mock_get.side_effect = Exception("network error")
|
||||
|
||||
import agent.models_dev as md
|
||||
md._models_dev_cache = SAMPLE_REGISTRY
|
||||
md._models_dev_cache_time = 0 # expired
|
||||
|
||||
with patch.object(md, "_load_disk_cache", return_value=SAMPLE_REGISTRY):
|
||||
result = fetch_models_dev(force_refresh=True)
|
||||
|
||||
assert "anthropic" in result
|
||||
|
||||
@patch("agent.models_dev.requests.get")
|
||||
def test_in_memory_cache_used(self, mock_get):
|
||||
import agent.models_dev as md
|
||||
import time
|
||||
md._models_dev_cache = SAMPLE_REGISTRY
|
||||
md._models_dev_cache_time = time.time() # fresh
|
||||
|
||||
result = fetch_models_dev()
|
||||
mock_get.assert_not_called()
|
||||
assert result == SAMPLE_REGISTRY
|
||||
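
The sample registry and the lookup tests above imply a short resolution path: translate the Hermes provider id into a models.dev provider id, find the model case-insensitively under that provider, and read `limit.context`, treating zero or missing limits as unknown so audio-only entries never win. A sketch of that flow with the registry and provider map passed in (not the exact code in agent/models_dev.py):

```python
def lookup_context(provider: str, model: str,
                   registry: dict, provider_map: dict[str, str]) -> int | None:
    """Provider-aware context lookup against a models.dev-style registry."""
    mdev_id = provider_map.get(provider)
    if not mdev_id:
        return None  # unmapped providers fall back to other metadata sources
    models = registry.get(mdev_id, {}).get("models", {})
    wanted = model.lower()
    for model_id, entry in models.items():
        if model_id.lower() != wanted:
            continue
        limit = entry.get("limit") or {}
        context = limit.get("context")
        # Zero or missing context marks an unusable entry (e.g. TTS models).
        if isinstance(context, (int, float)) and context > 0:
            return int(context)
        return None
    return None
```
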
tests/gateway/test_session_race_guard.py (new file, 267 lines)
@@ -0,0 +1,267 @@
"""Tests for the session race guard that prevents concurrent agent runs.
|
||||
|
||||
The sentinel-based guard ensures that when _handle_message passes the
|
||||
"is an agent already running?" check and proceeds to the slow async
|
||||
setup path (vision enrichment, STT, hooks, session hygiene), a second
|
||||
message for the same session is correctly recognized as "already running"
|
||||
and routed through the interrupt/queue path instead of spawning a
|
||||
duplicate agent.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
||||
from gateway.platforms.base import MessageEvent, MessageType
|
||||
from gateway.run import GatewayRunner, _AGENT_PENDING_SENTINEL
|
||||
from gateway.session import SessionSource, build_session_key
|
||||
|
||||
|
||||
class _FakeAdapter:
|
||||
"""Minimal adapter stub for testing."""
|
||||
|
||||
def __init__(self):
|
||||
self._pending_messages = {}
|
||||
|
||||
async def send(self, chat_id, text, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
def _make_runner():
|
||||
runner = object.__new__(GatewayRunner)
|
||||
runner.config = GatewayConfig(
|
||||
platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")}
|
||||
)
|
||||
runner.adapters = {Platform.TELEGRAM: _FakeAdapter()}
|
||||
runner._running_agents = {}
|
||||
runner._pending_messages = {}
|
||||
runner._pending_approvals = {}
|
||||
runner._voice_mode = {}
|
||||
runner._is_user_authorized = lambda _source: True
|
||||
return runner
|
||||
|
||||
|
||||
def _make_event(text="hello", chat_id="12345"):
|
||||
source = SessionSource(
|
||||
platform=Platform.TELEGRAM, chat_id=chat_id, chat_type="dm"
|
||||
)
|
||||
return MessageEvent(text=text, message_type=MessageType.TEXT, source=source)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 1: Sentinel is placed before _handle_message_with_agent runs
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_sentinel_placed_before_agent_setup():
|
||||
"""After passing the 'not running' guard, the sentinel must be
|
||||
written into _running_agents *before* any await, so that a
|
||||
concurrent message sees the session as occupied."""
|
||||
runner = _make_runner()
|
||||
event = _make_event()
|
||||
session_key = build_session_key(event.source)
|
||||
|
||||
# Patch _handle_message_with_agent to capture state at entry
|
||||
sentinel_was_set = False
|
||||
|
||||
async def mock_inner(self_inner, ev, src, qk):
|
||||
nonlocal sentinel_was_set
|
||||
sentinel_was_set = runner._running_agents.get(qk) is _AGENT_PENDING_SENTINEL
|
||||
return "ok"
|
||||
|
||||
with patch.object(GatewayRunner, "_handle_message_with_agent", mock_inner):
|
||||
await runner._handle_message(event)
|
||||
|
||||
assert sentinel_was_set, (
|
||||
"Sentinel must be in _running_agents when _handle_message_with_agent starts"
|
||||
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 2: Sentinel is cleaned up after _handle_message_with_agent
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_sentinel_cleaned_up_after_handler_returns():
|
||||
"""If _handle_message_with_agent returns normally, the sentinel
|
||||
must be removed so the session is not permanently locked."""
|
||||
runner = _make_runner()
|
||||
event = _make_event()
|
||||
session_key = build_session_key(event.source)
|
||||
|
||||
async def mock_inner(self_inner, ev, src, qk):
|
||||
return "ok"
|
||||
|
||||
with patch.object(GatewayRunner, "_handle_message_with_agent", mock_inner):
|
||||
await runner._handle_message(event)
|
||||
|
||||
assert session_key not in runner._running_agents, (
|
||||
"Sentinel must be removed after handler completes"
|
||||
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 3: Sentinel cleaned up on exception
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_sentinel_cleaned_up_on_exception():
|
||||
"""If _handle_message_with_agent raises, the sentinel must still
|
||||
be cleaned up so the session is not permanently locked."""
|
||||
runner = _make_runner()
|
||||
event = _make_event()
|
||||
session_key = build_session_key(event.source)
|
||||
|
||||
async def mock_inner(self_inner, ev, src, qk):
|
||||
raise RuntimeError("boom")
|
||||
|
||||
with patch.object(GatewayRunner, "_handle_message_with_agent", mock_inner):
|
||||
with pytest.raises(RuntimeError, match="boom"):
|
||||
await runner._handle_message(event)
|
||||
|
||||
assert session_key not in runner._running_agents, (
|
||||
"Sentinel must be removed even if handler raises"
|
||||
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 4: Second message during sentinel sees "already running"
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_second_message_during_sentinel_queued_not_duplicate():
|
||||
"""While the sentinel is set (agent setup in progress), a second
|
||||
message for the same session must hit the 'already running' branch
|
||||
and be queued — not start a second agent."""
|
||||
runner = _make_runner()
|
||||
event1 = _make_event(text="first message")
|
||||
event2 = _make_event(text="second message")
|
||||
session_key = build_session_key(event1.source)
|
||||
|
||||
barrier = asyncio.Event()
|
||||
|
||||
async def slow_inner(self_inner, ev, src, qk):
|
||||
# Simulate slow setup — wait until test tells us to proceed
|
||||
await barrier.wait()
|
||||
return "ok"
|
||||
|
||||
with patch.object(GatewayRunner, "_handle_message_with_agent", slow_inner):
|
||||
# Start first message (will block at barrier)
|
||||
task1 = asyncio.create_task(runner._handle_message(event1))
|
||||
# Yield so task1 enters slow_inner and sentinel is set
|
||||
await asyncio.sleep(0)
|
||||
|
||||
# Verify sentinel is set
|
||||
assert runner._running_agents.get(session_key) is _AGENT_PENDING_SENTINEL
|
||||
|
||||
# Second message should see "already running" and be queued
|
||||
result2 = await runner._handle_message(event2)
|
||||
assert result2 is None, "Second message should return None (queued)"
|
||||
|
||||
# The second message should have been queued in adapter pending
|
||||
adapter = runner.adapters[Platform.TELEGRAM]
|
||||
assert session_key in adapter._pending_messages, (
|
||||
"Second message should be queued as pending"
|
||||
)
|
||||
assert adapter._pending_messages[session_key] is event2
|
||||
|
||||
# Let first message complete
|
||||
barrier.set()
|
||||
await task1
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 5: Sentinel not placed for command messages
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_command_messages_do_not_leave_sentinel():
|
||||
"""Slash commands (/help, /status, etc.) return early from
|
||||
_handle_message. They must NOT leave a sentinel behind."""
|
||||
runner = _make_runner()
|
||||
source = SessionSource(
|
||||
platform=Platform.TELEGRAM, chat_id="12345", chat_type="dm"
|
||||
)
|
||||
event = MessageEvent(
|
||||
text="/help", message_type=MessageType.TEXT, source=source
|
||||
)
|
||||
session_key = build_session_key(source)
|
||||
|
||||
# Mock the help handler to avoid needing full runner setup
|
||||
runner._handle_help_command = AsyncMock(return_value="Help text")
|
||||
# Need hooks for command emission
|
||||
runner.hooks = MagicMock()
|
||||
runner.hooks.emit = AsyncMock()
|
||||
|
||||
await runner._handle_message(event)
|
||||
|
||||
assert session_key not in runner._running_agents, (
|
||||
"Command handlers must not leave sentinel in _running_agents"
|
||||
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 6: /stop during sentinel returns helpful message
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_stop_during_sentinel_returns_message():
|
||||
"""If /stop arrives while the sentinel is set (agent still starting),
|
||||
it should return a helpful message instead of crashing or queuing."""
|
||||
runner = _make_runner()
|
||||
event1 = _make_event(text="hello")
|
||||
session_key = build_session_key(event1.source)
|
||||
|
||||
barrier = asyncio.Event()
|
||||
|
||||
async def slow_inner(self_inner, ev, src, qk):
|
||||
await barrier.wait()
|
||||
return "ok"
|
||||
|
||||
with patch.object(GatewayRunner, "_handle_message_with_agent", slow_inner):
|
||||
task1 = asyncio.create_task(runner._handle_message(event1))
|
||||
await asyncio.sleep(0)
|
||||
|
||||
# Sentinel should be set
|
||||
assert runner._running_agents.get(session_key) is _AGENT_PENDING_SENTINEL
|
||||
|
||||
# Send /stop — should get a message, not crash
|
||||
stop_event = _make_event(text="/stop")
|
||||
result = await runner._handle_message(stop_event)
|
||||
assert result is not None, "/stop during sentinel should return a message"
|
||||
assert "starting up" in result.lower()
|
||||
|
||||
# Should NOT be queued as pending
|
||||
adapter = runner.adapters[Platform.TELEGRAM]
|
||||
assert session_key not in adapter._pending_messages
|
||||
|
||||
barrier.set()
|
||||
await task1
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Test 7: Shutdown skips sentinel entries
|
||||
# ------------------------------------------------------------------
|
||||
@pytest.mark.asyncio
|
||||
async def test_shutdown_skips_sentinel():
|
||||
"""During gateway shutdown, sentinel entries in _running_agents
|
||||
should be skipped without raising AttributeError."""
|
||||
runner = _make_runner()
|
||||
session_key = "telegram:dm:99999"
|
||||
|
||||
# Simulate a sentinel in _running_agents
|
||||
runner._running_agents[session_key] = _AGENT_PENDING_SENTINEL
|
||||
|
||||
# Also add a real agent mock to verify it still gets interrupted
|
||||
real_agent = MagicMock()
|
||||
runner._running_agents["telegram:dm:88888"] = real_agent
|
||||
|
||||
runner.adapters = {} # No adapters to disconnect
|
||||
runner._running = True
|
||||
runner._shutdown_event = asyncio.Event()
|
||||
runner._exit_reason = None
|
||||
runner._shutdown_all_gateway_honcho = lambda: None
|
||||
|
||||
with patch("gateway.status.remove_pid_file"), \
|
||||
patch("gateway.status.write_runtime_status"):
|
||||
await runner.stop()
|
||||
|
||||
# Real agent should have been interrupted
|
||||
real_agent.interrupt.assert_called_once()
|
||||
# Should not have raised on the sentinel
|
||||
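
The seven tests above fix the guard's lifecycle: the sentinel is written into `_running_agents` synchronously, before the first await of the slow setup path; it is removed even when the handler raises; and any message arriving while it is present takes the already-running branch instead of spawning a second agent. Condensed, the pattern looks roughly like this (`_queue_or_interrupt` stands in for the interrupt/queue path and the /stop special case; it is not a real method name):

```python
async def _handle_message(self, event):
    key = build_session_key(event.source)
    if key in self._running_agents:
        # Agent, or its setup, is already active for this session.
        return await self._queue_or_interrupt(event, key)
    # Reserve the session before any await so a concurrent message
    # cannot slip past the check above during vision/STT/hook setup.
    self._running_agents[key] = _AGENT_PENDING_SENTINEL
    try:
        return await self._handle_message_with_agent(event, event.source, key)
    finally:
        # Release the reservation even on exceptions, but only if it is
        # still the sentinel (a real agent entry is cleaned up elsewhere).
        if self._running_agents.get(key) is _AGENT_PENDING_SENTINEL:
            self._running_agents.pop(key, None)
```
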
@@ -97,30 +97,32 @@ def test_custom_setup_clears_active_oauth_provider(tmp_path, monkeypatch):
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
|
||||
prompt_values = iter(
|
||||
[
|
||||
"https://custom.example/v1",
|
||||
"custom-api-key",
|
||||
"custom/model",
|
||||
]
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.setup.prompt",
|
||||
lambda *args, **kwargs: next(prompt_values),
|
||||
)
|
||||
# _model_flow_custom uses builtins.input (URL, key, model, context_length)
|
||||
input_values = iter([
|
||||
"https://custom.example/v1",
|
||||
"custom-api-key",
|
||||
"custom/model",
|
||||
"", # context_length (blank = auto-detect)
|
||||
])
|
||||
monkeypatch.setattr("builtins.input", lambda _prompt="": next(input_values))
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_yes_no", lambda *args, **kwargs: False)
|
||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||
monkeypatch.setattr("hermes_cli.main._save_custom_provider", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.models.probe_api_models",
|
||||
lambda api_key, base_url: {"models": ["m"], "probed_url": base_url + "/models"},
|
||||
)
|
||||
|
||||
setup_model_provider(config)
|
||||
save_config(config)
|
||||
|
||||
reloaded = load_config()
|
||||
|
||||
# Core assertion: switching to custom endpoint clears OAuth provider
|
||||
assert get_active_provider() is None
|
||||
assert isinstance(reloaded["model"], dict)
|
||||
assert reloaded["model"]["provider"] == "custom"
|
||||
assert reloaded["model"]["base_url"] == "https://custom.example/v1"
|
||||
assert reloaded["model"]["default"] == "custom/model"
|
||||
|
||||
# _model_flow_custom writes config via its own load/save cycle
|
||||
reloaded = load_config()
|
||||
if isinstance(reloaded.get("model"), dict):
|
||||
assert reloaded["model"].get("provider") == "custom"
|
||||
assert reloaded["model"].get("default") == "custom/model"
|
||||
|
||||
|
||||
def test_codex_setup_uses_runtime_access_token_for_live_model_list(tmp_path, monkeypatch):
|
||||
|
||||
@@ -99,21 +99,21 @@ def test_setup_custom_endpoint_saves_working_v1_base_url(tmp_path, monkeypatch):
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
def fake_prompt(message, current=None, **kwargs):
|
||||
if "API base URL" in message:
|
||||
return "http://localhost:8000"
|
||||
if "API key" in message:
|
||||
return "local-key"
|
||||
if "Model name" in message:
|
||||
return "llm"
|
||||
return ""
|
||||
# _model_flow_custom uses builtins.input (URL, key, model, context_length)
|
||||
input_values = iter([
|
||||
"http://localhost:8000",
|
||||
"local-key",
|
||||
"llm",
|
||||
"", # context_length (blank = auto-detect)
|
||||
])
|
||||
monkeypatch.setattr("builtins.input", lambda _prompt="": next(input_values))
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt", fake_prompt)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_yes_no", lambda *args, **kwargs: False)
|
||||
monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
|
||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||
monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: [])
|
||||
monkeypatch.setattr("hermes_cli.main._save_custom_provider", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.models.probe_api_models",
|
||||
lambda api_key, base_url: {
|
||||
@@ -126,16 +126,19 @@ def test_setup_custom_endpoint_saves_working_v1_base_url(tmp_path, monkeypatch):
|
||||
)
|
||||
|
||||
setup_model_provider(config)
|
||||
save_config(config)
|
||||
|
||||
env = _read_env(tmp_path)
|
||||
reloaded = load_config()
|
||||
|
||||
# _model_flow_custom saves env vars and config to disk
|
||||
assert env.get("OPENAI_BASE_URL") == "http://localhost:8000/v1"
|
||||
assert env.get("OPENAI_API_KEY") == "local-key"
|
||||
assert reloaded["model"]["provider"] == "custom"
|
||||
assert reloaded["model"]["base_url"] == "http://localhost:8000/v1"
|
||||
assert reloaded["model"]["default"] == "llm"
|
||||
|
||||
# The model config is saved as a dict by _model_flow_custom
|
||||
reloaded = load_config()
|
||||
model_cfg = reloaded.get("model", {})
|
||||
if isinstance(model_cfg, dict):
|
||||
assert model_cfg.get("provider") == "custom"
|
||||
assert model_cfg.get("default") == "llm"
|
||||
|
||||
|
||||
def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tmp_path, monkeypatch):
|
||||
|
||||
@@ -60,6 +60,21 @@ class TestFromEnv:
|
||||
config = HonchoClientConfig.from_env(workspace_id="custom")
|
||||
assert config.workspace_id == "custom"
|
||||
|
||||
def test_reads_base_url_from_env(self):
|
||||
with patch.dict(os.environ, {"HONCHO_BASE_URL": "http://localhost:8000"}, clear=False):
|
||||
config = HonchoClientConfig.from_env()
|
||||
assert config.base_url == "http://localhost:8000"
|
||||
assert config.enabled is True
|
||||
|
||||
def test_enabled_without_api_key_when_base_url_set(self):
|
||||
"""base_url alone (no API key) is sufficient to enable a local instance."""
|
||||
with patch.dict(os.environ, {"HONCHO_BASE_URL": "http://localhost:8000"}, clear=False):
|
||||
os.environ.pop("HONCHO_API_KEY", None)
|
||||
config = HonchoClientConfig.from_env()
|
||||
assert config.api_key is None
|
||||
assert config.base_url == "http://localhost:8000"
|
||||
assert config.enabled is True
|
||||
|
||||
|
||||
class TestFromGlobalConfig:
|
||||
def test_missing_config_falls_back_to_env(self, tmp_path):
|
||||
@@ -188,6 +203,36 @@ class TestFromGlobalConfig:
|
||||
config = HonchoClientConfig.from_global_config(config_path=config_file)
|
||||
assert config.api_key == "env-key"
|
||||
|
||||
def test_base_url_env_fallback(self, tmp_path):
|
||||
"""HONCHO_BASE_URL env var is used when no baseUrl in config JSON."""
|
||||
config_file = tmp_path / "config.json"
|
||||
config_file.write_text(json.dumps({"workspace": "local"}))
|
||||
|
||||
with patch.dict(os.environ, {"HONCHO_BASE_URL": "http://localhost:8000"}, clear=False):
|
||||
config = HonchoClientConfig.from_global_config(config_path=config_file)
|
||||
assert config.base_url == "http://localhost:8000"
|
||||
assert config.enabled is True
|
||||
|
||||
def test_base_url_from_config_root(self, tmp_path):
|
||||
"""baseUrl in config root is read and takes precedence over env var."""
|
||||
config_file = tmp_path / "config.json"
|
||||
config_file.write_text(json.dumps({"baseUrl": "http://config-host:9000"}))
|
||||
|
||||
with patch.dict(os.environ, {"HONCHO_BASE_URL": "http://localhost:8000"}, clear=False):
|
||||
config = HonchoClientConfig.from_global_config(config_path=config_file)
|
||||
assert config.base_url == "http://config-host:9000"
|
||||
|
||||
def test_base_url_not_read_from_host_block(self, tmp_path):
|
||||
"""baseUrl is a root-level connection setting, not overridable per-host (consistent with apiKey)."""
|
||||
config_file = tmp_path / "config.json"
|
||||
config_file.write_text(json.dumps({
|
||||
"baseUrl": "http://root:9000",
|
||||
"hosts": {"hermes": {"baseUrl": "http://host-block:9001"}},
|
||||
}))
|
||||
|
||||
config = HonchoClientConfig.from_global_config(config_path=config_file)
|
||||
assert config.base_url == "http://root:9000"
|
||||
|
||||
|
||||
class TestResolveSessionName:
|
||||
def test_manual_override(self):
|
||||
|
||||
@@ -92,8 +92,8 @@ class TestProviderRegistry:
assert PROVIDER_REGISTRY["copilot-acp"].inference_base_url == "acp://copilot"
assert PROVIDER_REGISTRY["zai"].inference_base_url == "https://api.z.ai/api/paas/v4"
assert PROVIDER_REGISTRY["kimi-coding"].inference_base_url == "https://api.moonshot.ai/v1"
assert PROVIDER_REGISTRY["minimax"].inference_base_url == "https://api.minimax.io/v1"
assert PROVIDER_REGISTRY["minimax-cn"].inference_base_url == "https://api.minimaxi.com/v1"
assert PROVIDER_REGISTRY["minimax"].inference_base_url == "https://api.minimax.io/anthropic"
assert PROVIDER_REGISTRY["minimax-cn"].inference_base_url == "https://api.minimaxi.com/anthropic"
assert PROVIDER_REGISTRY["ai-gateway"].inference_base_url == "https://ai-gateway.vercel.sh/v1"
assert PROVIDER_REGISTRY["kilocode"].inference_base_url == "https://api.kilo.ai/api/gateway"

@@ -399,14 +399,14 @@ class TestResolveApiKeyProviderCredentials:
creds = resolve_api_key_provider_credentials("minimax")
assert creds["provider"] == "minimax"
assert creds["api_key"] == "mm-secret-key"
assert creds["base_url"] == "https://api.minimax.io/v1"
assert creds["base_url"] == "https://api.minimax.io/anthropic"

def test_resolve_minimax_cn_with_key(self, monkeypatch):
monkeypatch.setenv("MINIMAX_CN_API_KEY", "mmcn-secret-key")
creds = resolve_api_key_provider_credentials("minimax-cn")
assert creds["provider"] == "minimax-cn"
assert creds["api_key"] == "mmcn-secret-key"
assert creds["base_url"] == "https://api.minimaxi.com/v1"
assert creds["base_url"] == "https://api.minimaxi.com/anthropic"

def test_resolve_ai_gateway_with_key(self, monkeypatch):
monkeypatch.setenv("AI_GATEWAY_API_KEY", "gw-secret-key")

@@ -42,6 +42,7 @@ def _make_cli(env_overrides=None, config_overrides=None, **kwargs):
|
||||
"prompt_toolkit.key_binding": MagicMock(),
|
||||
"prompt_toolkit.completion": MagicMock(),
|
||||
"prompt_toolkit.formatted_text": MagicMock(),
|
||||
"prompt_toolkit.auto_suggest": MagicMock(),
|
||||
}
|
||||
with patch.dict(sys.modules, prompt_toolkit_stubs), \
|
||||
patch.dict("os.environ", clean_env, clear=False):
|
||||
|
||||
@@ -12,6 +12,17 @@ from hermes_state import SessionDB
|
||||
from tools.todo_tool import TodoStore
|
||||
|
||||
|
||||
class _FakeCompressor:
|
||||
"""Minimal stand-in for ContextCompressor."""
|
||||
|
||||
def __init__(self):
|
||||
self.last_prompt_tokens = 500
|
||||
self.last_completion_tokens = 200
|
||||
self.last_total_tokens = 700
|
||||
self.compression_count = 3
|
||||
self._context_probed = True
|
||||
|
||||
|
||||
class _FakeAgent:
|
||||
def __init__(self, session_id: str, session_start):
|
||||
self.session_id = session_id
|
||||
@@ -25,6 +36,42 @@ class _FakeAgent:
|
||||
self.flush_memories = MagicMock()
|
||||
self._invalidate_system_prompt = MagicMock()
|
||||
|
||||
# Token counters (non-zero to verify reset)
|
||||
self.session_total_tokens = 1000
|
||||
self.session_input_tokens = 600
|
||||
self.session_output_tokens = 400
|
||||
self.session_prompt_tokens = 550
|
||||
self.session_completion_tokens = 350
|
||||
self.session_cache_read_tokens = 100
|
||||
self.session_cache_write_tokens = 50
|
||||
self.session_reasoning_tokens = 80
|
||||
self.session_api_calls = 5
|
||||
self.session_estimated_cost_usd = 0.42
|
||||
self.session_cost_status = "estimated"
|
||||
self.session_cost_source = "openrouter"
|
||||
self.context_compressor = _FakeCompressor()
|
||||
|
||||
def reset_session_state(self):
|
||||
"""Mirror the real AIAgent.reset_session_state()."""
|
||||
self.session_total_tokens = 0
|
||||
self.session_input_tokens = 0
|
||||
self.session_output_tokens = 0
|
||||
self.session_prompt_tokens = 0
|
||||
self.session_completion_tokens = 0
|
||||
self.session_cache_read_tokens = 0
|
||||
self.session_cache_write_tokens = 0
|
||||
self.session_reasoning_tokens = 0
|
||||
self.session_api_calls = 0
|
||||
self.session_estimated_cost_usd = 0.0
|
||||
self.session_cost_status = "unknown"
|
||||
self.session_cost_source = "none"
|
||||
if hasattr(self, "context_compressor") and self.context_compressor:
|
||||
self.context_compressor.last_prompt_tokens = 0
|
||||
self.context_compressor.last_completion_tokens = 0
|
||||
self.context_compressor.last_total_tokens = 0
|
||||
self.context_compressor.compression_count = 0
|
||||
self.context_compressor._context_probed = False
|
||||
|
||||
|
||||
def _make_cli(env_overrides=None, config_overrides=None, **kwargs):
|
||||
"""Create a HermesCLI instance with minimal mocking."""
|
||||
@@ -58,6 +105,7 @@ def _make_cli(env_overrides=None, config_overrides=None, **kwargs):
|
||||
"prompt_toolkit.key_binding": MagicMock(),
|
||||
"prompt_toolkit.completion": MagicMock(),
|
||||
"prompt_toolkit.formatted_text": MagicMock(),
|
||||
"prompt_toolkit.auto_suggest": MagicMock(),
|
||||
}
|
||||
with patch.dict(sys.modules, prompt_toolkit_stubs), patch.dict(
|
||||
"os.environ", clean_env, clear=False
|
||||
@@ -137,3 +185,38 @@ def test_clear_command_starts_new_session_before_redrawing(tmp_path):
|
||||
cli.console.clear.assert_called_once()
|
||||
cli.show_banner.assert_called_once()
|
||||
assert cli.conversation_history == []
|
||||
|
||||
|
||||
def test_new_session_resets_token_counters(tmp_path):
|
||||
"""Regression test for #2099: /new must zero all token counters."""
|
||||
cli = _prepare_cli_with_active_session(tmp_path)
|
||||
|
||||
# Verify counters are non-zero before reset
|
||||
agent = cli.agent
|
||||
assert agent.session_total_tokens > 0
|
||||
assert agent.session_api_calls > 0
|
||||
assert agent.context_compressor.compression_count > 0
|
||||
|
||||
cli.process_command("/new")
|
||||
|
||||
# All agent token counters must be zero
|
||||
assert agent.session_total_tokens == 0
|
||||
assert agent.session_input_tokens == 0
|
||||
assert agent.session_output_tokens == 0
|
||||
assert agent.session_prompt_tokens == 0
|
||||
assert agent.session_completion_tokens == 0
|
||||
assert agent.session_cache_read_tokens == 0
|
||||
assert agent.session_cache_write_tokens == 0
|
||||
assert agent.session_reasoning_tokens == 0
|
||||
assert agent.session_api_calls == 0
|
||||
assert agent.session_estimated_cost_usd == 0.0
|
||||
assert agent.session_cost_status == "unknown"
|
||||
assert agent.session_cost_source == "none"
|
||||
|
||||
# Context compressor counters must also be zero
|
||||
comp = agent.context_compressor
|
||||
assert comp.last_prompt_tokens == 0
|
||||
assert comp.last_completion_tokens == 0
|
||||
assert comp.last_total_tokens == 0
|
||||
assert comp.compression_count == 0
|
||||
assert comp._context_probed is False
|
||||
|
||||
@@ -459,7 +459,7 @@ def test_model_flow_custom_saves_verified_v1_base_url(monkeypatch, capsys):
)
monkeypatch.setattr("hermes_cli.config.save_config", lambda cfg: None)

answers = iter(["http://localhost:8000", "local-key", "llm"])
answers = iter(["http://localhost:8000", "local-key", "llm", ""])
monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers))

hermes_main._model_flow_custom({})

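The extra empty string in the new `answers` iterator corresponds to the fourth prompt `_model_flow_custom` now issues via `builtins.input`: an optional context length, where a blank answer means auto-detect. Parsing that answer amounts to something like the following (hypothetical helper; the real handling lives inside hermes_cli.main):

```python
def parse_context_length(raw: str) -> int | None:
    """Blank input defers to auto-detection; otherwise expect a positive integer."""
    raw = raw.strip().replace(",", "").replace("_", "")
    if not raw:
        return None  # auto-detect
    try:
        value = int(raw)
    except ValueError:
        return None  # assumption: unparseable input is treated like blank
    return value if value > 0 else None
```
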
tests/test_context_pressure.py (new file, 249 lines)
@@ -0,0 +1,249 @@
"""Tests for context pressure warnings (user-facing, not injected into messages).
|
||||
|
||||
Covers:
|
||||
- Display formatting (CLI and gateway variants)
|
||||
- Flag tracking and threshold logic on AIAgent
|
||||
- Flag reset after compression
|
||||
- status_callback invocation
|
||||
"""
|
||||
|
||||
import json
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.display import format_context_pressure, format_context_pressure_gateway
|
||||
from run_agent import AIAgent
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Display formatting tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFormatContextPressure:
|
||||
"""CLI context pressure display (agent/display.py).
|
||||
|
||||
The bar shows progress toward the compaction threshold, not the
|
||||
raw context window. 60% = 60% of the way to compaction.
|
||||
"""
|
||||
|
||||
def test_60_percent_uses_info_icon(self):
|
||||
line = format_context_pressure(0.60, 100_000, 0.50)
|
||||
assert "◐" in line
|
||||
assert "60% to compaction" in line
|
||||
|
||||
def test_85_percent_uses_warning_icon(self):
|
||||
line = format_context_pressure(0.85, 100_000, 0.50)
|
||||
assert "⚠" in line
|
||||
assert "85% to compaction" in line
|
||||
|
||||
def test_bar_length_scales_with_progress(self):
|
||||
line_60 = format_context_pressure(0.60, 100_000, 0.50)
|
||||
line_85 = format_context_pressure(0.85, 100_000, 0.50)
|
||||
assert line_85.count("▰") > line_60.count("▰")
|
||||
|
||||
def test_shows_threshold_tokens(self):
|
||||
line = format_context_pressure(0.60, 100_000, 0.50)
|
||||
assert "100k" in line
|
||||
|
||||
def test_small_threshold(self):
|
||||
line = format_context_pressure(0.60, 500, 0.50)
|
||||
assert "500" in line
|
||||
|
||||
def test_shows_threshold_percent(self):
|
||||
line = format_context_pressure(0.85, 100_000, 0.50)
|
||||
assert "50%" in line # threshold percent shown
|
||||
|
||||
def test_imminent_hint_at_85(self):
|
||||
line = format_context_pressure(0.85, 100_000, 0.50)
|
||||
assert "compaction imminent" in line
|
||||
|
||||
def test_approaching_hint_below_85(self):
|
||||
line = format_context_pressure(0.60, 100_000, 0.80)
|
||||
assert "approaching compaction" in line
|
||||
|
||||
def test_no_compaction_when_disabled(self):
|
||||
line = format_context_pressure(0.85, 100_000, 0.50, compression_enabled=False)
|
||||
assert "no auto-compaction" in line
|
||||
|
||||
def test_returns_string(self):
|
||||
result = format_context_pressure(0.65, 128_000, 0.50)
|
||||
assert isinstance(result, str)
|
||||
|
||||
def test_over_100_percent_capped(self):
|
||||
"""Progress > 1.0 should not break the bar."""
|
||||
line = format_context_pressure(1.05, 100_000, 0.50)
|
||||
assert "▰" in line
|
||||
assert line.count("▰") == 20
|
||||
|
||||
|
||||
class TestFormatContextPressureGateway:
|
||||
"""Gateway (plain text) context pressure display."""
|
||||
|
||||
def test_60_percent_informational(self):
|
||||
msg = format_context_pressure_gateway(0.60, 0.50)
|
||||
assert "60% to compaction" in msg
|
||||
assert "50%" in msg # threshold shown
|
||||
|
||||
def test_85_percent_warning(self):
|
||||
msg = format_context_pressure_gateway(0.85, 0.50)
|
||||
assert "85% to compaction" in msg
|
||||
assert "imminent" in msg
|
||||
|
||||
def test_no_compaction_warning(self):
|
||||
msg = format_context_pressure_gateway(0.85, 0.50, compression_enabled=False)
|
||||
assert "disabled" in msg
|
||||
|
||||
def test_no_ansi_codes(self):
|
||||
msg = format_context_pressure_gateway(0.85, 0.50)
|
||||
assert "\033[" not in msg
|
||||
|
||||
def test_has_progress_bar(self):
|
||||
msg = format_context_pressure_gateway(0.85, 0.50)
|
||||
assert "▰" in msg
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AIAgent context pressure flag tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_tool_defs(*names):
|
||||
return [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": n,
|
||||
"description": f"{n} tool",
|
||||
"parameters": {"type": "object", "properties": {}},
|
||||
},
|
||||
}
|
||||
for n in names
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def agent():
|
||||
"""Minimal AIAgent with mocked internals."""
|
||||
with (
|
||||
patch("run_agent.get_tool_definitions", return_value=_make_tool_defs("web_search")),
|
||||
patch("run_agent.check_toolset_requirements", return_value={}),
|
||||
patch("run_agent.OpenAI"),
|
||||
):
|
||||
a = AIAgent(
|
||||
api_key="test-key-1234567890",
|
||||
quiet_mode=True,
|
||||
skip_context_files=True,
|
||||
skip_memory=True,
|
||||
)
|
||||
a.client = MagicMock()
|
||||
return a
|
||||
|
||||
|
||||
class TestContextPressureFlags:
|
||||
"""Context pressure warning flag tracking on AIAgent."""
|
||||
|
||||
def test_flags_initialized_false(self, agent):
|
||||
assert agent._context_50_warned is False
|
||||
assert agent._context_70_warned is False
|
||||
|
||||
def test_emit_calls_status_callback(self, agent):
|
||||
"""status_callback should be invoked with event type and message."""
|
||||
cb = MagicMock()
|
||||
agent.status_callback = cb
|
||||
|
||||
compressor = MagicMock()
|
||||
compressor.context_length = 200_000
|
||||
compressor.threshold_tokens = 100_000 # 50%
|
||||
|
||||
agent._emit_context_pressure(0.85, compressor)
|
||||
|
||||
cb.assert_called_once()
|
||||
args = cb.call_args[0]
|
||||
assert args[0] == "context_pressure"
|
||||
assert "85% to compaction" in args[1]
|
||||
|
||||
def test_emit_no_callback_no_crash(self, agent):
|
||||
"""No status_callback set — should not crash."""
|
||||
agent.status_callback = None
|
||||
|
||||
compressor = MagicMock()
|
||||
compressor.context_length = 200_000
|
||||
compressor.threshold_tokens = 100_000
|
||||
|
||||
# Should not raise
|
||||
agent._emit_context_pressure(0.60, compressor)
|
||||
|
||||
def test_emit_prints_for_cli_platform(self, agent, capsys):
|
||||
"""CLI platform should always print context pressure, even in quiet_mode."""
|
||||
agent.quiet_mode = True
|
||||
agent.platform = "cli"
|
||||
agent.status_callback = None
|
||||
|
||||
compressor = MagicMock()
|
||||
compressor.context_length = 200_000
|
||||
compressor.threshold_tokens = 100_000
|
||||
|
||||
agent._emit_context_pressure(0.85, compressor)
|
||||
captured = capsys.readouterr()
|
||||
assert "▰" in captured.out
|
||||
assert "to compaction" in captured.out
|
||||
|
||||
def test_emit_skips_print_for_gateway_platform(self, agent, capsys):
|
||||
"""Gateway platforms get the callback, not CLI print."""
|
||||
agent.platform = "telegram"
|
||||
agent.status_callback = None
|
||||
|
||||
compressor = MagicMock()
|
||||
compressor.context_length = 200_000
|
||||
compressor.threshold_tokens = 100_000
|
||||
|
||||
agent._emit_context_pressure(0.85, compressor)
|
||||
captured = capsys.readouterr()
|
||||
assert "▰" not in captured.out
|
||||
|
||||
def test_flags_reset_on_compression(self, agent):
|
||||
"""After _compress_context, context pressure flags should reset."""
|
||||
agent._context_50_warned = True
|
||||
agent._context_70_warned = True
|
||||
agent.compression_enabled = True
|
||||
|
||||
# Mock the compressor's compress method to return minimal valid output
|
||||
agent.context_compressor = MagicMock()
|
||||
agent.context_compressor.compress.return_value = [
|
||||
{"role": "user", "content": "Summary of conversation so far."}
|
||||
]
|
||||
agent.context_compressor.context_length = 200_000
|
||||
agent.context_compressor.threshold_tokens = 100_000
|
||||
|
||||
# Mock _todo_store
|
||||
agent._todo_store = MagicMock()
|
||||
agent._todo_store.format_for_injection.return_value = None
|
||||
|
||||
# Mock _build_system_prompt
|
||||
agent._build_system_prompt = MagicMock(return_value="system prompt")
|
||||
agent._cached_system_prompt = "old system prompt"
|
||||
agent._session_db = None
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "hello"},
|
||||
{"role": "assistant", "content": "hi there"},
|
||||
]
|
||||
agent._compress_context(messages, "system prompt")
|
||||
|
||||
assert agent._context_50_warned is False
|
||||
assert agent._context_70_warned is False
|
||||
|
||||
def test_emit_callback_error_handled(self, agent):
|
||||
"""If status_callback raises, it should be caught gracefully."""
|
||||
cb = MagicMock(side_effect=RuntimeError("callback boom"))
|
||||
agent.status_callback = cb
|
||||
|
||||
compressor = MagicMock()
|
||||
compressor.context_length = 200_000
|
||||
compressor.threshold_tokens = 100_000
|
||||
|
||||
# Should not raise
|
||||
agent._emit_context_pressure(0.85, compressor)
|
||||
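
The display tests above constrain the CLI formatter fairly tightly: a 20-segment ▰/▱ bar clamped at 100%, ◐ below the 85% tier and ⚠ at or above it, the progress and configured threshold percent spelled out, a compact token count, and distinct hints for imminent, approaching, and disabled compaction. A plain-text sketch that satisfies those assertions (the real format_context_pressure in agent/display.py also applies the ANSI colors defined alongside it):

```python
def format_context_pressure_plain(progress: float, threshold_tokens: int,
                                  threshold_percent: float,
                                  compression_enabled: bool = True) -> str:
    capped = min(progress, 1.0)                      # >100% must not break the bar
    filled = round(capped * 20)
    bar = "▰" * filled + "▱" * (20 - filled)
    icon = "⚠" if progress >= 0.85 else "◐"
    tokens = f"{threshold_tokens // 1000}k" if threshold_tokens >= 1000 else str(threshold_tokens)
    if not compression_enabled:
        hint = "no auto-compaction (compression disabled)"
    elif progress >= 0.85:
        hint = "compaction imminent"
    else:
        hint = "approaching compaction"
    return (f"{icon} {bar} {progress:.0%} to compaction "
            f"(threshold {threshold_percent:.0%} of context, ~{tokens} tokens) · {hint}")
```
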
tests/test_model_metadata_local_ctx.py (new file, 493 lines)
@@ -0,0 +1,493 @@
"""Tests for _query_local_context_length and the local server fallback in
|
||||
get_model_context_length.
|
||||
|
||||
All tests use synthetic inputs — no filesystem or live server required.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _query_local_context_length — unit tests with mocked httpx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestQueryLocalContextLengthOllama:
|
||||
"""_query_local_context_length with server_type == 'ollama'."""
|
||||
|
||||
def _make_resp(self, status_code, body):
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json.return_value = body
|
||||
return resp
|
||||
|
||||
def test_ollama_model_info_context_length(self):
|
||||
"""Reads context length from model_info dict in /api/show response."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
show_resp = self._make_resp(200, {
|
||||
"model_info": {"llama.context_length": 131072}
|
||||
})
|
||||
models_resp = self._make_resp(404, {})
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = show_resp
|
||||
client_mock.get.return_value = models_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("omnicoder-9b", "http://localhost:11434/v1")
|
||||
|
||||
assert result == 131072
|
||||
|
||||
def test_ollama_parameters_num_ctx(self):
|
||||
"""Falls back to num_ctx in parameters string when model_info lacks context_length."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
show_resp = self._make_resp(200, {
|
||||
"model_info": {},
|
||||
"parameters": "num_ctx 32768\ntemperature 0.7\n"
|
||||
})
|
||||
models_resp = self._make_resp(404, {})
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = show_resp
|
||||
client_mock.get.return_value = models_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("some-model", "http://localhost:11434/v1")
|
||||
|
||||
assert result == 32768
|
||||
|
||||
def test_ollama_show_404_falls_through(self):
|
||||
"""When /api/show returns 404, falls through to /v1/models/{model}."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
show_resp = self._make_resp(404, {})
|
||||
model_detail_resp = self._make_resp(200, {"max_model_len": 65536})
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = show_resp
|
||||
client_mock.get.return_value = model_detail_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("some-model", "http://localhost:11434/v1")
|
||||
|
||||
assert result == 65536
|
||||
|
||||
|
||||
class TestQueryLocalContextLengthVllm:
|
||||
"""_query_local_context_length with vLLM-style /v1/models/{model} response."""
|
||||
|
||||
def _make_resp(self, status_code, body):
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json.return_value = body
|
||||
return resp
|
||||
|
||||
def test_vllm_max_model_len(self):
|
||||
"""Reads max_model_len from /v1/models/{model} response."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
detail_resp = self._make_resp(200, {"id": "omnicoder-9b", "max_model_len": 100000})
|
||||
list_resp = self._make_resp(404, {})
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = self._make_resp(404, {})
|
||||
client_mock.get.return_value = detail_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="vllm"), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("omnicoder-9b", "http://localhost:8000/v1")
|
||||
|
||||
assert result == 100000
|
||||
|
||||
def test_vllm_context_length_key(self):
|
||||
"""Reads context_length from /v1/models/{model} response."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
detail_resp = self._make_resp(200, {"id": "some-model", "context_length": 32768})
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = self._make_resp(404, {})
|
||||
client_mock.get.return_value = detail_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="vllm"), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("some-model", "http://localhost:8000/v1")
|
||||
|
||||
assert result == 32768
|
||||
|
||||
|
||||
class TestQueryLocalContextLengthModelsList:
|
||||
"""_query_local_context_length: falls back to /v1/models list."""
|
||||
|
||||
def _make_resp(self, status_code, body):
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json.return_value = body
|
||||
return resp
|
||||
|
||||
def test_models_list_max_model_len(self):
|
||||
"""Finds context length for model in /v1/models list."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
detail_resp = self._make_resp(404, {})
|
||||
list_resp = self._make_resp(200, {
|
||||
"data": [
|
||||
{"id": "other-model", "max_model_len": 4096},
|
||||
{"id": "omnicoder-9b", "max_model_len": 131072},
|
||||
]
|
||||
})
|
||||
|
||||
call_count = [0]
|
||||
def side_effect(url, **kwargs):
|
||||
call_count[0] += 1
|
||||
if call_count[0] == 1:
|
||||
return detail_resp # /v1/models/omnicoder-9b
|
||||
return list_resp # /v1/models
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = self._make_resp(404, {})
|
||||
client_mock.get.side_effect = side_effect
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value=None), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("omnicoder-9b", "http://localhost:1234")
|
||||
|
||||
assert result == 131072
|
||||
|
||||
def test_models_list_model_not_found_returns_none(self):
|
||||
"""Returns None when model is not in the /v1/models list."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
detail_resp = self._make_resp(404, {})
|
||||
list_resp = self._make_resp(200, {
|
||||
"data": [{"id": "other-model", "max_model_len": 4096}]
|
||||
})
|
||||
|
||||
call_count = [0]
|
||||
def side_effect(url, **kwargs):
|
||||
call_count[0] += 1
|
||||
if call_count[0] == 1:
|
||||
return detail_resp
|
||||
return list_resp
|
||||
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = self._make_resp(404, {})
|
||||
client_mock.get.side_effect = side_effect
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value=None), \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
result = _query_local_context_length("omnicoder-9b", "http://localhost:1234")
|
||||
|
||||
assert result is None
|
||||
|
||||
|
||||
class TestQueryLocalContextLengthLmStudio:
    """_query_local_context_length with LM Studio native /api/v1/models response."""

    def _make_resp(self, status_code, body):
        resp = MagicMock()
        resp.status_code = status_code
        resp.json.return_value = body
        return resp

    def _make_client(self, native_resp, detail_resp, list_resp):
        """Build a mock httpx.Client with sequenced GET responses."""
        client_mock = MagicMock()
        client_mock.__enter__ = lambda s: client_mock
        client_mock.__exit__ = MagicMock(return_value=False)
        client_mock.post.return_value = self._make_resp(404, {})

        responses = [native_resp, detail_resp, list_resp]
        call_idx = [0]

        def get_side_effect(url, **kwargs):
            idx = call_idx[0]
            call_idx[0] += 1
            if idx < len(responses):
                return responses[idx]
            return self._make_resp(404, {})

        client_mock.get.side_effect = get_side_effect
        return client_mock

    def test_lmstudio_exact_key_match(self):
        """Reads max_context_length when key matches exactly."""
        from agent.model_metadata import _query_local_context_length

        native_resp = self._make_resp(200, {
            "models": [
                {"key": "nvidia/nvidia-nemotron-super-49b-v1", "id": "nvidia/nvidia-nemotron-super-49b-v1",
                 "max_context_length": 131072},
            ]
        })
        client_mock = self._make_client(
            native_resp,
            self._make_resp(404, {}),
            self._make_resp(404, {}),
        )

        with patch("agent.model_metadata.detect_local_server_type", return_value="lm-studio"), \
             patch("httpx.Client", return_value=client_mock):
            result = _query_local_context_length(
                "nvidia/nvidia-nemotron-super-49b-v1", "http://192.168.1.22:1234/v1"
            )

        assert result == 131072

    def test_lmstudio_slug_only_matches_key_with_publisher_prefix(self):
        """Fuzzy match: bare model slug matches key that includes publisher prefix.

        When the user configures the model as "local:nvidia-nemotron-super-49b-v1"
        (slug only, no publisher), but LM Studio's native API stores it as
        "nvidia/nvidia-nemotron-super-49b-v1", the lookup must still succeed.
        """
        from agent.model_metadata import _query_local_context_length

        native_resp = self._make_resp(200, {
            "models": [
                {"key": "nvidia/nvidia-nemotron-super-49b-v1",
                 "id": "nvidia/nvidia-nemotron-super-49b-v1",
                 "max_context_length": 131072},
            ]
        })
        client_mock = self._make_client(
            native_resp,
            self._make_resp(404, {}),
            self._make_resp(404, {}),
        )

        with patch("agent.model_metadata.detect_local_server_type", return_value="lm-studio"), \
             patch("httpx.Client", return_value=client_mock):
            # Model passed in is just the slug after stripping "local:" prefix
            result = _query_local_context_length(
                "nvidia-nemotron-super-49b-v1", "http://192.168.1.22:1234/v1"
            )

        assert result == 131072

    def test_lmstudio_v1_models_list_slug_fuzzy_match(self):
        """Fuzzy match also works for /v1/models list when exact match fails.

        LM Studio's OpenAI-compat /v1/models returns id like
        "nvidia/nvidia-nemotron-super-49b-v1" — must match bare slug.
        """
        from agent.model_metadata import _query_local_context_length

        # native /api/v1/models: no match
        native_resp = self._make_resp(404, {})
        # /v1/models/{model}: no match
        detail_resp = self._make_resp(404, {})
        # /v1/models list: model found with publisher prefix, includes context_length
        list_resp = self._make_resp(200, {
            "data": [
                {"id": "nvidia/nvidia-nemotron-super-49b-v1", "context_length": 131072},
            ]
        })
        client_mock = self._make_client(native_resp, detail_resp, list_resp)

        with patch("agent.model_metadata.detect_local_server_type", return_value="lm-studio"), \
             patch("httpx.Client", return_value=client_mock):
            result = _query_local_context_length(
                "nvidia-nemotron-super-49b-v1", "http://192.168.1.22:1234/v1"
            )

        assert result == 131072

    def test_lmstudio_loaded_instances_context_length(self):
        """Reads active context_length from loaded_instances when max_context_length absent."""
        from agent.model_metadata import _query_local_context_length

        native_resp = self._make_resp(200, {
            "models": [
                {
                    "key": "nvidia/nvidia-nemotron-super-49b-v1",
                    "id": "nvidia/nvidia-nemotron-super-49b-v1",
                    "loaded_instances": [
                        {"config": {"context_length": 65536}},
                    ],
                },
            ]
        })
        client_mock = self._make_client(
            native_resp,
            self._make_resp(404, {}),
            self._make_resp(404, {}),
        )

        with patch("agent.model_metadata.detect_local_server_type", return_value="lm-studio"), \
             patch("httpx.Client", return_value=client_mock):
            result = _query_local_context_length(
                "nvidia-nemotron-super-49b-v1", "http://192.168.1.22:1234/v1"
            )

        assert result == 65536

    def test_lmstudio_loaded_instance_beats_max_context_length(self):
        """loaded_instances context_length takes priority over max_context_length.

        LM Studio may show max_context_length=1_048_576 (theoretical model max)
        while the actual loaded context is 122_651 (runtime setting). The loaded
        value is the real constraint and must be preferred.
        """
        from agent.model_metadata import _query_local_context_length

        native_resp = self._make_resp(200, {
            "models": [
                {
                    "key": "nvidia/nvidia-nemotron-3-nano-4b",
                    "id": "nvidia/nvidia-nemotron-3-nano-4b",
                    "max_context_length": 1_048_576,
                    "loaded_instances": [
                        {"config": {"context_length": 122_651}},
                    ],
                },
            ]
        })
        client_mock = self._make_client(
            native_resp,
            self._make_resp(404, {}),
            self._make_resp(404, {}),
        )

        with patch("agent.model_metadata.detect_local_server_type", return_value="lm-studio"), \
             patch("httpx.Client", return_value=client_mock):
            result = _query_local_context_length(
                "nvidia-nemotron-3-nano-4b", "http://192.168.1.22:1234/v1"
            )

        assert result == 122_651, (
            f"Expected loaded instance context (122651) but got {result}. "
            "max_context_length (1048576) must not win over loaded_instances."
        )


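# Illustrative aside (not part of the diff): a minimal sketch of the priority rule the
# LM Studio tests above pin down — a loaded instance's runtime context_length must beat
# the model's theoretical max_context_length. The function name and shape are hypothetical;
# the real logic lives in agent/model_metadata.py.
def _pick_context_for_entry(entry: dict) -> int | None:
    for inst in entry.get("loaded_instances") or []:
        loaded = (inst.get("config") or {}).get("context_length")
        if loaded:
            return loaded  # the actually-loaded context is the real constraint
    return entry.get("max_context_length")

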
class TestQueryLocalContextLengthNetworkError:
    """_query_local_context_length handles network failures gracefully."""

    def test_connection_error_returns_none(self):
        """Returns None when the server is unreachable."""
        from agent.model_metadata import _query_local_context_length

        client_mock = MagicMock()
        client_mock.__enter__ = lambda s: client_mock
        client_mock.__exit__ = MagicMock(return_value=False)
        client_mock.post.side_effect = Exception("Connection refused")
        client_mock.get.side_effect = Exception("Connection refused")

        with patch("agent.model_metadata.detect_local_server_type", return_value=None), \
             patch("httpx.Client", return_value=client_mock):
            result = _query_local_context_length("omnicoder-9b", "http://localhost:11434/v1")

        assert result is None


# ---------------------------------------------------------------------------
# get_model_context_length — integration-style tests with mocked helpers
# ---------------------------------------------------------------------------

class TestGetModelContextLengthLocalFallback:
    """get_model_context_length uses local server query before falling back to 2M."""

    def test_local_endpoint_unknown_model_queries_server(self):
        """Unknown model on local endpoint gets ctx from server, not 2M default."""
        from agent.model_metadata import get_model_context_length

        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.is_local_endpoint", return_value=True), \
             patch("agent.model_metadata._query_local_context_length", return_value=131072), \
             patch("agent.model_metadata.save_context_length") as mock_save:
            result = get_model_context_length("omnicoder-9b", "http://localhost:11434/v1")

        assert result == 131072

    def test_local_endpoint_unknown_model_result_is_cached(self):
        """Context length returned from local server is persisted to cache."""
        from agent.model_metadata import get_model_context_length

        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.is_local_endpoint", return_value=True), \
             patch("agent.model_metadata._query_local_context_length", return_value=131072), \
             patch("agent.model_metadata.save_context_length") as mock_save:
            get_model_context_length("omnicoder-9b", "http://localhost:11434/v1")

        mock_save.assert_called_once_with("omnicoder-9b", "http://localhost:11434/v1", 131072)

    def test_local_endpoint_server_returns_none_falls_back_to_2m(self):
        """When local server returns None, still falls back to 2M probe tier."""
        from agent.model_metadata import get_model_context_length, CONTEXT_PROBE_TIERS

        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.is_local_endpoint", return_value=True), \
             patch("agent.model_metadata._query_local_context_length", return_value=None):
            result = get_model_context_length("omnicoder-9b", "http://localhost:11434/v1")

        assert result == CONTEXT_PROBE_TIERS[0]

    def test_non_local_endpoint_does_not_query_local_server(self):
        """For non-local endpoints, _query_local_context_length is not called."""
        from agent.model_metadata import get_model_context_length, CONTEXT_PROBE_TIERS

        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.is_local_endpoint", return_value=False), \
             patch("agent.model_metadata._query_local_context_length") as mock_query:
            result = get_model_context_length(
                "unknown-model", "https://some-cloud-api.example.com/v1"
            )

        mock_query.assert_not_called()

    def test_cached_result_skips_local_query(self):
        """Cached context length is returned without querying the local server."""
        from agent.model_metadata import get_model_context_length

        with patch("agent.model_metadata.get_cached_context_length", return_value=65536), \
             patch("agent.model_metadata._query_local_context_length") as mock_query:
            result = get_model_context_length("omnicoder-9b", "http://localhost:11434/v1")

        assert result == 65536
        mock_query.assert_not_called()

    def test_no_base_url_does_not_query_local_server(self):
        """When base_url is empty, local server is not queried."""
        from agent.model_metadata import get_model_context_length

        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata._query_local_context_length") as mock_query:
            result = get_model_context_length("unknown-xyz-model", "")

        mock_query.assert_not_called()

@@ -479,8 +479,8 @@ def test_api_key_provider_explicit_api_mode_config(monkeypatch):
    assert resolved["api_mode"] == "anthropic_messages"


def test_api_key_provider_default_url_stays_chat_completions(monkeypatch):
    """API-key providers with default /v1 URL should stay on chat_completions."""
def test_minimax_default_url_uses_anthropic_messages(monkeypatch):
    """MiniMax with default /anthropic URL should auto-detect anthropic_messages mode."""
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "minimax")
    monkeypatch.setattr(rp, "_get_model_config", lambda: {})
    monkeypatch.setenv("MINIMAX_API_KEY", "test-minimax-key")
@@ -488,9 +488,50 @@ def test_api_key_provider_default_url_stays_chat_completions(monkeypatch):

    resolved = rp.resolve_runtime_provider(requested="minimax")

    assert resolved["provider"] == "minimax"
    assert resolved["api_mode"] == "anthropic_messages"
    assert resolved["base_url"] == "https://api.minimax.io/anthropic"


def test_minimax_stale_v1_url_auto_corrected(monkeypatch):
    """MiniMax with stale /v1 base URL should be auto-corrected to /anthropic."""
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "minimax")
    monkeypatch.setattr(rp, "_get_model_config", lambda: {})
    monkeypatch.setenv("MINIMAX_API_KEY", "test-minimax-key")
    monkeypatch.setenv("MINIMAX_BASE_URL", "https://api.minimax.io/v1")

    resolved = rp.resolve_runtime_provider(requested="minimax")

    assert resolved["provider"] == "minimax"
    assert resolved["api_mode"] == "anthropic_messages"
    assert resolved["base_url"] == "https://api.minimax.io/anthropic"


def test_minimax_cn_stale_v1_url_auto_corrected(monkeypatch):
    """MiniMax-CN with stale /v1 base URL should be auto-corrected to /anthropic."""
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "minimax-cn")
    monkeypatch.setattr(rp, "_get_model_config", lambda: {})
    monkeypatch.setenv("MINIMAX_CN_API_KEY", "test-minimax-cn-key")
    monkeypatch.setenv("MINIMAX_CN_BASE_URL", "https://api.minimaxi.com/v1")

    resolved = rp.resolve_runtime_provider(requested="minimax-cn")

    assert resolved["provider"] == "minimax-cn"
    assert resolved["api_mode"] == "anthropic_messages"
    assert resolved["base_url"] == "https://api.minimaxi.com/anthropic"


def test_minimax_explicit_api_mode_respected(monkeypatch):
    """Explicit api_mode config should override MiniMax auto-detection."""
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "minimax")
    monkeypatch.setattr(rp, "_get_model_config", lambda: {"api_mode": "chat_completions"})
    monkeypatch.setenv("MINIMAX_API_KEY", "test-minimax-key")
    monkeypatch.delenv("MINIMAX_BASE_URL", raising=False)

    resolved = rp.resolve_runtime_provider(requested="minimax")

    assert resolved["provider"] == "minimax"
    assert resolved["api_mode"] == "chat_completions"
    assert resolved["base_url"] == "https://api.minimax.io/v1"


def test_named_custom_provider_anthropic_api_mode(monkeypatch):

@@ -106,6 +106,18 @@ class TestSchemaConversion:
        assert schema["parameters"]["type"] == "object"
        assert schema["parameters"]["properties"] == {}

    def test_object_schema_without_properties_gets_normalized(self):
        from tools.mcp_tool import _convert_mcp_schema

        mcp_tool = _make_mcp_tool(
            name="ask",
            description="Ask Crawl4AI",
            input_schema={"type": "object"},
        )
        schema = _convert_mcp_schema("crawl4ai", mcp_tool)

        assert schema["parameters"] == {"type": "object", "properties": {}}

    def test_tool_name_prefix_format(self):
        from tools.mcp_tool import _convert_mcp_schema

@@ -1893,6 +1905,33 @@ class TestSamplingCallbackText:
        messages = call_args.kwargs["messages"]
        assert messages[0] == {"role": "system", "content": "Be helpful"}

    def test_server_tools_with_object_schema_are_normalized(self):
        """Server-provided tools should gain empty properties for object schemas."""
        fake_client = MagicMock()
        fake_client.chat.completions.create.return_value = _make_llm_response()
        server_tool = SimpleNamespace(
            name="ask",
            description="Ask Crawl4AI",
            inputSchema={"type": "object"},
        )

        with patch(
            "agent.auxiliary_client.call_llm",
            return_value=fake_client.chat.completions.create.return_value,
        ) as mock_call:
            params = _make_sampling_params(tools=[server_tool])
            asyncio.run(self.handler(None, params))

        tools = mock_call.call_args.kwargs["tools"]
        assert tools == [{
            "type": "function",
            "function": {
                "name": "ask",
                "description": "Ask Crawl4AI",
                "parameters": {"type": "object", "properties": {}},
            },
        }]

    def test_length_stop_reason(self):
        """finish_reason='length' maps to stopReason='maxTokens'."""
        fake_client = MagicMock()

@@ -336,11 +336,9 @@ Jobs run in a fresh session with no current-chat context, so prompts must be sel
If skill or skills are provided on create, the future cron run loads those skills in order, then follows the prompt as the task instruction.
On update, passing skills=[] clears attached skills.

NOTE: The agent's final response is auto-delivered to the target — do NOT use
send_message in the prompt for that same destination. Same-target send_message
calls are skipped to avoid duplicate cron deliveries. Put the primary
user-facing content in the final response, and use send_message only for
additional or different targets.
NOTE: The agent's final response is auto-delivered to the target. Put the primary
user-facing content in the final response. Cron jobs run autonomously with no user
present — they cannot ask questions or request clarification.

Important safety rule: cron-run sessions should not recursively schedule more cron jobs.""",
"parameters": {

@@ -262,13 +262,11 @@ def _run_single_child(
    # Get the progress callback from the child agent
    child_progress_cb = getattr(child, 'tool_progress_callback', None)

    # Save the parent's resolved tool names before the child agent can
    # overwrite the process-global via get_tool_definitions().
    # This must be in _run_single_child (not _build_child_agent) so the
    # save/restore happens in the same scope as the try/finally.
    # Restore parent tool names using the value saved before child construction
    # mutated the global. This is the correct parent toolset, not the child's.
    import model_tools
    _saved_tool_names = list(model_tools._last_resolved_tool_names)
    child._delegate_saved_tool_names = _saved_tool_names
    _saved_tool_names = getattr(child, "_delegate_saved_tool_names",
                                list(model_tools._last_resolved_tool_names))

    try:
        result = child.run_conversation(user_message=goal)
@@ -465,6 +463,12 @@ def delegate_task(
    # Track goal labels for progress display (truncated for readability)
    task_labels = [t["goal"][:40] for t in task_list]

    # Save parent tool names BEFORE any child construction mutates the global.
    # _build_child_agent() calls AIAgent() which calls get_tool_definitions(),
    # which overwrites model_tools._last_resolved_tool_names with child's toolset.
    import model_tools as _model_tools
    _parent_tool_names = list(_model_tools._last_resolved_tool_names)

    # Build all child agents on the main thread (thread-safe construction)
    children = []
    for i, t in enumerate(task_list):
@@ -476,8 +480,13 @@ def delegate_task(
            override_api_key=creds["api_key"],
            override_api_mode=creds["api_mode"],
        )
        # Override with correct parent tool names (before child construction mutated global)
        child._delegate_saved_tool_names = _parent_tool_names
        children.append((i, t, child))

    # Authoritative restore: reset global to parent's tool names after all children built
    _model_tools._last_resolved_tool_names = _parent_tool_names

    if n_tasks == 1:
        # Single task -- run directly (no thread pool overhead)
        _i, _t, child = children[0]

@@ -605,7 +605,9 @@ class SamplingHandler:
                "function": {
                    "name": getattr(t, "name", ""),
                    "description": getattr(t, "description", "") or "",
                    "parameters": getattr(t, "inputSchema", {}) or {},
                    "parameters": _normalize_mcp_input_schema(
                        getattr(t, "inputSchema", None)
                    ),
                },
            }
            for t in server_tools
@@ -1213,6 +1215,17 @@ def _make_check_fn(server_name: str):
# Discovery & registration
# ---------------------------------------------------------------------------

def _normalize_mcp_input_schema(schema: dict | None) -> dict:
    """Normalize MCP input schemas for LLM tool-calling compatibility."""
    if not schema:
        return {"type": "object", "properties": {}}

    if schema.get("type") == "object" and "properties" not in schema:
        return {**schema, "properties": {}}

    return schema


def _convert_mcp_schema(server_name: str, mcp_tool) -> dict:
    """Convert an MCP tool listing to the Hermes registry schema format.

@@ -1231,10 +1244,7 @@ def _convert_mcp_schema(server_name: str, mcp_tool) -> dict:
    return {
        "name": prefixed_name,
        "description": mcp_tool.description or f"MCP tool {mcp_tool.name} from {server_name}",
        "parameters": mcp_tool.inputSchema if mcp_tool.inputSchema else {
            "type": "object",
            "properties": {},
        },
        "parameters": _normalize_mcp_input_schema(mcp_tool.inputSchema),
    }

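# Illustrative aside (not part of the diff): given _normalize_mcp_input_schema above,
# the three interesting cases behave as follows. The example schemas are hypothetical.
assert _normalize_mcp_input_schema(None) == {"type": "object", "properties": {}}
assert _normalize_mcp_input_schema({"type": "object"}) == {"type": "object", "properties": {}}
assert _normalize_mcp_input_schema(
    {"type": "object", "properties": {"q": {"type": "string"}}}
) == {"type": "object", "properties": {"q": {"type": "string"}}}
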
@@ -305,14 +305,14 @@ For docs-only examples, the exact file set may differ. The point is to cover:
Run tests with xdist disabled:

```bash
source .venv/bin/activate
source venv/bin/activate
python -m pytest tests/test_runtime_provider_resolution.py tests/test_cli_provider_resolution.py tests/test_cli_model_command.py tests/test_setup_model_selection.py -n0 -q
```

For deeper changes, run the full suite before pushing:

```bash
source .venv/bin/activate
source venv/bin/activate
python -m pytest tests/ -n0 -q
```

@@ -321,14 +321,14 @@ python -m pytest tests/ -n0 -q
After tests, run a real smoke test.

```bash
source .venv/bin/activate
source venv/bin/activate
python -m hermes_cli.main chat -q "Say hello" --provider your-provider --model your-model
```

Also test the interactive flows if you changed menus:

```bash
source .venv/bin/activate
source venv/bin/activate
python -m hermes_cli.main model
python -m hermes_cli.main setup
```

@@ -416,7 +416,19 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo

### Context Length Detection

Hermes automatically detects your model's context length by querying the endpoint's `/v1/models` response. For most setups this works out of the box. If detection fails (the model name doesn't match, the endpoint doesn't expose `/v1/models`, etc.), Hermes falls back to a high default and probes downward on context-length errors.
Hermes uses a multi-source resolution chain to detect the correct context window for your model and provider:

1. **Config override** — `model.context_length` in config.yaml (highest priority)
2. **Custom provider per-model** — `custom_providers[].models.<id>.context_length`
3. **Persistent cache** — previously discovered values (survives restarts)
4. **Endpoint `/models`** — queries your server's API (local/custom endpoints)
5. **Anthropic `/v1/models`** — queries Anthropic's API for `max_input_tokens` (API-key users only)
6. **OpenRouter API** — live model metadata from OpenRouter
7. **Nous Portal** — suffix-matches Nous model IDs against OpenRouter metadata
8. **[models.dev](https://models.dev)** — community-maintained registry with provider-specific context lengths for 3800+ models across 100+ providers
9. **Fallback defaults** — broad model family patterns (128K default)

For most setups this works out of the box. The system is provider-aware — the same model can have different context limits depending on who serves it (e.g., `claude-opus-4.6` is 1M on Anthropic direct but 128K on GitHub Copilot).

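To make the ordering concrete, here is a minimal, illustrative sketch of a first-match-wins chain of this shape. It is not the Hermes implementation (the real logic lives in `agent/model_metadata.py` and `agent/models_dev.py`); the `sources` callables and fallback value are placeholders.

```python
# Illustrative only — each source returns an int context length or None.
def resolve_context_length(model: str, provider: str, sources: list) -> int:
    for source in sources:          # ordered highest priority first
        value = source(model, provider)
        if value:
            return value            # first source that knows the model wins
    return 128_000                  # broad family-pattern fallback
```
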
To set the context length explicitly, add `context_length` to your model config:

@@ -427,10 +439,23 @@ model:
  context_length: 131072 # tokens
```

This takes highest priority — it overrides auto-detection, cached values, and hardcoded defaults.
For custom endpoints, you can also set context length per model:

```yaml
custom_providers:
  - name: "My Local LLM"
    base_url: "http://localhost:11434/v1"
    models:
      qwen3.5:27b:
        context_length: 32768
      deepseek-r1:70b:
        context_length: 65536
```

`hermes model` will prompt for context length when configuring a custom endpoint. Leave it blank for auto-detection.

:::tip When to set this manually
- Your model shows "2M context" in the status bar (detection failed)
- You're using Ollama with a custom `num_ctx` that's lower than the model's maximum
- You want to limit context below the model's maximum (e.g., 8k on a 128k model to save VRAM)
- You're running behind a proxy that doesn't expose `/v1/models`
:::

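For instance, to deliberately cap a long-context model (the third bullet above), the same override can simply be set lower than the detected maximum — the number here is only an example:

```yaml
model:
  context_length: 8192  # cap well below the model's maximum to save VRAM
```
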
@@ -177,6 +177,19 @@ All phone numbers are automatically redacted in logs:
|
||||
- `+15551234567` → `+155****4567`
|
||||
- This applies to both Hermes gateway logs and the global redaction system
|
||||
|
||||
### Note to Self (Single-Number Setup)
|
||||
|
||||
If you run signal-cli as a **linked secondary device** on your own phone number (rather than a separate bot number), you can interact with Hermes through Signal's "Note to Self" feature.
|
||||
|
||||
Just send a message to yourself from your phone — signal-cli picks it up and Hermes responds in the same conversation.
|
||||
|
||||
**How it works:**
|
||||
- "Note to Self" messages arrive as `syncMessage.sentMessage` envelopes
|
||||
- The adapter detects when these are addressed to the bot's own account and processes them as regular inbound messages
|
||||
- Echo-back protection (sent-timestamp tracking) prevents infinite loops — the bot's own replies are filtered out automatically
|
||||
|
||||
**No extra configuration needed.** This works automatically as long as `SIGNAL_ACCOUNT` matches your phone number.
|
||||
|
||||
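As a rough illustration of the detection rule described above — a simplified sketch, not the adapter's actual code, and the envelope field name is an assumption about signal-cli's JSON output:

```python
# Simplified sketch. "destinationNumber" is assumed from signal-cli JSON envelopes.
def is_note_to_self(envelope: dict, bot_account: str) -> bool:
    sent = (envelope.get("syncMessage") or {}).get("sentMessage") or {}
    # A sync "sent" message addressed back to our own account is a Note to Self.
    return bool(sent) and sent.get("destinationNumber") == bot_account
```
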
### Health Monitoring

The adapter monitors the SSE connection and automatically reconnects if: