"""Model metadata, context lengths, and token estimation utilities.

Pure utility functions with no AIAgent dependency. Used by ContextCompressor
and run_agent.py for pre-flight context checks.
"""

import ipaddress
import logging
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests
import yaml

from utils import base_url_host_matches, base_url_hostname

from hermes_constants import OPENROUTER_MODELS_URL

logger = logging.getLogger(__name__)


# Provider names that can appear as a "provider:" prefix before a model ID.
# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
# are preserved so the full model name reaches cache lookups and server queries.
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
    "gemini", "ollama-cloud", "zai", "kimi-coding", "kimi-coding-cn",
    "stepfun", "minimax", "minimax-cn", "anthropic", "deepseek",
    "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
    "qwen-oauth",
    "xiaomi",
    "arcee",
    "custom", "local",
    # Common aliases
    "google", "google-gemini", "google-ai-studio",
    "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
    "github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn",
    "claude", "deep-seek",
    "ollama",
    "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
    "mimo", "xiaomi-mimo",
    "arcee-ai", "arceeai",
    "xai", "x-ai", "x.ai", "grok",
    "nvidia", "nim", "nvidia-nim", "nemotron",
    "qwen-portal",
})


_OLLAMA_TAG_PATTERN = re.compile(
    r"^(\d+\.?\d*b|latest|stable|q\d|fp?\d|instruct|chat|coder|vision|text)",
    re.IGNORECASE,
)
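
# Illustrative (hedged) behaviour of the pattern above: suffixes such as "7b",
# "0.5b", "latest", "q4", or "instruct" match and are treated as Ollama tags,
# so "qwen:0.5b" survives intact; a suffix like "my-model" does not match,
# so "local:my-model" is stripped to "my-model" by _strip_provider_prefix below.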


# Tailscale's CGNAT range (RFC 6598). `ipaddress.is_private` excludes this
# block, so without an explicit check Ollama reached over Tailscale (e.g.
# `http://100.77.243.5:11434`) wouldn't be treated as local and its stream
# read / stale timeouts wouldn't get auto-bumped. Built once at import time.
_TAILSCALE_CGNAT = ipaddress.IPv4Network("100.64.0.0/10")
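
# For example: ipaddress.IPv4Address("100.77.243.5") in _TAILSCALE_CGNAT -> True,
# while ipaddress.ip_address("100.77.243.5").is_private -> False (RFC 6598
# shared address space is neither private nor global in `ipaddress`).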


def _strip_provider_prefix(model: str) -> str:
    """Strip a recognised provider prefix from a model string.

    ``"local:my-model"``  → ``"my-model"``
    ``"qwen3.5:27b"``     → ``"qwen3.5:27b"``     (unchanged — not a provider prefix)
    ``"qwen:0.5b"``       → ``"qwen:0.5b"``       (unchanged — Ollama model:tag)
    ``"deepseek:latest"`` → ``"deepseek:latest"`` (unchanged — Ollama model:tag)
    """
    if ":" not in model or model.startswith("http"):
        return model
    prefix, suffix = model.split(":", 1)
    prefix_lower = prefix.strip().lower()
    if prefix_lower in _PROVIDER_PREFIXES:
        # Don't strip if suffix looks like an Ollama tag (e.g. "7b", "latest", "q4_0")
        if _OLLAMA_TAG_PATTERN.match(suffix.strip()):
            return model
        return suffix
    return model
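
# Hedged usage examples (mirroring the docstring; comments only, not executed):
#   _strip_provider_prefix("openrouter:qwen/qwen3.5-coder") -> "qwen/qwen3.5-coder"
#   _strip_provider_prefix("qwen:0.5b")                     -> "qwen:0.5b" (tag kept)
#   _strip_provider_prefix("http://localhost:11434")        -> unchanged (URL guard)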


_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
_model_metadata_cache_time: float = 0
_MODEL_CACHE_TTL = 3600  # seconds (1 hour)

_endpoint_model_metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = {}
_endpoint_model_metadata_cache_time: Dict[str, float] = {}
_ENDPOINT_MODEL_CACHE_TTL = 300  # seconds (5 minutes)
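

# Hedged sketch of the TTL gate these globals support. The helper name is
# illustrative only, not part of this module's API; cache reads elsewhere
# are assumed to perform an equivalent freshness check.
def _cache_is_fresh_example(cached_at: float, ttl: float) -> bool:
    """Return True while a cache written at ``cached_at`` is still usable."""
    return bool(cached_at) and (time.time() - cached_at) < ttl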


# Descending tiers for context length probing when the model is unknown.
# We start at 128K (a safe default for most modern models) and step down
# on context-length errors until one works.
CONTEXT_PROBE_TIERS = [
    128_000,
    64_000,
    32_000,
    16_000,
    8_000,
]
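

# Hedged sketch of how these tiers are typically consumed. The real retry
# loop lives with the API-calling code; `try_with_context` is a hypothetical
# callable that raises on a context-length error.
def _probe_context_example(try_with_context) -> int:
    for tier in CONTEXT_PROBE_TIERS:
        try:
            try_with_context(tier)
            return tier
        except Exception:
            continue  # step down to the next smaller tier
    return CONTEXT_PROBE_TIERS[-1]  # smallest tier as the last resort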


# Default context length when no detection method succeeds.
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]


# Minimum context length required to run Hermes Agent. Models with fewer
# tokens cannot maintain enough working memory for tool-calling workflows.
# Sessions, model switches, and cron jobs should reject models below this.
MINIMUM_CONTEXT_LENGTH = 64_000
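

# Hedged sketch of the startup gate described above. The function name is
# illustrative; actual enforcement lives in session/model-switch code.
def _require_minimum_context_example(context_length: int) -> None:
    if context_length < MINIMUM_CONTEXT_LENGTH:
        raise ValueError(
            f"context window of {context_length:,} tokens is below the "
            f"{MINIMUM_CONTEXT_LENGTH:,}-token minimum required by Hermes Agent"
        )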
|
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# Thin fallback defaults — only broad model family patterns.
|
|
|
|
|
|
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
|
|
|
|
|
# all miss. Replaced the previous 80+ entry dict.
|
|
|
|
|
|
# For provider-specific context lengths, models.dev is the primary source.
|
2026-02-21 22:31:43 -08:00
|
|
|
|
DEFAULT_CONTEXT_LENGTHS = {
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# Anthropic Claude 4.6 (1M context) — bare IDs only to avoid
|
|
|
|
|
|
# fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
|
|
|
|
|
|
# substring of "anthropic/claude-sonnet-4.6").
|
|
|
|
|
|
# OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
|
fix(agent): complete Claude Opus 4.7 API migration
Claude Opus 4.7 introduced several breaking API changes that the current
codebase partially handled but not completely. This patch finishes the
migration per the official migration guide at
https://platform.claude.com/docs/en/about-claude/models/migration-guide
Fixes NousResearch/hermes-agent#11137
Breaking-change coverage:
1. Adaptive thinking + output_config.effort — 4.7 is now recognized by
_supports_adaptive_thinking() (extends previous 4.6-only gate).
2. Sampling parameter stripping — 4.7 returns 400 for any non-default
temperature / top_p / top_k. build_anthropic_kwargs drops them as a
safety net; the OpenAI-protocol auxiliary path (_build_call_kwargs)
and AnthropicCompletionsAdapter.create() both early-exit before
setting temperature for 4.7+ models. This keeps flush_memories and
structured-JSON aux paths that hardcode temperature from 400ing
when the aux model is flipped to 4.7.
3. thinking.display = "summarized" — 4.7 defaults display to "omitted",
which silently hides reasoning text from Hermes's CLI activity feed
during long tool runs. Restoring "summarized" preserves 4.6 UX.
4. Effort level mapping — xhigh now maps to xhigh (was xhigh→max, which
silently over-efforted every coding/agentic request). max is now a
distinct ceiling per Anthropic's 5-level effort model.
5. New stop_reason values — refusal and model_context_window_exceeded
were silently collapsed to "stop" (end_turn) by the adapter's
stop_reason_map. Now mapped to "content_filter" and "length"
respectively, matching upstream finish-reason handling already in
bedrock_adapter.
6. Model catalogs — claude-opus-4-7 added to the Anthropic provider
list, anthropic/claude-opus-4.7 added at top of OpenRouter fallback
catalog (recommended), claude-opus-4-7 added to model_metadata
DEFAULT_CONTEXT_LENGTHS (1M, matching 4.6 per migration guide).
7. Prefill docstrings — run_agent.AIAgent and BatchRunner now document
that Anthropic Sonnet/Opus 4.6+ reject a trailing assistant-role
prefill (400).
8. Tests — 4 new tests in test_anthropic_adapter covering display
default, xhigh preservation, max on 4.7, refusal / context-overflow
stop_reason mapping, plus the sampling-param predicate. test_model_metadata
accepts 4.7 at 1M context.
Tested on macOS 15.5 (darwin). 119 tests pass in
tests/agent/test_anthropic_adapter.py, 1320 pass in tests/agent/.
2026-04-16 12:35:43 -05:00
|
|
|
|
"claude-opus-4-7": 1000000,
|
|
|
|
|
|
"claude-opus-4.7": 1000000,
|
2026-03-20 04:38:59 -07:00
|
|
|
|
"claude-opus-4-6": 1000000,
|
|
|
|
|
|
"claude-sonnet-4-6": 1000000,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
"claude-opus-4.6": 1000000,
|
|
|
|
|
|
"claude-sonnet-4.6": 1000000,
|
|
|
|
|
|
# Catch-all for older Claude models (must sort after specific entries)
|
|
|
|
|
|
"claude": 200000,
|
fix: correct GPT-5 family context lengths in fallback defaults (#9309)
The generic 'gpt-5' fallback was set to 128,000 — which is the max
OUTPUT tokens, not the context window. GPT-5 base and most variants
(codex, mini) have 400,000 context. This caused /model to report
128k for models like gpt-5.3-codex when models.dev was unavailable.
Added specific entries for GPT-5 variants with different context sizes:
- gpt-5.4, gpt-5.4-pro: 1,050,000 (1.05M)
- gpt-5.4-mini, gpt-5.4-nano: 400,000
- gpt-5.3-codex-spark: 128,000 (reduced)
- gpt-5.1-chat: 128,000 (chat variant)
- gpt-5 (catch-all): 400,000
Sources: https://developers.openai.com/api/docs/models
2026-04-13 19:22:23 -07:00
|
|
|
|
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
|
|
|
|
|
|
# Source: https://developers.openai.com/api/docs/models
|
2026-04-23 13:32:43 -07:00
|
|
|
|
# GPT-5.5 (launched Apr 23 2026). Verified via live ChatGPT codex/models
|
|
|
|
|
|
# endpoint: bare slug `gpt-5.5`, no -pro/-mini variants. 400k context on Codex.
|
|
|
|
|
|
"gpt-5.5": 400000,
|
fix: correct GPT-5 family context lengths in fallback defaults (#9309)
The generic 'gpt-5' fallback was set to 128,000 — which is the max
OUTPUT tokens, not the context window. GPT-5 base and most variants
(codex, mini) have 400,000 context. This caused /model to report
128k for models like gpt-5.3-codex when models.dev was unavailable.
Added specific entries for GPT-5 variants with different context sizes:
- gpt-5.4, gpt-5.4-pro: 1,050,000 (1.05M)
- gpt-5.4-mini, gpt-5.4-nano: 400,000
- gpt-5.3-codex-spark: 128,000 (reduced)
- gpt-5.1-chat: 128,000 (chat variant)
- gpt-5 (catch-all): 400,000
Sources: https://developers.openai.com/api/docs/models
2026-04-13 19:22:23 -07:00
|
|
|
|
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
|
|
|
|
|
|
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
|
|
|
|
|
|
"gpt-5.4": 1050000, # GPT-5.4, GPT-5.4 Pro (1.05M context)
|
|
|
|
|
|
"gpt-5.1-chat": 128000, # Chat variant has 128k context
|
|
|
|
|
|
"gpt-5": 400000, # GPT-5.x base, mini, codex variants (400k)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
"gpt-4.1": 1047576,
|
|
|
|
|
|
"gpt-4": 128000,
|
|
|
|
|
|
# Google
|
|
|
|
|
|
"gemini": 1048576,
|
2026-04-06 10:14:01 -07:00
|
|
|
|
# Gemma (open models served via AI Studio)
|
2026-04-22 22:56:54 +01:00
|
|
|
|
"gemma-4": 256000, # Gemma 4 family
|
|
|
|
|
|
"gemma4": 256000, # Ollama-style naming (e.g. gemma4:31b-cloud)
|
2026-04-06 10:19:19 -07:00
|
|
|
|
"gemma-4-31b": 256000,
|
2026-04-06 10:14:01 -07:00
|
|
|
|
"gemma-3": 131072,
|
|
|
|
|
|
"gemma": 8192, # fallback for older gemma models
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# DeepSeek
|
|
|
|
|
|
"deepseek": 128000,
|
|
|
|
|
|
# Meta
|
|
|
|
|
|
"llama": 131072,
|
2026-04-11 03:29:09 -07:00
|
|
|
|
# Qwen — specific model families before the catch-all.
|
|
|
|
|
|
# Official docs: https://help.aliyun.com/zh/model-studio/developer-reference/
|
|
|
|
|
|
"qwen3-coder-plus": 1000000, # 1M context
|
|
|
|
|
|
"qwen3-coder": 262144, # 256K context
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
"qwen": 131072,
|
fix: align MiniMax provider with official API docs
Aligns MiniMax provider with official API documentation. Fixes 6 bugs:
transport mismatch (openai_chat -> anthropic_messages), credential leak
in switch_model(), prompt caching sent to non-Anthropic endpoints,
dot-to-hyphen model name corruption, trajectory compressor URL routing,
and stale doctor health check.
Also corrects context window (204,800), thinking support (manual mode),
max output (131,072), and model catalog (M2 family only on /anthropic).
Source: https://platform.minimax.io/docs/api-reference/text-anthropic-api
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-04-10 03:53:18 -07:00
|
|
|
|
# MiniMax — official docs: 204,800 context for all models
|
|
|
|
|
|
# https://platform.minimax.io/docs/api-reference/text-anthropic-api
|
|
|
|
|
|
"minimax": 204800,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# GLM
|
|
|
|
|
|
"glm": 202752,
|
fix(model_metadata): add xAI Grok context length fallbacks
xAI /v1/models does not return context_length metadata, so Hermes
probes down to the 128k default whenever a user configures a custom
provider pointing at https://api.x.ai/v1. This forces every xAI user
to manually override model.context_length in config.yaml (2M for
Grok 4.20 / 4.1-fast / 4-fast) or lose most of the usable context
window.
Add DEFAULT_CONTEXT_LENGTHS entries for the Grok family so the
fallback lookup returns the correct value via substring matching.
Values sourced from models.dev (2026-04) and cross-checked against
the xAI /v1/models listing:
- grok-4.20-* 2,000,000 (reasoning, non-reasoning, multi-agent)
- grok-4-1-fast-* 2,000,000
- grok-4-fast-* 2,000,000
- grok-4 / grok-4-0709 256,000
- grok-code-fast-1 256,000
- grok-3* 131,072
- grok-2 / latest 131,072
- grok-2-vision* 8,192
- grok (catch-all) 131,072
Keys are ordered longest-first so that specific variants match before
the catch-all, consistent with the existing Claude/Gemma/MiniMax entries.
Add TestDefaultContextLengths.test_grok_models_context_lengths and
test_grok_substring_matching to pin the values and verify the full
lookup path. All 77 tests in test_model_metadata.py pass.
2026-04-10 12:08:16 +04:00
|
|
|
|
# xAI Grok — xAI /v1/models does not return context_length metadata,
|
|
|
|
|
|
# so these hardcoded fallbacks prevent Hermes from probing-down to
|
|
|
|
|
|
# the default 128k when the user points at https://api.x.ai/v1
|
|
|
|
|
|
# via a custom provider. Values sourced from models.dev (2026-04).
|
|
|
|
|
|
# Keys use substring matching (longest-first), so e.g. "grok-4.20"
|
|
|
|
|
|
# matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
|
|
|
|
|
|
"grok-code-fast": 256000, # grok-code-fast-1
|
|
|
|
|
|
"grok-4-1-fast": 2000000, # grok-4-1-fast-(non-)reasoning
|
|
|
|
|
|
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
|
|
|
|
|
|
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning
|
|
|
|
|
|
"grok-4.20": 2000000, # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
|
|
|
|
|
|
"grok-4": 256000, # grok-4, grok-4-0709
|
|
|
|
|
|
"grok-3": 131072, # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
|
|
|
|
|
|
"grok-2": 131072, # grok-2, grok-2-1212, grok-2-latest
|
|
|
|
|
|
"grok": 131072, # catch-all (grok-beta, unknown grok-*)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# Kimi
|
|
|
|
|
|
"kimi": 262144,
|
2026-04-17 13:09:14 -07:00
|
|
|
|
# Nemotron — NVIDIA's open-weights series (128K context across all sizes)
|
|
|
|
|
|
"nemotron": 131072,
|
2026-04-03 13:45:16 -07:00
|
|
|
|
# Arcee
|
|
|
|
|
|
"trinity": 262144,
|
2026-04-13 21:16:14 -07:00
|
|
|
|
# OpenRouter
|
|
|
|
|
|
"elephant": 262144,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
|
# Hugging Face Inference Providers — model IDs use org/name format
|
|
|
|
|
|
"Qwen/Qwen3.5-397B-A17B": 131072,
|
2026-03-27 13:54:46 -07:00
|
|
|
|
"Qwen/Qwen3.5-35B-A3B": 131072,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
|
"deepseek-ai/DeepSeek-V3.2": 65536,
|
|
|
|
|
|
"moonshotai/Kimi-K2.5": 262144,
|
2026-04-20 23:20:33 -07:00
|
|
|
|
"moonshotai/Kimi-K2.6": 262144,
|
2026-03-27 12:41:59 -07:00
|
|
|
|
"moonshotai/Kimi-K2-Thinking": 262144,
|
fix: align MiniMax provider with official API docs
Aligns MiniMax provider with official API documentation. Fixes 6 bugs:
transport mismatch (openai_chat -> anthropic_messages), credential leak
in switch_model(), prompt caching sent to non-Anthropic endpoints,
dot-to-hyphen model name corruption, trajectory compressor URL routing,
and stale doctor health check.
Also corrects context window (204,800), thinking support (manual mode),
max output (131,072), and model catalog (M2 family only on /anthropic).
Source: https://platform.minimax.io/docs/api-reference/text-anthropic-api
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-04-10 03:53:18 -07:00
|
|
|
|
"MiniMaxAI/MiniMax-M2.5": 204800,
|
feat: add Xiaomi MiMo v2.5-pro and v2.5 model support (#14635)
## Merged
Adds MiMo v2.5-pro and v2.5 support to Xiaomi native provider, OpenCode Go, and setup wizard.
### Changes
- Context lengths: added v2.5-pro (1M) and v2.5 (1M), corrected existing MiMo entries to exact values (262144)
- Provider lists: xiaomi, opencode-go, setup wizard
- Vision: upgraded from mimo-v2-omni to mimo-v2.5 (omnimodal)
- Config description updated for XIAOMI_API_KEY
- Tests updated for new vision model preference
### Verification
- 4322 tests passed, 0 new regressions
- Live API tested on Xiaomi portal: basic, reasoning, tool calling, multi-tool, file ops, system prompt, vision — all pass
- Self-review found and fixed 2 issues (redundant vision check, stale HuggingFace context length)
2026-04-23 10:06:25 -07:00
|
|
|
|
"XiaomiMiMo/MiMo-V2-Flash": 262144,
|
|
|
|
|
|
"mimo-v2-pro": 1048576,
|
|
|
|
|
|
"mimo-v2.5-pro": 1048576,
|
|
|
|
|
|
"mimo-v2.5": 1048576,
|
|
|
|
|
|
"mimo-v2-omni": 262144,
|
|
|
|
|
|
"mimo-v2-flash": 262144,
|
2026-03-27 12:41:59 -07:00
|
|
|
|
"zai-org/GLM-5": 202752,
|
2026-02-21 22:31:43 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_CONTEXT_LENGTH_KEYS = (
|
|
|
|
|
|
"context_length",
|
|
|
|
|
|
"context_window",
|
|
|
|
|
|
"max_context_length",
|
|
|
|
|
|
"max_position_embeddings",
|
|
|
|
|
|
"max_model_len",
|
|
|
|
|
|
"max_input_tokens",
|
|
|
|
|
|
"max_sequence_length",
|
|
|
|
|
|
"max_seq_len",
|
2026-03-19 06:01:16 -07:00
|
|
|
|
"n_ctx_train",
|
|
|
|
|
|
"n_ctx",
|
2026-04-12 14:13:02 -04:00
|
|
|
|
"ctx_size",
|
2026-03-18 03:04:07 -07:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
_MAX_COMPLETION_KEYS = (
|
|
|
|
|
|
"max_completion_tokens",
|
|
|
|
|
|
"max_output_tokens",
|
|
|
|
|
|
"max_tokens",
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# Local server hostnames / address patterns
|
|
|
|
|
|
_LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")
|
2026-04-11 14:46:18 -07:00
|
|
|
|
# Docker / Podman / Lima DNS names that resolve to the host machine
|
|
|
|
|
|
_CONTAINER_LOCAL_SUFFIXES = (
|
|
|
|
|
|
".docker.internal",
|
|
|
|
|
|
".containers.internal",
|
|
|
|
|
|
".lima.internal",
|
|
|
|
|
|
)
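# Tailscale CGNAT range (100.64.0.0/10), referenced by is_local_endpoint() below.
# Assumed definition: the original module may declare this elsewhere; redefining
# it with the same value would be harmless.
_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")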
|
2026-03-18 21:38:41 +01:00
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
|
def _normalize_base_url(base_url: str) -> str:
|
|
|
|
|
|
return (base_url or "").strip().rstrip("/")
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def _auth_headers(api_key: str = "") -> Dict[str, str]:
|
|
|
|
|
|
token = str(api_key or "").strip()
|
|
|
|
|
|
if not token:
|
|
|
|
|
|
return {}
|
|
|
|
|
|
return {"Authorization": f"Bearer {token}"}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
def _is_openrouter_base_url(base_url: str) -> bool:
|
2026-04-20 21:17:28 -07:00
|
|
|
|
return base_url_host_matches(base_url, "openrouter.ai")
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_custom_endpoint(base_url: str) -> bool:
|
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
return bool(normalized) and not _is_openrouter_base_url(normalized)
|
|
|
|
|
|
|
|
|
|
|
|
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
|
_URL_TO_PROVIDER: Dict[str, str] = {
|
|
|
|
|
|
"api.openai.com": "openai",
|
|
|
|
|
|
"chatgpt.com": "openai",
|
|
|
|
|
|
"api.anthropic.com": "anthropic",
|
|
|
|
|
|
"api.z.ai": "zai",
|
2026-04-11 23:20:53 +08:00
|
|
|
|
"open.bigmodel.cn": "zai",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"api.moonshot.ai": "kimi-coding",
|
2026-04-13 11:13:09 -07:00
|
|
|
|
"api.moonshot.cn": "kimi-coding-cn",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"api.kimi.com": "kimi-coding",
|
2026-04-22 13:28:01 +05:30
|
|
|
|
"api.stepfun.ai": "stepfun",
|
|
|
|
|
|
"api.stepfun.com": "stepfun",
|
feat(providers): add Arcee AI as direct API provider
Adds Arcee AI as a standard direct provider (ARCEEAI_API_KEY) with
Trinity models: trinity-large-thinking, trinity-large-preview, trinity-mini.
Standard OpenAI-compatible provider checklist: auth.py, config.py,
models.py, main.py, providers.py, doctor.py, model_normalize.py,
model_metadata.py, setup.py, trajectory_compressor.py.
Based on PR #9274 by arthurbr11, simplified to a standard direct
provider without dual-endpoint OpenRouter routing.
2026-04-13 17:16:43 -07:00
|
|
|
|
"api.arcee.ai": "arcee",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"api.minimax": "minimax",
|
|
|
|
|
|
"dashscope.aliyuncs.com": "alibaba",
|
2026-03-20 12:51:39 -07:00
|
|
|
|
"dashscope-intl.aliyuncs.com": "alibaba",
|
feat(qwen): add Qwen OAuth provider with portal request support
Based on #6079 by @tunamitom with critical fixes and comprehensive tests.
Changes from #6079:
- Fix: sanitization overwrite bug — Qwen message prep now runs AFTER codex
field sanitization, not before (was silently discarding Qwen transforms)
- Fix: missing try/except AuthError in runtime_provider.py — stale Qwen
credentials now fall through to next provider on auto-detect
- Fix: 'qwen' alias conflict — bare 'qwen' stays mapped to 'alibaba'
(DashScope); use 'qwen-portal' or 'qwen-cli' for the OAuth provider
- Fix: hardcoded ['coder-model'] replaced with live API fetch + curated
fallback list (qwen3-coder-plus, qwen3-coder)
- Fix: extract _is_qwen_portal() helper + _qwen_portal_headers() to replace
5 inline 'portal.qwen.ai' string checks and share headers between init
and credential swap
- Fix: add Qwen branch to _apply_client_headers_for_base_url for mid-session
credential swaps
- Fix: remove suspicious TypeError catch blocks around _prompt_provider_choice
- Fix: handle bare string items in content lists (were silently dropped)
- Fix: remove redundant dict() copies after deepcopy in message prep
- Revert: unrelated ai-gateway test mock removal and model_switch.py comment deletion
New tests (30 test functions):
- _qwen_cli_auth_path, _read_qwen_cli_tokens (success + 3 error paths)
- _save_qwen_cli_tokens (roundtrip, parent creation, permissions)
- _qwen_access_token_is_expiring (5 edge cases: fresh, expired, within skew,
None, non-numeric)
- _refresh_qwen_cli_tokens (success, preserve old refresh, 4 error paths,
default expires_in, disk persistence)
- resolve_qwen_runtime_credentials (fresh, auto-refresh, force-refresh,
missing token, env override)
- get_qwen_auth_status (logged in, not logged in)
- Runtime provider resolution (direct, pool entry, alias)
- _build_api_kwargs (metadata, vl_high_resolution_images, message formatting,
max_tokens suppression)
2026-04-08 20:48:21 +05:30
|
|
|
|
"portal.qwen.ai": "qwen-oauth",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"openrouter.ai": "openrouter",
|
2026-04-06 10:14:01 -07:00
|
|
|
|
"generativelanguage.googleapis.com": "gemini",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"inference-api.nousresearch.com": "nous",
|
|
|
|
|
|
"api.deepseek.com": "deepseek",
|
2026-03-22 08:15:06 -07:00
|
|
|
|
"api.githubcopilot.com": "copilot",
|
|
|
|
|
|
"models.github.ai": "copilot",
|
2026-03-30 20:37:08 -07:00
|
|
|
|
"api.fireworks.ai": "fireworks",
|
2026-04-02 19:59:19 -05:00
|
|
|
|
"opencode.ai": "opencode-go",
|
2026-04-10 12:51:30 +04:00
|
|
|
|
"api.x.ai": "xai",
|
feat(providers): add native NVIDIA NIM provider
Adds NVIDIA NIM as a first-class provider: ProviderConfig in
auth.py, HermesOverlay in providers.py, curated models
(Nemotron plus other open source models hosted on
build.nvidia.com), URL mapping in model_metadata.py, aliases
(nim, nvidia-nim, build-nvidia, nemotron), and env var tests.
Docs updated: providers page, quickstart table, fallback
providers table, and README provider list.
2026-04-17 09:55:58 -07:00
|
|
|
|
"integrate.api.nvidia.com": "nvidia",
|
feat(xiaomi): add Xiaomi MiMo as first-class provider
Cherry-picked from PR #7702 by kshitijk4poor.
Adds Xiaomi MiMo as a direct provider (XIAOMI_API_KEY) with models:
- mimo-v2-pro (1M context), mimo-v2-omni (256K, multimodal), mimo-v2-flash (256K, cheapest)
Standard OpenAI-compatible provider checklist: auth.py, config.py, models.py,
main.py, providers.py, doctor.py, model_normalize.py, model_metadata.py,
models_dev.py, auxiliary_client.py, .env.example, cli-config.yaml.example.
Follow-up: vision tasks use mimo-v2-omni (multimodal) instead of the user's
main model. Non-vision aux uses the user's selected model. Added
_PROVIDER_VISION_MODELS dict for provider-specific vision model overrides.
On failure, falls back to aggregators (gemini flash) via existing fallback chain.
Corrects pre-existing context lengths: mimo-v2-pro 1048576→1000000,
mimo-v2-omni 1048576→256000, adds mimo-v2-flash 256000.
36 tests covering registry, aliases, auto-detect, credentials, models.dev,
normalization, URL mapping, providers module, doctor, aux client, vision
model override, and agent init.
2026-04-11 10:10:31 -07:00
|
|
|
|
"api.xiaomimimo.com": "xiaomi",
|
|
|
|
|
|
"xiaomimimo.com": "xiaomi",
|
2026-04-15 22:32:05 -07:00
|
|
|
|
"ollama.com": "ollama-cloud",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _infer_provider_from_url(base_url: str) -> Optional[str]:
|
|
|
|
|
|
"""Infer the models.dev provider name from a base URL.
|
|
|
|
|
|
|
|
|
|
|
|
This allows context length resolution via models.dev for custom endpoints
|
|
|
|
|
|
like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to
|
|
|
|
|
|
explicitly set the provider name in config.
|
|
|
|
|
|
"""
|
2026-03-18 03:04:07 -07:00
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
if not normalized:
|
2026-03-20 11:57:24 -07:00
|
|
|
|
return None
|
2026-03-18 03:04:07 -07:00
|
|
|
|
parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}")
|
|
|
|
|
|
host = parsed.netloc.lower() or parsed.path.lower()
|
2026-03-20 11:57:24 -07:00
|
|
|
|
for url_part, provider in _URL_TO_PROVIDER.items():
|
|
|
|
|
|
if url_part in host:
|
|
|
|
|
|
return provider
|
|
|
|
|
|
return None
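# Illustrative behaviour (hypothetical inputs, not from the test suite):
#   _infer_provider_from_url("https://dashscope.aliyuncs.com/compatible-mode/v1") -> "alibaba"
#   _infer_provider_from_url("https://llm.internal.example.com/v1")               -> None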
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_known_provider_base_url(base_url: str) -> bool:
|
|
|
|
|
|
return _infer_provider_from_url(base_url) is not None
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
def is_local_endpoint(base_url: str) -> bool:
|
2026-04-13 06:17:13 +02:00
|
|
|
|
"""Return True if base_url points to a local machine.
|
|
|
|
|
|
|
|
|
|
|
|
Recognises loopback (``localhost``, ``127.0.0.0/8``, ``::1``),
|
|
|
|
|
|
container-internal DNS names (``host.docker.internal`` et al.),
|
|
|
|
|
|
RFC-1918 private ranges (``10/8``, ``172.16/12``, ``192.168/16``),
|
|
|
|
|
|
link-local, and Tailscale CGNAT (``100.64.0.0/10``). Tailscale CGNAT
|
|
|
|
|
|
is included so remote-but-trusted Ollama boxes reached over a
|
|
|
|
|
|
Tailscale mesh get the same timeout auto-bumps as localhost Ollama.
|
|
|
|
|
|
"""
|
2026-03-18 21:38:41 +01:00
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return False
|
|
|
|
|
|
url = normalized if "://" in normalized else f"http://{normalized}"
|
|
|
|
|
|
try:
|
|
|
|
|
|
parsed = urlparse(url)
|
|
|
|
|
|
host = parsed.hostname or ""
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
return False
|
|
|
|
|
|
if host in _LOCAL_HOSTS:
|
|
|
|
|
|
return True
|
2026-04-11 14:46:18 -07:00
|
|
|
|
# Docker / Podman / Lima internal DNS names (e.g. host.docker.internal)
|
|
|
|
|
|
if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES):
|
|
|
|
|
|
return True
|
2026-04-13 06:17:13 +02:00
|
|
|
|
# RFC-1918 private ranges, link-local, and Tailscale CGNAT
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
|
|
|
|
|
addr = ipaddress.ip_address(host)
|
2026-04-13 06:17:13 +02:00
|
|
|
|
if addr.is_private or addr.is_loopback or addr.is_link_local:
|
|
|
|
|
|
return True
|
|
|
|
|
|
if isinstance(addr, ipaddress.IPv4Address) and addr in _TAILSCALE_CGNAT:
|
|
|
|
|
|
return True
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
# Bare IP that looks like a private range (e.g. 172.26.x.x for WSL)
|
2026-04-13 06:17:13 +02:00
|
|
|
|
# or Tailscale CGNAT (100.64.x.x–100.127.x.x).
|
2026-03-18 21:38:41 +01:00
|
|
|
|
parts = host.split(".")
|
|
|
|
|
|
if len(parts) == 4:
|
|
|
|
|
|
try:
|
|
|
|
|
|
first, second = int(parts[0]), int(parts[1])
|
|
|
|
|
|
if first == 10:
|
|
|
|
|
|
return True
|
|
|
|
|
|
if first == 172 and 16 <= second <= 31:
|
|
|
|
|
|
return True
|
|
|
|
|
|
if first == 192 and second == 168:
|
|
|
|
|
|
return True
|
2026-04-13 06:17:13 +02:00
|
|
|
|
if first == 100 and 64 <= second <= 127:
|
|
|
|
|
|
return True
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
return False
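# Illustrative behaviour (hypothetical inputs):
#   is_local_endpoint("http://localhost:11434/v1")        -> True   (loopback)
#   is_local_endpoint("http://host.docker.internal:8080") -> True   (container DNS)
#   is_local_endpoint("http://100.72.0.9:11434")          -> True   (Tailscale CGNAT)
#   is_local_endpoint("https://api.openai.com/v1")        -> False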
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""Detect which local server is running at base_url by probing known endpoints.
|
|
|
|
|
|
|
2026-03-19 21:32:04 +01:00
|
|
|
|
Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
|
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
server_url = normalized
|
|
|
|
|
|
if server_url.endswith("/v1"):
|
|
|
|
|
|
server_url = server_url[:-3]
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
headers = _auth_headers(api_key)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
with httpx.Client(timeout=2.0, headers=headers) as client:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
# LM Studio exposes /api/v1/models — check first (most specific)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
r = client.get(f"{server_url}/api/v1/models")
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if r.status_code == 200:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
return "lm-studio"
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
2026-03-19 21:32:04 +01:00
|
|
|
|
# Ollama exposes /api/tags and responds with {"models": [...]}
|
|
|
|
|
|
# LM Studio returns {"error": "Unexpected endpoint"} with status 200
|
|
|
|
|
|
# on this path, so we must verify the response contains "models".
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
r = client.get(f"{server_url}/api/tags")
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if r.status_code == 200:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
try:
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
if "models" in data:
|
|
|
|
|
|
return "ollama"
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
2026-03-21 18:07:18 -07:00
|
|
|
|
# llama.cpp exposes /v1/props (older builds used /props without the /v1 prefix)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-03-21 18:07:18 -07:00
|
|
|
|
r = client.get(f"{server_url}/v1/props")
|
|
|
|
|
|
if r.status_code != 200:
|
|
|
|
|
|
r = client.get(f"{server_url}/props") # fallback for older builds
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if r.status_code == 200 and "default_generation_settings" in r.text:
|
|
|
|
|
|
return "llamacpp"
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
# vLLM: /version
|
|
|
|
|
|
try:
|
|
|
|
|
|
r = client.get(f"{server_url}/version")
|
|
|
|
|
|
if r.status_code == 200:
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
if "version" in data:
|
|
|
|
|
|
return "vllm"
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
return None
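# Illustrative usage (assumed local setup):
#   detect_local_server_type("http://localhost:11434")   -> "ollama"    (via /api/tags)
#   detect_local_server_type("http://localhost:1234/v1") -> "lm-studio" (via /api/v1/models)
# A non-responsive or unrecognised server yields None.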
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
def _iter_nested_dicts(value: Any):
|
|
|
|
|
|
if isinstance(value, dict):
|
|
|
|
|
|
yield value
|
|
|
|
|
|
for nested in value.values():
|
|
|
|
|
|
yield from _iter_nested_dicts(nested)
|
|
|
|
|
|
elif isinstance(value, list):
|
|
|
|
|
|
for item in value:
|
|
|
|
|
|
yield from _iter_nested_dicts(item)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _coerce_reasonable_int(value: Any, minimum: int = 1024, maximum: int = 10_000_000) -> Optional[int]:
|
|
|
|
|
|
try:
|
|
|
|
|
|
if isinstance(value, bool):
|
|
|
|
|
|
return None
|
|
|
|
|
|
if isinstance(value, str):
|
|
|
|
|
|
value = value.strip().replace(",", "")
|
|
|
|
|
|
result = int(value)
|
|
|
|
|
|
except (TypeError, ValueError):
|
|
|
|
|
|
return None
|
|
|
|
|
|
if minimum <= result <= maximum:
|
|
|
|
|
|
return result
|
|
|
|
|
|
return None
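# Illustrative behaviour:
#   _coerce_reasonable_int("131,072") -> 131072  (comma-separated strings accepted)
#   _coerce_reasonable_int(True)      -> None    (bools are ints, but never lengths)
#   _coerce_reasonable_int(512)       -> None    (below the 1024 floor)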
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_first_int(payload: Dict[str, Any], keys: tuple[str, ...]) -> Optional[int]:
|
|
|
|
|
|
keyset = {key.lower() for key in keys}
|
|
|
|
|
|
for mapping in _iter_nested_dicts(payload):
|
|
|
|
|
|
for key, value in mapping.items():
|
|
|
|
|
|
if str(key).lower() not in keyset:
|
|
|
|
|
|
continue
|
|
|
|
|
|
coerced = _coerce_reasonable_int(value)
|
|
|
|
|
|
if coerced is not None:
|
|
|
|
|
|
return coerced
|
|
|
|
|
|
return None
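# Illustrative behaviour: nested payloads are searched depth-first, e.g.
#   _extract_first_int({"meta": {"max_model_len": 32768}}, _CONTEXT_LENGTH_KEYS) -> 32768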
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_context_length(payload: Dict[str, Any]) -> Optional[int]:
|
|
|
|
|
|
return _extract_first_int(payload, _CONTEXT_LENGTH_KEYS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_max_completion_tokens(payload: Dict[str, Any]) -> Optional[int]:
|
|
|
|
|
|
return _extract_first_int(payload, _MAX_COMPLETION_KEYS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_pricing(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
alias_map = {
|
|
|
|
|
|
"prompt": ("prompt", "input", "input_cost_per_token", "prompt_token_cost"),
|
|
|
|
|
|
"completion": ("completion", "output", "output_cost_per_token", "completion_token_cost"),
|
|
|
|
|
|
"request": ("request", "request_cost"),
|
|
|
|
|
|
"cache_read": ("cache_read", "cached_prompt", "input_cache_read", "cache_read_cost_per_token"),
|
|
|
|
|
|
"cache_write": ("cache_write", "cache_creation", "input_cache_write", "cache_write_cost_per_token"),
|
|
|
|
|
|
}
|
|
|
|
|
|
for mapping in _iter_nested_dicts(payload):
|
|
|
|
|
|
normalized = {str(key).lower(): value for key, value in mapping.items()}
|
|
|
|
|
|
if not any(any(alias in normalized for alias in aliases) for aliases in alias_map.values()):
|
|
|
|
|
|
continue
|
|
|
|
|
|
pricing: Dict[str, Any] = {}
|
|
|
|
|
|
for target, aliases in alias_map.items():
|
|
|
|
|
|
for alias in aliases:
|
|
|
|
|
|
if alias in normalized and normalized[alias] not in (None, ""):
|
|
|
|
|
|
pricing[target] = normalized[alias]
|
|
|
|
|
|
break
|
|
|
|
|
|
if pricing:
|
|
|
|
|
|
return pricing
|
|
|
|
|
|
return {}
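# Illustrative behaviour with an OpenRouter-style pricing block; note the
# alias mapping ("output" is normalised to "completion"):
#   _extract_pricing({"pricing": {"prompt": "0.000001", "output": "0.000002"}})
#   -> {"prompt": "0.000001", "completion": "0.000002"}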
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _add_model_aliases(cache: Dict[str, Dict[str, Any]], model_id: str, entry: Dict[str, Any]) -> None:
|
|
|
|
|
|
cache[model_id] = entry
|
|
|
|
|
|
if "/" in model_id:
|
|
|
|
|
|
bare_model = model_id.split("/", 1)[1]
|
|
|
|
|
|
cache.setdefault(bare_model, entry)
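# Illustrative behaviour: "anthropic/claude-opus-4.6" also becomes reachable as
# "claude-opus-4.6"; setdefault() means a later prefixed model never clobbers an
# existing bare-name entry.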
|
|
|
|
|
|
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
|
|
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
|
|
|
|
|
|
"""Fetch model metadata from OpenRouter (cached for 1 hour)."""
|
|
|
|
|
|
global _model_metadata_cache, _model_metadata_cache_time
|
|
|
|
|
|
|
|
|
|
|
|
if not force_refresh and _model_metadata_cache and (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL:
|
|
|
|
|
|
return _model_metadata_cache
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
|
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
data = response.json()
|
|
|
|
|
|
|
|
|
|
|
|
cache = {}
|
|
|
|
|
|
for model in data.get("data", []):
|
|
|
|
|
|
model_id = model.get("id", "")
|
2026-03-18 03:04:07 -07:00
|
|
|
|
entry = {
|
2026-02-21 22:31:43 -08:00
|
|
|
|
"context_length": model.get("context_length", 128000),
|
|
|
|
|
|
"max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096),
|
|
|
|
|
|
"name": model.get("name", model_id),
|
|
|
|
|
|
"pricing": model.get("pricing", {}),
|
|
|
|
|
|
}
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_add_model_aliases(cache, model_id, entry)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
canonical = model.get("canonical_slug", "")
|
|
|
|
|
|
if canonical and canonical != model_id:
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_add_model_aliases(cache, canonical, entry)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
|
|
_model_metadata_cache = cache
|
|
|
|
|
|
_model_metadata_cache_time = time.time()
|
|
|
|
|
|
logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
|
|
|
|
|
|
return cache
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logging.warning(f"Failed to fetch model metadata from OpenRouter: {e}")
|
|
|
|
|
|
return _model_metadata_cache or {}
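# Illustrative usage (hypothetical model id and value):
#   meta = fetch_model_metadata()
#   meta.get("anthropic/claude-opus-4.6", {}).get("context_length")  # e.g. 200000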
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
def fetch_endpoint_model_metadata(
|
|
|
|
|
|
base_url: str,
|
|
|
|
|
|
api_key: str = "",
|
|
|
|
|
|
force_refresh: bool = False,
|
|
|
|
|
|
) -> Dict[str, Dict[str, Any]]:
|
|
|
|
|
|
"""Fetch model metadata from an OpenAI-compatible ``/models`` endpoint.
|
|
|
|
|
|
|
|
|
|
|
|
This is used for explicit custom endpoints where hardcoded global model-name
|
|
|
|
|
|
defaults are unreliable. Results are cached in memory per base URL.
|
|
|
|
|
|
"""
|
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
if not normalized or _is_openrouter_base_url(normalized):
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
if not force_refresh:
|
|
|
|
|
|
cached = _endpoint_model_metadata_cache.get(normalized)
|
|
|
|
|
|
cached_at = _endpoint_model_metadata_cache_time.get(normalized, 0)
|
|
|
|
|
|
if cached is not None and (time.time() - cached_at) < _ENDPOINT_MODEL_CACHE_TTL:
|
|
|
|
|
|
return cached
|
|
|
|
|
|
|
|
|
|
|
|
candidates = [normalized]
|
|
|
|
|
|
if normalized.endswith("/v1"):
|
|
|
|
|
|
alternate = normalized[:-3].rstrip("/")
|
|
|
|
|
|
else:
|
|
|
|
|
|
alternate = normalized + "/v1"
|
|
|
|
|
|
if alternate and alternate not in candidates:
|
|
|
|
|
|
candidates.append(alternate)
|
|
|
|
|
|
|
|
|
|
|
|
headers = _auth_headers(api_key)  # reuse the shared bearer-header helper
|
|
|
|
|
|
last_error: Optional[Exception] = None
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
if is_local_endpoint(normalized):
|
|
|
|
|
|
try:
|
|
|
|
|
|
if detect_local_server_type(normalized, api_key=api_key) == "lm-studio":
|
|
|
|
|
|
server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized
|
|
|
|
|
|
response = requests.get(
|
|
|
|
|
|
server_url.rstrip("/") + "/api/v1/models",
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
timeout=10,
|
|
|
|
|
|
)
|
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
payload = response.json()
|
|
|
|
|
|
cache: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
for model in payload.get("models", []):
|
|
|
|
|
|
if not isinstance(model, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
model_id = model.get("key") or model.get("id")
|
|
|
|
|
|
if not model_id:
|
|
|
|
|
|
continue
|
|
|
|
|
|
entry: Dict[str, Any] = {"name": model.get("name", model_id)}
|
|
|
|
|
|
|
|
|
|
|
|
context_length = None
|
|
|
|
|
|
for inst in model.get("loaded_instances", []) or []:
|
|
|
|
|
|
if not isinstance(inst, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
cfg = inst.get("config", {})
|
|
|
|
|
|
ctx = cfg.get("context_length") if isinstance(cfg, dict) else None
|
|
|
|
|
|
if isinstance(ctx, int) and ctx > 0:
|
|
|
|
|
|
context_length = ctx
|
|
|
|
|
|
break
|
|
|
|
|
|
if context_length is None:
|
|
|
|
|
|
context_length = _extract_context_length(model)
|
|
|
|
|
|
if context_length is not None:
|
|
|
|
|
|
entry["context_length"] = context_length
|
|
|
|
|
|
|
|
|
|
|
|
max_completion_tokens = _extract_max_completion_tokens(model)
|
|
|
|
|
|
if max_completion_tokens is not None:
|
|
|
|
|
|
entry["max_completion_tokens"] = max_completion_tokens
|
|
|
|
|
|
|
|
|
|
|
|
pricing = _extract_pricing(model)
|
|
|
|
|
|
if pricing:
|
|
|
|
|
|
entry["pricing"] = pricing
|
|
|
|
|
|
|
|
|
|
|
|
_add_model_aliases(cache, model_id, entry)
|
|
|
|
|
|
alt_id = model.get("id")
|
|
|
|
|
|
if isinstance(alt_id, str) and alt_id and alt_id != model_id:
|
|
|
|
|
|
_add_model_aliases(cache, alt_id, entry)
|
|
|
|
|
|
|
|
|
|
|
|
_endpoint_model_metadata_cache[normalized] = cache
|
|
|
|
|
|
_endpoint_model_metadata_cache_time[normalized] = time.time()
|
|
|
|
|
|
return cache
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
last_error = exc
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
for candidate in candidates:
|
|
|
|
|
|
url = candidate.rstrip("/") + "/models"
|
|
|
|
|
|
try:
|
|
|
|
|
|
response = requests.get(url, headers=headers, timeout=10)
|
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
payload = response.json()
|
|
|
|
|
|
cache: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
for model in payload.get("data", []):
|
|
|
|
|
|
if not isinstance(model, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
model_id = model.get("id")
|
|
|
|
|
|
if not model_id:
|
|
|
|
|
|
continue
|
|
|
|
|
|
entry: Dict[str, Any] = {"name": model.get("name", model_id)}
|
|
|
|
|
|
context_length = _extract_context_length(model)
|
|
|
|
|
|
if context_length is not None:
|
|
|
|
|
|
entry["context_length"] = context_length
|
|
|
|
|
|
max_completion_tokens = _extract_max_completion_tokens(model)
|
|
|
|
|
|
if max_completion_tokens is not None:
|
|
|
|
|
|
entry["max_completion_tokens"] = max_completion_tokens
|
|
|
|
|
|
pricing = _extract_pricing(model)
|
|
|
|
|
|
if pricing:
|
|
|
|
|
|
entry["pricing"] = pricing
|
|
|
|
|
|
_add_model_aliases(cache, model_id, entry)
|
|
|
|
|
|
|
2026-03-19 06:01:16 -07:00
|
|
|
|
# If this is a llama.cpp server, query /props for actual allocated context
|
|
|
|
|
|
is_llamacpp = any(
|
|
|
|
|
|
m.get("owned_by") == "llamacpp"
|
|
|
|
|
|
for m in payload.get("data", []) if isinstance(m, dict)
|
|
|
|
|
|
)
|
|
|
|
|
|
if is_llamacpp:
|
|
|
|
|
|
try:
|
2026-03-21 18:07:18 -07:00
|
|
|
|
# Try /v1/props first (current llama.cpp); fall back to /props for older builds
|
|
|
|
|
|
base = candidate.rstrip("/").removesuffix("/v1")  # strip only a trailing /v1, not every occurrence
|
|
|
|
|
|
props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5)
|
|
|
|
|
|
if not props_resp.ok:
|
|
|
|
|
|
props_resp = requests.get(base + "/props", headers=headers, timeout=5)
|
2026-03-19 06:01:16 -07:00
|
|
|
|
if props_resp.ok:
|
|
|
|
|
|
props = props_resp.json()
|
|
|
|
|
|
gen_settings = props.get("default_generation_settings", {})
|
|
|
|
|
|
n_ctx = gen_settings.get("n_ctx")
|
|
|
|
|
|
model_alias = props.get("model_alias", "")
|
|
|
|
|
|
if n_ctx and model_alias and model_alias in cache:
|
|
|
|
|
|
cache[model_alias]["context_length"] = n_ctx
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_endpoint_model_metadata_cache[normalized] = cache
|
|
|
|
|
|
_endpoint_model_metadata_cache_time[normalized] = time.time()
|
|
|
|
|
|
return cache
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
last_error = exc
|
|
|
|
|
|
|
|
|
|
|
|
if last_error:
|
|
|
|
|
|
logger.debug("Failed to fetch model metadata from %s/models: %s", normalized, last_error)
|
|
|
|
|
|
_endpoint_model_metadata_cache[normalized] = {}
|
|
|
|
|
|
_endpoint_model_metadata_cache_time[normalized] = time.time()
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
|
def _get_context_cache_path() -> Path:
|
|
|
|
|
|
"""Return path to the persistent context length cache file."""
|
refactor: replace inline HERMES_HOME re-implementations with get_hermes_home()
16 callsites across 14 files were re-deriving the hermes home path
via os.environ.get('HERMES_HOME', ...) instead of using the canonical
get_hermes_home() from hermes_constants. This breaks profiles — each
profile has its own HERMES_HOME, and the inline fallback defaults to
~/.hermes regardless.
Fixed by importing and calling get_hermes_home() at each site. For
files already inside the hermes process (agent/, hermes_cli/, tools/,
gateway/, plugins/), this is always safe. Files that run outside the
process context (mcp_serve.py, mcp_oauth.py) already had correct
try/except ImportError fallbacks and were left alone.
Skipped: hermes_constants.py (IS the implementation), env_loader.py
(bootstrap), profiles.py (intentionally manipulates the env var),
standalone scripts (optional-skills/, skills/), and tests.
2026-04-07 10:40:34 -07:00
|
|
|
|
from hermes_constants import get_hermes_home
|
|
|
|
|
|
return get_hermes_home() / "context_length_cache.yaml"
|
2026-03-05 16:09:57 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_context_cache() -> Dict[str, int]:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""Load the model+provider -> context_length cache from disk."""
|
2026-03-05 16:09:57 -08:00
|
|
|
|
path = _get_context_cache_path()
|
|
|
|
|
|
if not path.exists():
|
|
|
|
|
|
return {}
|
|
|
|
|
|
try:
|
|
|
|
|
|
with open(path) as f:
|
|
|
|
|
|
data = yaml.safe_load(f) or {}
|
|
|
|
|
|
return data.get("context_lengths", {})
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.debug("Failed to load context length cache: %s", e)
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_context_length(model: str, base_url: str, length: int) -> None:
|
|
|
|
|
|
"""Persist a discovered context length for a model+provider combo.
|
|
|
|
|
|
|
|
|
|
|
|
Cache key is ``model@base_url`` so the same model name served from
|
|
|
|
|
|
different providers can have different limits.
|
|
|
|
|
|
"""
|
|
|
|
|
|
key = f"{model}@{base_url}"
|
|
|
|
|
|
cache = _load_context_cache()
|
|
|
|
|
|
if cache.get(key) == length:
|
|
|
|
|
|
return # already stored
|
|
|
|
|
|
cache[key] = length
|
|
|
|
|
|
path = _get_context_cache_path()
|
|
|
|
|
|
try:
|
|
|
|
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
with open(path, "w") as f:
|
|
|
|
|
|
yaml.dump({"context_lengths": cache}, f, default_flow_style=False)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
logger.info("Cached context length %s -> %s tokens", key, f"{length:,}")
|
2026-03-05 16:09:57 -08:00
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.debug("Failed to save context length cache: %s", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
|
|
|
|
|
|
"""Look up a previously discovered context length for model+provider."""
|
|
|
|
|
|
key = f"{model}@{base_url}"
|
|
|
|
|
|
cache = _load_context_cache()
|
|
|
|
|
|
return cache.get(key)
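# Illustrative round trip (hypothetical values):
#   save_context_length("qwen3.5-plus", "https://dashscope.aliyuncs.com/v1", 1_000_000)
#   get_cached_context_length("qwen3.5-plus", "https://dashscope.aliyuncs.com/v1") -> 1000000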
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_next_probe_tier(current_length: int) -> Optional[int]:
|
|
|
|
|
|
"""Return the next lower probe tier, or None if already at minimum."""
|
|
|
|
|
|
for tier in CONTEXT_PROBE_TIERS:
|
|
|
|
|
|
if tier < current_length:
|
|
|
|
|
|
return tier
|
|
|
|
|
|
return None
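# Illustrative behaviour, assuming the descending 128K/64K/32K/16K/8K tiers
# from the overhaul commit:
#   get_next_probe_tier(200_000)                  -> the 128K tier (largest tier strictly below)
#   get_next_probe_tier(min(CONTEXT_PROBE_TIERS)) -> None (nothing lower remains)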
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
|
|
|
|
|
|
"""Try to extract the actual context limit from an API error message.
|
|
|
|
|
|
|
|
|
|
|
|
Many providers include the limit in their error text, e.g.:
|
|
|
|
|
|
- "maximum context length is 32768 tokens"
|
|
|
|
|
|
- "context_length_exceeded: 131072"
|
|
|
|
|
|
- "Maximum context size 32768 exceeded"
|
|
|
|
|
|
- "model's max context length is 65536"
|
|
|
|
|
|
"""
|
|
|
|
|
|
error_lower = error_msg.lower()
|
|
|
|
|
|
# Pattern: look for numbers near context-related keywords
|
|
|
|
|
|
patterns = [
|
|
|
|
|
|
r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
|
|
|
|
|
|
r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
|
|
|
|
|
|
r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
|
test: comprehensive tests for model metadata + firecrawl config
model_metadata tests (61 tests, was 39):
- Token estimation: concrete value assertions, unicode, tool_call messages,
vision multimodal content, additive verification
- Context length resolution: cache-over-API priority, no-base_url skips cache,
missing context_length key in API response
- API metadata fetch: canonical_slug aliasing, TTL expiry with time mock,
stale cache fallback on API failure, malformed JSON resilience
- Probe tiers: above-max returns 2M, zero returns None
- Error parsing: Anthropic format ('X > Y maximum'), LM Studio, empty string,
unreasonably large numbers — also fixed parser to handle Anthropic format
- Cache: corruption resilience (garbage YAML, wrong structure), value updates,
special chars in model names
Firecrawl config tests (8 tests, was 4):
- Singleton caching (core purpose — verified constructor called once)
- Constructor failure recovery (retry after exception)
- Return value actually asserted (not just constructor args)
- Empty string env vars treated as absent
- Proper setup/teardown for env var isolation
2026-03-05 18:22:39 -08:00
|
|
|
|
r'>\s*(\d{4,})\s*(?:max|limit|token)', # "250000 tokens > 200000 maximum"
|
|
|
|
|
|
r'(\d{4,})\s*(?:max(?:imum)?)\b', # "200000 maximum"
|
2026-03-05 16:09:57 -08:00
|
|
|
|
]
|
|
|
|
|
|
for pattern in patterns:
|
|
|
|
|
|
match = re.search(pattern, error_lower)
|
|
|
|
|
|
if match:
|
|
|
|
|
|
limit = int(match.group(1))
|
|
|
|
|
|
# Sanity check: must be a reasonable context length
|
|
|
|
|
|
if 1024 <= limit <= 10_000_000:
|
|
|
|
|
|
return limit
|
|
|
|
|
|
return None
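# Illustrative behaviour against the documented provider formats:
#   parse_context_limit_from_error("maximum context length is 32768 tokens") -> 32768
#   parse_context_limit_from_error("250000 tokens > 200000 maximum")         -> 200000
#   parse_context_limit_from_error("unrelated failure")                      -> None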
|
|
|
|
|
|
|
|
|
|
|
|
|
fix(compaction): don't halve context_length on output-cap-too-large errors
When the API returns "max_tokens too large given prompt" (input tokens
are within the context window, but input + requested output > window),
the old code incorrectly routed through the same handler as "prompt too
long" errors, calling get_next_probe_tier() and permanently halving
context_length. This made things worse: the window was fine, only the
requested output size needed trimming for that one call.
Two distinct error classes now handled separately:
Prompt too long — input itself exceeds context window.
Fix: compress history + halve context_length (existing behaviour,
unchanged).
Output cap too large — input OK, but input + max_tokens > window.
Fix: parse available_tokens from the error message, set a one-shot
_ephemeral_max_output_tokens override for the retry, and leave
context_length completely untouched.
Changes:
- agent/model_metadata.py: add parse_available_output_tokens_from_error()
that detects Anthropic's "available_tokens: N" error format and returns
the available output budget, or None for all other error types.
- run_agent.py: call the new parser first in the is_context_length_error
block; if it fires, set _ephemeral_max_output_tokens (with a 64-token
safety margin) and break to retry without touching context_length.
_build_api_kwargs consumes the ephemeral value exactly once then clears
it so subsequent calls use self.max_tokens normally.
- agent/anthropic_adapter.py: expand build_anthropic_kwargs docstring to
clearly document the max_tokens (output cap) vs context_length (total
window) distinction, which is a persistent source of confusion due to
the OpenAI-inherited "max_tokens" name.
- cli-config.yaml.example: add inline comments explaining both keys side
by side where users are most likely to look.
- website/docs/integrations/providers.md: add a callout box at the top
of "Context Length Detection" and clarify the troubleshooting entry.
- tests/test_ctx_halving_fix.py: 24 tests across four classes covering
the parser, build_anthropic_kwargs clamping, ephemeral one-shot
consumption, and the invariant that context_length is never mutated
on output-cap errors.
2026-04-09 16:54:23 +02:00
|
|
|
|
def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
|
|
|
|
|
|
"""Detect an "output cap too large" error and return how many output tokens are available.
|
|
|
|
|
|
|
|
|
|
|
|
Background — two distinct context errors exist:
|
|
|
|
|
|
1. "Prompt too long" — the INPUT itself exceeds the context window.
|
|
|
|
|
|
Fix: compress history and/or halve context_length.
|
|
|
|
|
|
2. "max_tokens too large" — input is fine, but input + requested_output > window.
|
|
|
|
|
|
Fix: reduce max_tokens (the output cap) for this call.
|
|
|
|
|
|
Do NOT touch context_length — the window hasn't shrunk.
|
|
|
|
|
|
|
|
|
|
|
|
Anthropic's API returns errors like:
|
|
|
|
|
|
"max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000"
|
|
|
|
|
|
|
|
|
|
|
|
Returns the number of output tokens that would fit (e.g. 10000 above), or None if
|
|
|
|
|
|
the error does not look like a max_tokens-too-large error.
|
|
|
|
|
|
"""
|
|
|
|
|
|
error_lower = error_msg.lower()
|
|
|
|
|
|
|
|
|
|
|
|
# Must look like an output-cap error, not a prompt-length error.
|
|
|
|
|
|
is_output_cap_error = (
|
|
|
|
|
|
"max_tokens" in error_lower
|
|
|
|
|
|
and ("available_tokens" in error_lower or "available tokens" in error_lower)
|
|
|
|
|
|
)
|
|
|
|
|
|
if not is_output_cap_error:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
# Extract the available_tokens figure.
|
|
|
|
|
|
# Anthropic format: "… = available_tokens: 10000"
|
|
|
|
|
|
patterns = [
|
|
|
|
|
|
r'available_tokens[:\s]+(\d+)',
|
|
|
|
|
|
r'available\s+tokens[:\s]+(\d+)',
|
|
|
|
|
|
# fallback: last number after "=" in expressions like "200000 - 190000 = 10000"
|
|
|
|
|
|
r'=\s*(\d+)\s*$',
|
|
|
|
|
|
]
|
|
|
|
|
|
for pattern in patterns:
|
|
|
|
|
|
match = re.search(pattern, error_lower)
|
|
|
|
|
|
if match:
|
|
|
|
|
|
tokens = int(match.group(1))
|
|
|
|
|
|
if tokens >= 1:
|
|
|
|
|
|
return tokens
|
|
|
|
|
|
return None
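# Illustrative behaviour with the Anthropic format from the docstring:
#   msg = ("max_tokens: 32768 > context_window: 200000 - "
#          "input_tokens: 190000 = available_tokens: 10000")
#   parse_available_output_tokens_from_error(msg)                -> 10000
#   parse_available_output_tokens_from_error("prompt is too long") -> None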
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
def _model_id_matches(candidate_id: str, lookup_model: str) -> bool:
|
|
|
|
|
|
"""Return True if *candidate_id* (from server) matches *lookup_model* (configured).
|
|
|
|
|
|
|
|
|
|
|
|
Supports two forms:
|
|
|
|
|
|
- Exact match: "nvidia-nemotron-super-49b-v1" == "nvidia-nemotron-super-49b-v1"
|
|
|
|
|
|
- Slug match: "nvidia/nvidia-nemotron-super-49b-v1" matches "nvidia-nemotron-super-49b-v1"
|
|
|
|
|
|
(the part after the last "/" equals lookup_model)
|
|
|
|
|
|
|
|
|
|
|
|
This covers LM Studio's native API which stores models as "publisher/slug"
|
|
|
|
|
|
while users typically configure only the slug after the "local:" prefix.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if candidate_id == lookup_model:
|
|
|
|
|
|
return True
|
|
|
|
|
|
# Slug match: basename of candidate equals the lookup name
|
|
|
|
|
|
if "/" in candidate_id and candidate_id.rsplit("/", 1)[1] == lookup_model:
|
|
|
|
|
|
return True
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Optional[int]:
|
2026-04-07 22:23:28 -07:00
|
|
|
|
"""Query an Ollama server for the model's context length.
|
|
|
|
|
|
|
|
|
|
|
|
Returns the model's maximum context from GGUF metadata via ``/api/show``,
|
|
|
|
|
|
or the explicit ``num_ctx`` from the Modelfile if set. Returns None if
|
|
|
|
|
|
the server is unreachable or not Ollama.
|
|
|
|
|
|
|
|
|
|
|
|
This is the value that should be passed as ``num_ctx`` in Ollama chat
|
|
|
|
|
|
requests to override the default 2048.
|
|
|
|
|
|
"""
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
|
|
|
|
|
|
bare_model = _strip_provider_prefix(model)
|
|
|
|
|
|
server_url = base_url.rstrip("/")
|
|
|
|
|
|
if server_url.endswith("/v1"):
|
|
|
|
|
|
server_url = server_url[:-3]
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
server_type = detect_local_server_type(base_url, api_key=api_key)
|
2026-04-07 22:23:28 -07:00
|
|
|
|
except Exception:
|
|
|
|
|
|
return None
|
|
|
|
|
|
if server_type != "ollama":
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
headers = _auth_headers(api_key)
|
|
|
|
|
|
|
2026-04-07 22:23:28 -07:00
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
with httpx.Client(timeout=3.0, headers=headers) as client:
|
2026-04-07 22:23:28 -07:00
|
|
|
|
resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
return None
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
|
|
|
|
|
|
# Prefer explicit num_ctx from Modelfile parameters (user override)
|
|
|
|
|
|
params = data.get("parameters", "")
|
|
|
|
|
|
if "num_ctx" in params:
|
|
|
|
|
|
for line in params.split("\n"):
|
|
|
|
|
|
if "num_ctx" in line:
|
|
|
|
|
|
parts = line.strip().split()
|
|
|
|
|
|
if len(parts) >= 2:
|
|
|
|
|
|
try:
|
|
|
|
|
|
return int(parts[-1])
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
# Fall back to GGUF model_info context_length (training max)
|
|
|
|
|
|
model_info = data.get("model_info", {})
|
|
|
|
|
|
for key, value in model_info.items():
|
|
|
|
|
|
if "context_length" in key and isinstance(value, (int, float)):
|
|
|
|
|
|
return int(value)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
return None
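# Illustrative usage (assumed local Ollama whose Modelfile contains
# "PARAMETER num_ctx 16384"; the "local:" prefix is stripped, tag colons kept):
#   query_ollama_num_ctx("local:qwen3.5:27b", "http://localhost:11434/v1") -> 16384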
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""Query a local server for the model's context length."""
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
|
2026-03-20 03:19:31 -07:00
|
|
|
|
# Strip recognised provider prefix (e.g., "local:model-name" → "model-name").
|
|
|
|
|
|
# Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved.
|
|
|
|
|
|
model = _strip_provider_prefix(model)
|
2026-03-18 22:00:53 +01:00
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# Strip /v1 suffix to get the server root
|
|
|
|
|
|
server_url = base_url.rstrip("/")
|
|
|
|
|
|
if server_url.endswith("/v1"):
|
|
|
|
|
|
server_url = server_url[:-3]
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
headers = _auth_headers(api_key)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
server_type = detect_local_server_type(base_url, api_key=api_key)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except Exception:
|
|
|
|
|
|
server_type = None
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
with httpx.Client(timeout=3.0, headers=headers) as client:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# Ollama: /api/show returns model details with context info
|
|
|
|
|
|
if server_type == "ollama":
|
|
|
|
|
|
resp = client.post(f"{server_url}/api/show", json={"name": model})
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
2026-04-13 11:41:45 +02:00
|
|
|
|
# Prefer explicit num_ctx from Modelfile parameters: this is
|
|
|
|
|
|
# the *runtime* context Ollama will actually allocate KV cache
|
|
|
|
|
|
# for. The GGUF model_info.context_length is the training max,
|
|
|
|
|
|
# which can be larger than num_ctx — using it here would let
|
|
|
|
|
|
# Hermes grow conversations past the runtime limit and Ollama
|
|
|
|
|
|
# would silently truncate. Matches query_ollama_num_ctx().
|
2026-03-18 21:38:41 +01:00
|
|
|
|
params = data.get("parameters", "")
|
|
|
|
|
|
if "num_ctx" in params:
|
|
|
|
|
|
for line in params.split("\n"):
|
|
|
|
|
|
if "num_ctx" in line:
|
|
|
|
|
|
parts = line.strip().split()
|
|
|
|
|
|
if len(parts) >= 2:
|
|
|
|
|
|
try:
|
|
|
|
|
|
return int(parts[-1])
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
2026-04-13 11:41:45 +02:00
|
|
|
|
# Fall back to GGUF model_info context_length (training max)
|
|
|
|
|
|
model_info = data.get("model_info", {})
|
|
|
|
|
|
for key, value in model_info.items():
|
|
|
|
|
|
if "context_length" in key and isinstance(value, (int, float)):
|
|
|
|
|
|
return int(value)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
# LM Studio native API: /api/v1/models returns max_context_length.
|
|
|
|
|
|
# This is more reliable than the OpenAI-compat /v1/models which
|
|
|
|
|
|
# doesn't include context window information for LM Studio servers.
|
|
|
|
|
|
# Use _model_id_matches for fuzzy matching: LM Studio stores models as
|
|
|
|
|
|
# "publisher/slug" but users configure only "slug" after "local:" prefix.
|
|
|
|
|
|
if server_type == "lm-studio":
|
|
|
|
|
|
resp = client.get(f"{server_url}/api/v1/models")
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
for m in data.get("models", []):
|
|
|
|
|
|
if _model_id_matches(m.get("key", ""), model) or _model_id_matches(m.get("id", ""), model):
|
|
|
|
|
|
# Prefer loaded instance context (actual runtime value)
|
|
|
|
|
|
for inst in m.get("loaded_instances", []):
|
|
|
|
|
|
cfg = inst.get("config", {})
|
|
|
|
|
|
ctx = cfg.get("context_length")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
# Fall back to max_context_length (theoretical model max)
|
|
|
|
|
|
ctx = m.get("max_context_length") or m.get("context_length")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# LM Studio / vLLM / llama.cpp: try /v1/models/{model}
|
|
|
|
|
|
resp = client.get(f"{server_url}/v1/models/{model}")
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
# vLLM returns max_model_len
|
|
|
|
|
|
ctx = data.get("max_model_len") or data.get("context_length") or data.get("max_tokens")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
# Try /v1/models and find the model in the list.
|
|
|
|
|
|
# Use _model_id_matches to handle "publisher/slug" vs bare "slug".
|
2026-03-18 21:38:41 +01:00
|
|
|
|
resp = client.get(f"{server_url}/v1/models")
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
models_list = data.get("data", [])
|
|
|
|
|
|
for m in models_list:
|
2026-03-18 22:00:53 +01:00
|
|
|
|
if _model_id_matches(m.get("id", ""), model):
|
2026-03-18 21:38:41 +01:00
|
|
|
|
ctx = m.get("max_model_len") or m.get("context_length") or m.get("max_tokens")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
def _normalize_model_version(model: str) -> str:
|
|
|
|
|
|
"""Normalize version separators for matching.
|
|
|
|
|
|
|
|
|
|
|
|
Nous uses dashes: claude-opus-4-6, claude-sonnet-4-5
|
|
|
|
|
|
OpenRouter uses dots: claude-opus-4.6, claude-sonnet-4.5
|
|
|
|
|
|
Normalize both to dashes for comparison.
|
|
|
|
|
|
"""
|
|
|
|
|
|
return model.replace(".", "-")
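# Example: _normalize_model_version("claude-opus-4.6") == "claude-opus-4-6",
# so Nous's dashed IDs and OpenRouter's dotted IDs compare equal once both
# sides are normalized.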
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _query_anthropic_context_length(model: str, base_url: str, api_key: str) -> Optional[int]:
|
|
|
|
|
|
"""Query Anthropic's /v1/models endpoint for context length.
|
|
|
|
|
|
|
|
|
|
|
|
Only works with regular ANTHROPIC_API_KEY (sk-ant-api*).
|
|
|
|
|
|
OAuth tokens (sk-ant-oat*) from Claude Code return 401.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if not api_key or api_key.startswith("sk-ant-oat"):
|
|
|
|
|
|
return None # OAuth tokens can't access /v1/models
|
|
|
|
|
|
try:
|
|
|
|
|
|
base = base_url.rstrip("/")
|
|
|
|
|
|
if base.endswith("/v1"):
|
|
|
|
|
|
base = base[:-3]
|
|
|
|
|
|
url = f"{base}/v1/models?limit=1000"
|
|
|
|
|
|
headers = {
|
|
|
|
|
|
"x-api-key": api_key,
|
|
|
|
|
|
"anthropic-version": "2023-06-01",
|
|
|
|
|
|
}
|
|
|
|
|
|
resp = requests.get(url, headers=headers, timeout=10)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
return None
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
for m in data.get("data", []):
|
|
|
|
|
|
if m.get("id") == model:
|
|
|
|
|
|
ctx = m.get("max_input_tokens")
|
|
|
|
|
|
if isinstance(ctx, int) and ctx > 0:
|
|
|
|
|
|
return ctx
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.debug("Anthropic /v1/models query failed: %s", e)
|
|
|
|
|
|
return None
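# Illustrative /v1/models response item (fields assumed from the parsing
# above, values hypothetical):
#   {"id": "claude-opus-4-6", "max_input_tokens": 1000000, ...}
# Only an exact "id" match is accepted here; fuzzy matching is deliberately
# left to the Nous/models.dev steps.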
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_nous_context_length(model: str) -> Optional[int]:
|
|
|
|
|
|
"""Resolve Nous Portal model context length via OpenRouter metadata.
|
|
|
|
|
|
|
|
|
|
|
|
Nous model IDs are bare (e.g. 'claude-opus-4-6') while OpenRouter uses
|
|
|
|
|
|
prefixed IDs (e.g. 'anthropic/claude-opus-4.6'). Try suffix matching
|
|
|
|
|
|
with version normalization (dot↔dash).
|
|
|
|
|
|
"""
|
|
|
|
|
|
metadata = fetch_model_metadata() # OpenRouter cache
|
|
|
|
|
|
# Exact match first
|
|
|
|
|
|
if model in metadata:
|
|
|
|
|
|
return metadata[model].get("context_length")
|
|
|
|
|
|
|
|
|
|
|
|
normalized = _normalize_model_version(model).lower()
|
|
|
|
|
|
|
|
|
|
|
|
for or_id, entry in metadata.items():
|
|
|
|
|
|
bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
|
|
|
|
|
|
if bare.lower() == model.lower() or _normalize_model_version(bare).lower() == normalized:
|
|
|
|
|
|
return entry.get("context_length")
|
|
|
|
|
|
|
|
|
|
|
|
# Partial prefix match for cases like gemini-3-flash → gemini-3-flash-preview
|
|
|
|
|
|
# Require match to be at a word boundary (followed by -, :, or end of string)
|
|
|
|
|
|
model_lower = model.lower()
|
|
|
|
|
|
for or_id, entry in metadata.items():
|
|
|
|
|
|
bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
|
|
|
|
|
|
for candidate, query in [(bare.lower(), model_lower), (_normalize_model_version(bare).lower(), normalized)]:
|
|
|
|
|
|
if candidate.startswith(query) and (
|
|
|
|
|
|
len(candidate) == len(query) or candidate[len(query)] in "-:."
|
|
|
|
|
|
):
|
|
|
|
|
|
return entry.get("context_length")
|
|
|
|
|
|
|
|
|
|
|
|
return None
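# Example walk-through (IDs from the docstring above): "claude-opus-4-6"
# misses the exact lookup, then suffix-matches "anthropic/claude-opus-4.6"
# because both normalize to "claude-opus-4-6", and that entry's
# context_length is returned.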
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-19 06:01:16 -07:00
|
|
|
|
def get_model_context_length(
|
|
|
|
|
|
model: str,
|
|
|
|
|
|
base_url: str = "",
|
|
|
|
|
|
api_key: str = "",
|
|
|
|
|
|
config_context_length: int | None = None,
|
2026-03-20 06:04:33 -07:00
|
|
|
|
provider: str = "",
|
2026-03-19 06:01:16 -07:00
|
|
|
|
) -> int:
|
2026-03-05 16:09:57 -08:00
|
|
|
|
"""Get the context length for a model.
|
|
|
|
|
|
|
|
|
|
|
|
Resolution order:
|
2026-03-20 06:04:33 -07:00
|
|
|
|
0. Explicit config override (model.context_length or custom_providers per-model)
|
2026-03-05 16:09:57 -08:00
|
|
|
|
1. Persistent cache (previously discovered via probing)
|
2026-03-18 03:04:07 -07:00
|
|
|
|
2. Active endpoint metadata (/models for explicit custom endpoints)
|
2026-03-20 06:04:33 -07:00
|
|
|
|
3. Local server query (for local endpoints)
|
|
|
|
|
|
4. Anthropic /v1/models API (API-key users only, not OAuth)
|
4b. AWS Bedrock static context-length table (bedrock_adapter.py)
|
5. Provider-aware lookups: Nous suffix-match via OpenRouter cache, then
models.dev registry (per-provider context windows)
|
6. OpenRouter live API metadata (provider-unaware fallback)
|
7. Thin hardcoded defaults (broad family patterns)
|
8. Local server query (last-resort retry for local endpoints)
|
9. Default fallback (128K)
|
2026-03-05 16:09:57 -08:00
|
|
|
|
"""
|
2026-03-19 06:01:16 -07:00
|
|
|
|
# 0. Explicit config override — user knows best
|
|
|
|
|
|
if isinstance(config_context_length, int) and config_context_length > 0:
|
|
|
|
|
|
return config_context_length
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
# Normalise provider-prefixed model names (e.g. "local:model-name" →
|
|
|
|
|
|
# "model-name") so cache lookups and server queries use the bare ID that
|
2026-03-20 03:19:31 -07:00
|
|
|
|
# local servers actually know about. Ollama "model:tag" colons are preserved.
|
|
|
|
|
|
model = _strip_provider_prefix(model)
|
2026-03-18 22:00:53 +01:00
|
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
|
# 1. Check persistent cache (model+provider)
|
|
|
|
|
|
if base_url:
|
|
|
|
|
|
cached = get_cached_context_length(model, base_url)
|
|
|
|
|
|
if cached is not None:
|
|
|
|
|
|
return cached
|
|
|
|
|
|
|
2026-03-22 08:15:06 -07:00
|
|
|
|
# 2. Active endpoint metadata for truly custom/unknown endpoints.
|
|
|
|
|
|
# Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
|
|
|
|
|
|
# /models endpoint may report a provider-imposed limit (e.g. Copilot
|
|
|
|
|
|
# returns 128k) instead of the model's full context (400k). models.dev
|
|
|
|
|
|
# has the correct per-provider values and is checked at step 5+.
|
|
|
|
|
|
if _is_custom_endpoint(base_url) and not _is_known_provider_base_url(base_url):
|
2026-03-18 03:04:07 -07:00
|
|
|
|
endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
|
2026-03-19 06:01:16 -07:00
|
|
|
|
matched = endpoint_metadata.get(model)
|
|
|
|
|
|
if not matched:
|
|
|
|
|
|
# Single-model servers: if only one model is loaded, use it
|
|
|
|
|
|
if len(endpoint_metadata) == 1:
|
|
|
|
|
|
matched = next(iter(endpoint_metadata.values()))
|
|
|
|
|
|
else:
|
|
|
|
|
|
# Fuzzy match: substring in either direction
|
|
|
|
|
|
for key, entry in endpoint_metadata.items():
|
|
|
|
|
|
if model in key or key in model:
|
|
|
|
|
|
matched = entry
|
|
|
|
|
|
break
|
|
|
|
|
|
if matched:
|
|
|
|
|
|
context_length = matched.get("context_length")
|
2026-03-18 03:04:07 -07:00
|
|
|
|
if isinstance(context_length, int):
|
|
|
|
|
|
return context_length
|
|
|
|
|
|
if not _is_known_provider_base_url(base_url):
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 3. Try querying local server directly
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if is_local_endpoint(base_url):
|
2026-04-20 20:49:44 -07:00
|
|
|
|
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if local_ctx and local_ctx > 0:
|
|
|
|
|
|
save_context_length(model, base_url, local_ctx)
|
|
|
|
|
|
return local_ctx
|
2026-03-19 06:01:16 -07:00
|
|
|
|
logger.info(
|
|
|
|
|
|
"Could not detect context length for model %r at %s — "
|
|
|
|
|
|
"defaulting to %s tokens (probe-down). Set model.context_length "
|
|
|
|
|
|
"in config.yaml to override.",
|
2026-03-20 06:04:33 -07:00
|
|
|
|
model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}",
|
2026-03-19 06:01:16 -07:00
|
|
|
|
)
|
2026-03-20 06:04:33 -07:00
|
|
|
|
return DEFAULT_FALLBACK_CONTEXT
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 4. Anthropic /v1/models API (only for regular API keys, not OAuth)
|
|
|
|
|
|
if provider == "anthropic" or (
|
fix: extend hostname-match provider detection across remaining call sites
Aslaaen's fix in the original PR covered _detect_api_mode_for_url and the
two openai/xai sites in run_agent.py. This finishes the sweep: the same
substring-match false-positive class (e.g. https://api.openai.com.evil/v1,
https://proxy/api.openai.com/v1, https://api.anthropic.com.example/v1)
existed in eight more call sites, and the hostname helper was duplicated
in two modules.
- utils: add shared base_url_hostname() (single source of truth).
- hermes_cli/runtime_provider, run_agent: drop local duplicates, import
from utils. Reuse the cached AIAgent._base_url_hostname attribute
everywhere it's already populated.
- agent/auxiliary_client: switch codex-wrap auto-detect, max_completion_tokens
gate (auxiliary_max_tokens_param), and custom-endpoint max_tokens kwarg
selection to hostname equality.
- run_agent: native-anthropic check in the Claude-style model branch
and in the AIAgent init provider-auto-detect branch.
- agent/model_metadata: Anthropic /v1/models context-length lookup.
- hermes_cli/providers.determine_api_mode: anthropic / openai URL
heuristics for custom/unknown providers (the /anthropic path-suffix
convention for third-party gateways is preserved).
- tools/delegate_tool: anthropic detection for delegated subagent
runtimes.
- hermes_cli/setup, hermes_cli/tools_config: setup-wizard vision-endpoint
native-OpenAI detection (paired with deduping the repeated check into
a single is_native_openai boolean per branch).
Tests:
- tests/test_base_url_hostname.py covers the helper directly
(path-containing-host, host-suffix, trailing dot, port, case).
- tests/hermes_cli/test_determine_api_mode_hostname.py adds the same
regression class for determine_api_mode, plus a test that the
/anthropic third-party gateway convention still wins.
Also: add asslaenn5@gmail.com → Aslaaen to scripts/release.py AUTHOR_MAP.
2026-04-20 20:58:01 -07:00
|
|
|
|
base_url and base_url_hostname(base_url) == "api.anthropic.com"
|
2026-03-20 06:04:33 -07:00
|
|
|
|
):
|
|
|
|
|
|
ctx = _query_anthropic_context_length(model, base_url or "https://api.anthropic.com", api_key)
|
|
|
|
|
|
if ctx:
|
|
|
|
|
|
return ctx
|
|
|
|
|
|
|
feat: native AWS Bedrock provider via Converse API
Salvaged from PR #7920 by JiaDe-Wu — cherry-picked Bedrock-specific
additions onto current main, skipping stale-branch reverts (293 commits
behind).
Dual-path architecture:
- Claude models → AnthropicBedrock SDK (prompt caching, thinking budgets)
- Non-Claude models → Converse API via boto3 (Nova, DeepSeek, Llama, Mistral)
Includes:
- Core adapter (agent/bedrock_adapter.py, 1098 lines)
- Full provider registration (auth, models, providers, config, runtime, main)
- IAM credential chain + Bedrock API Key auth modes
- Dynamic model discovery via ListFoundationModels + ListInferenceProfiles
- Streaming with delta callbacks, error classification, guardrails
- hermes doctor + hermes auth integration
- /usage pricing for 7 Bedrock models
- 130 automated tests (79 unit + 28 integration + follow-up fixes)
- Documentation (website/docs/guides/aws-bedrock.md)
- boto3 optional dependency (pip install hermes-agent[bedrock])
Co-authored-by: JiaDe WU <40445668+JiaDe-Wu@users.noreply.github.com>
2026-04-15 15:18:01 -07:00
|
|
|
|
# 4b. AWS Bedrock — use static context length table.
|
|
|
|
|
|
# Bedrock's ListFoundationModels doesn't expose context window sizes,
|
|
|
|
|
|
# so we maintain a curated table in bedrock_adapter.py.
|
2026-04-20 21:17:28 -07:00
|
|
|
|
if provider == "bedrock" or (
|
|
|
|
|
|
base_url
|
|
|
|
|
|
and base_url_hostname(base_url).startswith("bedrock-runtime.")
|
|
|
|
|
|
and base_url_host_matches(base_url, "amazonaws.com")
|
|
|
|
|
|
):
|
2026-04-15 15:18:01 -07:00
|
|
|
|
try:
|
|
|
|
|
|
from agent.bedrock_adapter import get_bedrock_context_length
|
|
|
|
|
|
return get_bedrock_context_length(model)
|
|
|
|
|
|
except ImportError:
|
|
|
|
|
|
pass # boto3 not installed — fall through to generic resolution
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 5. Provider-aware lookups (before generic OpenRouter cache)
|
|
|
|
|
|
# These are provider-specific and take priority over the generic OR cache,
|
|
|
|
|
|
# since the same model can have different context limits per provider
|
|
|
|
|
|
# (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
|
# If provider is generic (openrouter/custom/empty), try to infer from URL.
|
|
|
|
|
|
effective_provider = provider
|
|
|
|
|
|
if not effective_provider or effective_provider in ("openrouter", "custom"):
|
|
|
|
|
|
if base_url:
|
|
|
|
|
|
inferred = _infer_provider_from_url(base_url)
|
|
|
|
|
|
if inferred:
|
|
|
|
|
|
effective_provider = inferred
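# e.g. (hostname-to-provider mapping illustrative): a DashScope base_url with
# provider "custom" infers an Alibaba provider ID, so qwen3.5-plus resolves to
# its 1M models.dev entry instead of the generic 131K "qwen" hardcoded default.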
|
|
|
|
|
|
|
|
|
|
|
|
if effective_provider == "nous":
|
2026-03-20 06:04:33 -07:00
|
|
|
|
ctx = _resolve_nous_context_length(model)
|
|
|
|
|
|
if ctx:
|
|
|
|
|
|
return ctx
|
2026-03-20 11:57:24 -07:00
|
|
|
|
if effective_provider:
|
2026-03-20 06:04:33 -07:00
|
|
|
|
from agent.models_dev import lookup_models_dev_context
|
2026-03-20 11:57:24 -07:00
|
|
|
|
ctx = lookup_models_dev_context(effective_provider, model)
|
2026-03-20 06:04:33 -07:00
|
|
|
|
if ctx:
|
|
|
|
|
|
return ctx
|
|
|
|
|
|
|
|
|
|
|
|
# 6. OpenRouter live API metadata (provider-unaware fallback)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
metadata = fetch_model_metadata()
|
|
|
|
|
|
if model in metadata:
|
|
|
|
|
|
return metadata[model].get("context_length", 128000)
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 7. Hardcoded defaults (fuzzy match — longest key first for specificity)
|
2026-03-20 08:52:37 -07:00
|
|
|
|
# Only check `default_model in model` (is the key a substring of the input).
|
|
|
|
|
|
# The reverse (`model in default_model`) causes shorter names like
|
|
|
|
|
|
# "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
|
2026-03-21 10:47:44 -07:00
|
|
|
|
model_lower = model.lower()
|
2026-03-17 04:12:08 -07:00
|
|
|
|
for default_model, length in sorted(
|
|
|
|
|
|
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
|
|
|
|
|
|
):
|
2026-03-21 10:47:44 -07:00
|
|
|
|
if default_model in model_lower:
|
2026-02-21 22:31:43 -08:00
|
|
|
|
return length
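# Hypothetical-key example: if DEFAULT_CONTEXT_LENGTHS held both
# "claude-sonnet-4" and "claude-sonnet-4-6", the longest-first sort tries
# "claude-sonnet-4-6" first, so "claude-sonnet-4-6-20260115" hits the more
# specific entry rather than the broader family pattern.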
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 8. Query local server as last resort
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if base_url and is_local_endpoint(base_url):
|
2026-04-20 20:49:44 -07:00
|
|
|
|
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if local_ctx and local_ctx > 0:
|
|
|
|
|
|
save_context_length(model, base_url, local_ctx)
|
|
|
|
|
|
return local_ctx
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|

    # 10. Default fallback — 128K
    return DEFAULT_FALLBACK_CONTEXT
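

# Illustrative resolution outcomes (hypothetical keyword name — the provider
# argument's exact spelling may differ from the real signature):
#
#     get_model_context_length("claude-opus-4.6", provider="anthropic")
#     -> 1_000_000   # provider-aware models.dev entry
#     get_model_context_length("claude-opus-4.6", provider="github-copilot")
#     -> 128_000     # same model, different provider window
#     get_model_context_length("totally-unknown-model")
#     -> DEFAULT_FALLBACK_CONTEXT  # 128K fallback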


def estimate_tokens_rough(text: str) -> int:
    """Rough token estimate (~4 chars/token) for pre-flight checks.

    Uses ceiling division so short texts (1-3 chars) never estimate as
    0 tokens, which would cause the compressor and pre-flight checks to
    systematically undercount when many short tool results are present.
    """
    if not text:
        return 0
    return (len(text) + 3) // 4
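
# Worked examples of the ceiling rule:
#
#     estimate_tokens_rough("")      -> 0  (empty short-circuit)
#     estimate_tokens_rough("ok")    -> 1  ((2 + 3) // 4)
#     estimate_tokens_rough("abcd")  -> 1  ((4 + 3) // 4)
#     estimate_tokens_rough("abcde") -> 2  ((5 + 3) // 4)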


def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Rough token estimate for a message list (pre-flight only)."""
    total_chars = sum(len(str(msg)) for msg in messages)
    return (total_chars + 3) // 4
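
# len(str(msg)) counts the dict's keys, quotes, and punctuation along with
# the content, so the estimate carries a little structural slack:
#
#     estimate_messages_tokens_rough([{"role": "user", "content": "hi"}])
#     -> 9   # str(...) is 33 chars, and (33 + 3) // 4 == 9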


def estimate_request_tokens_rough(
    messages: List[Dict[str, Any]],
    *,
    system_prompt: str = "",
    tools: Optional[List[Dict[str, Any]]] = None,
) -> int:
    """Rough token estimate for a full chat-completions request.

    Includes the major payload buckets Hermes sends to providers:
    system prompt, conversation messages, and tool schemas. With 50+
    tools enabled, schemas alone can add 20-30K tokens — a significant
    blind spot when only counting messages.
    """
    total_chars = 0
    if system_prompt:
        total_chars += len(system_prompt)
    if messages:
        total_chars += sum(len(str(msg)) for msg in messages)
    if tools:
        total_chars += len(str(tools))
    return (total_chars + 3) // 4
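
# The ceiling is applied once to the combined character count rather than per
# bucket, so this can come out up to two tokens below the sum of the three
# helpers called separately. Hypothetical usage sketch (the tool-schema shape
# here is illustrative, not Hermes's real schema):
#
#     estimate_request_tokens_rough(
#         [{"role": "user", "content": "hello"}],
#         system_prompt="You are a helpful assistant.",
#         tools=[{"type": "function", "function": {"name": "read_file"}}],
#     )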