2026-02-21 22:31:43 -08:00
|
|
|
"""Model metadata, context lengths, and token estimation utilities.
|
|
|
|
|
|
|
|
|
|
Pure utility functions with no AIAgent dependency. Used by ContextCompressor
|
|
|
|
|
and run_agent.py for pre-flight context checks.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import logging
|
2026-03-05 16:09:57 -08:00
|
|
|
import re
|
2026-02-21 22:31:43 -08:00
|
|
|
import time
|
2026-03-05 16:09:57 -08:00
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any, Dict, List, Optional
|
2026-03-18 03:04:07 -07:00
|
|
|
from urllib.parse import urlparse
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
import requests
|
2026-03-05 16:09:57 -08:00
|
|
|
import yaml
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
from hermes_constants import OPENROUTER_MODELS_URL
|
|
|
|
|
|
|
|
|
|
# Module-level logger, named after this module per standard logging convention.
logger: logging.Logger = logging.getLogger(__name__)
|
|
|
|
|
|
2026-03-20 03:19:31 -07:00
|
|
|
# Provider names that can appear as a "provider:" prefix before a model ID.
|
|
|
|
|
# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
|
|
|
|
|
# are preserved so the full model name reaches cache lookups and server queries.
|
|
|
|
|
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
|
|
|
|
|
"openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
|
2026-04-13 11:13:09 -07:00
|
|
|
"gemini", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "anthropic", "deepseek",
|
2026-03-20 03:19:31 -07:00
|
|
|
"opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
|
feat(qwen): add Qwen OAuth provider with portal request support
Based on #6079 by @tunamitom with critical fixes and comprehensive tests.
Changes from #6079:
- Fix: sanitization overwrite bug — Qwen message prep now runs AFTER codex
field sanitization, not before (was silently discarding Qwen transforms)
- Fix: missing try/except AuthError in runtime_provider.py — stale Qwen
credentials now fall through to next provider on auto-detect
- Fix: 'qwen' alias conflict — bare 'qwen' stays mapped to 'alibaba'
(DashScope); use 'qwen-portal' or 'qwen-cli' for the OAuth provider
- Fix: hardcoded ['coder-model'] replaced with live API fetch + curated
fallback list (qwen3-coder-plus, qwen3-coder)
- Fix: extract _is_qwen_portal() helper + _qwen_portal_headers() to replace
5 inline 'portal.qwen.ai' string checks and share headers between init
and credential swap
- Fix: add Qwen branch to _apply_client_headers_for_base_url for mid-session
credential swaps
- Fix: remove suspicious TypeError catch blocks around _prompt_provider_choice
- Fix: handle bare string items in content lists (were silently dropped)
- Fix: remove redundant dict() copies after deepcopy in message prep
- Revert: unrelated ai-gateway test mock removal and model_switch.py comment deletion
New tests (30 test functions):
- _qwen_cli_auth_path, _read_qwen_cli_tokens (success + 3 error paths)
- _save_qwen_cli_tokens (roundtrip, parent creation, permissions)
- _qwen_access_token_is_expiring (5 edge cases: fresh, expired, within skew,
None, non-numeric)
- _refresh_qwen_cli_tokens (success, preserve old refresh, 4 error paths,
default expires_in, disk persistence)
- resolve_qwen_runtime_credentials (fresh, auto-refresh, force-refresh,
missing token, env override)
- get_qwen_auth_status (logged in, not logged in)
- Runtime provider resolution (direct, pool entry, alias)
- _build_api_kwargs (metadata, vl_high_resolution_images, message formatting,
max_tokens suppression)
2026-04-08 20:48:21 +05:30
|
|
|
"qwen-oauth",
|
feat(xiaomi): add Xiaomi MiMo as first-class provider
Cherry-picked from PR #7702 by kshitijk4poor.
Adds Xiaomi MiMo as a direct provider (XIAOMI_API_KEY) with models:
- mimo-v2-pro (1M context), mimo-v2-omni (256K, multimodal), mimo-v2-flash (256K, cheapest)
Standard OpenAI-compatible provider checklist: auth.py, config.py, models.py,
main.py, providers.py, doctor.py, model_normalize.py, model_metadata.py,
models_dev.py, auxiliary_client.py, .env.example, cli-config.yaml.example.
Follow-up: vision tasks use mimo-v2-omni (multimodal) instead of the user's
main model. Non-vision aux uses the user's selected model. Added
_PROVIDER_VISION_MODELS dict for provider-specific vision model overrides.
On failure, falls back to aggregators (gemini flash) via existing fallback chain.
Corrects pre-existing context lengths: mimo-v2-pro 1048576→1000000,
mimo-v2-omni 1048576→256000, adds mimo-v2-flash 256000.
36 tests covering registry, aliases, auto-detect, credentials, models.dev,
normalization, URL mapping, providers module, doctor, aux client, vision
model override, and agent init.
2026-04-11 10:10:31 -07:00
|
|
|
"xiaomi",
|
feat(providers): add Arcee AI as direct API provider
Adds Arcee AI as a standard direct provider (ARCEEAI_API_KEY) with
Trinity models: trinity-large-thinking, trinity-large-preview, trinity-mini.
Standard OpenAI-compatible provider checklist: auth.py, config.py,
models.py, main.py, providers.py, doctor.py, model_normalize.py,
model_metadata.py, setup.py, trajectory_compressor.py.
Based on PR #9274 by arthurbr11, simplified to a standard direct
provider without dual-endpoint OpenRouter routing.
2026-04-13 17:16:43 -07:00
|
|
|
"arcee",
|
2026-03-20 03:19:31 -07:00
|
|
|
"custom", "local",
|
|
|
|
|
# Common aliases
|
2026-04-06 10:14:01 -07:00
|
|
|
"google", "google-gemini", "google-ai-studio",
|
2026-03-20 03:19:31 -07:00
|
|
|
"glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
|
2026-04-13 11:13:09 -07:00
|
|
|
"github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek",
|
2026-03-20 03:19:31 -07:00
|
|
|
"opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
|
feat(xiaomi): add Xiaomi MiMo as first-class provider
Cherry-picked from PR #7702 by kshitijk4poor.
Adds Xiaomi MiMo as a direct provider (XIAOMI_API_KEY) with models:
- mimo-v2-pro (1M context), mimo-v2-omni (256K, multimodal), mimo-v2-flash (256K, cheapest)
Standard OpenAI-compatible provider checklist: auth.py, config.py, models.py,
main.py, providers.py, doctor.py, model_normalize.py, model_metadata.py,
models_dev.py, auxiliary_client.py, .env.example, cli-config.yaml.example.
Follow-up: vision tasks use mimo-v2-omni (multimodal) instead of the user's
main model. Non-vision aux uses the user's selected model. Added
_PROVIDER_VISION_MODELS dict for provider-specific vision model overrides.
On failure, falls back to aggregators (gemini flash) via existing fallback chain.
Corrects pre-existing context lengths: mimo-v2-pro 1048576→1000000,
mimo-v2-omni 1048576→256000, adds mimo-v2-flash 256000.
36 tests covering registry, aliases, auto-detect, credentials, models.dev,
normalization, URL mapping, providers module, doctor, aux client, vision
model override, and agent init.
2026-04-11 10:10:31 -07:00
|
|
|
"mimo", "xiaomi-mimo",
|
feat(providers): add Arcee AI as direct API provider
Adds Arcee AI as a standard direct provider (ARCEEAI_API_KEY) with
Trinity models: trinity-large-thinking, trinity-large-preview, trinity-mini.
Standard OpenAI-compatible provider checklist: auth.py, config.py,
models.py, main.py, providers.py, doctor.py, model_normalize.py,
model_metadata.py, setup.py, trajectory_compressor.py.
Based on PR #9274 by arthurbr11, simplified to a standard direct
provider without dual-endpoint OpenRouter routing.
2026-04-13 17:16:43 -07:00
|
|
|
"arcee-ai", "arceeai",
|
feat(qwen): add Qwen OAuth provider with portal request support
Based on #6079 by @tunamitom with critical fixes and comprehensive tests.
Changes from #6079:
- Fix: sanitization overwrite bug — Qwen message prep now runs AFTER codex
field sanitization, not before (was silently discarding Qwen transforms)
- Fix: missing try/except AuthError in runtime_provider.py — stale Qwen
credentials now fall through to next provider on auto-detect
- Fix: 'qwen' alias conflict — bare 'qwen' stays mapped to 'alibaba'
(DashScope); use 'qwen-portal' or 'qwen-cli' for the OAuth provider
- Fix: hardcoded ['coder-model'] replaced with live API fetch + curated
fallback list (qwen3-coder-plus, qwen3-coder)
- Fix: extract _is_qwen_portal() helper + _qwen_portal_headers() to replace
5 inline 'portal.qwen.ai' string checks and share headers between init
and credential swap
- Fix: add Qwen branch to _apply_client_headers_for_base_url for mid-session
credential swaps
- Fix: remove suspicious TypeError catch blocks around _prompt_provider_choice
- Fix: handle bare string items in content lists (were silently dropped)
- Fix: remove redundant dict() copies after deepcopy in message prep
- Revert: unrelated ai-gateway test mock removal and model_switch.py comment deletion
New tests (30 test functions):
- _qwen_cli_auth_path, _read_qwen_cli_tokens (success + 3 error paths)
- _save_qwen_cli_tokens (roundtrip, parent creation, permissions)
- _qwen_access_token_is_expiring (5 edge cases: fresh, expired, within skew,
None, non-numeric)
- _refresh_qwen_cli_tokens (success, preserve old refresh, 4 error paths,
default expires_in, disk persistence)
- resolve_qwen_runtime_credentials (fresh, auto-refresh, force-refresh,
missing token, env override)
- get_qwen_auth_status (logged in, not logged in)
- Runtime provider resolution (direct, pool entry, alias)
- _build_api_kwargs (metadata, vl_high_resolution_images, message formatting,
max_tokens suppression)
2026-04-08 20:48:21 +05:30
|
|
|
"qwen-portal",
|
2026-03-20 03:19:31 -07:00
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
2026-03-20 08:52:37 -07:00
|
|
|
_OLLAMA_TAG_PATTERN = re.compile(
|
|
|
|
|
r"^(\d+\.?\d*b|latest|stable|q\d|fp?\d|instruct|chat|coder|vision|text)",
|
|
|
|
|
re.IGNORECASE,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
2026-03-20 03:19:31 -07:00
|
|
|
def _strip_provider_prefix(model: str) -> str:
    """Remove a recognised ``provider:`` prefix from a model string.

    ``"local:my-model"`` → ``"my-model"``
    ``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix)
    ``"qwen:0.5b"`` → ``"qwen:0.5b"`` (unchanged — Ollama model:tag)
    ``"deepseek:latest"``→ ``"deepseek:latest"``(unchanged — Ollama model:tag)
    """
    # URLs contain colons too; leave them (and colon-free names) untouched.
    if model.startswith("http") or ":" not in model:
        return model
    head, _, tail = model.partition(":")
    if head.strip().lower() not in _PROVIDER_PREFIXES:
        return model
    # A known provider name followed by something that looks like an Ollama
    # tag (e.g. "7b", "latest", "q4_0") is a model:tag, not provider:model.
    if _OLLAMA_TAG_PATTERN.match(tail.strip()):
        return model
    return tail
|
|
|
|
|
|
2026-02-21 22:31:43 -08:00
|
|
|
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
_model_metadata_cache_time: float = 0
|
|
|
|
|
_MODEL_CACHE_TTL = 3600
|
2026-03-18 03:04:07 -07:00
|
|
|
_endpoint_model_metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = {}
|
|
|
|
|
_endpoint_model_metadata_cache_time: Dict[str, float] = {}
|
|
|
|
|
_ENDPOINT_MODEL_CACHE_TTL = 300
|
2026-02-21 22:31:43 -08:00
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
# Descending context-length tiers used to probe when the model is unknown.
# Probing starts at 128K (a safe default for most modern models) and steps
# down on context-length errors until a tier succeeds.
CONTEXT_PROBE_TIERS = [128_000, 64_000, 32_000, 16_000, 8_000]

# Fallback context length when every detection method fails: the largest
# probe tier, so an undetected model still gets a reasonable window.
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
|
|
|
|
|
|
fix: prevent agent from stopping mid-task — compression floor, budget overhaul, activity tracking
Three root causes of the 'agent stops mid-task' gateway bug:
1. Compression threshold floor (64K tokens minimum)
- The 50% threshold on a 100K-context model fired at 50K tokens,
causing premature compression that made models lose track of
multi-step plans. Now threshold_tokens = max(50% * context, 64K).
- Models with <64K context are rejected at startup with a clear error.
2. Budget warning removal — grace call instead
- Removed the 70%/90% iteration budget warnings entirely. These
injected '[BUDGET WARNING: Provide your final response NOW]' into
tool results, causing models to abandon complex tasks prematurely.
- Now: no warnings during normal execution. When the budget is
actually exhausted (90/90), inject a user message asking the model
to summarise, allow one grace API call, and only then fall back
to _handle_max_iterations.
3. Activity touches during long terminal execution
- _wait_for_process polls every 0.2s but never reported activity.
The gateway's inactivity timeout (default 1800s) would fire during
long-running commands that appeared 'idle.'
- Now: thread-local activity callback fires every 10s during the
poll loop, keeping the gateway's activity tracker alive.
- Agent wires _touch_activity into the callback before each tool call.
Also: docs update noting 64K minimum context requirement.
Closes #7915 (root cause was agent-loop termination, not Weixin delivery limits).
2026-04-11 16:18:57 -07:00
|
|
|
# Hermes Agent refuses to run models below this context size: smaller
# windows cannot hold enough working memory for tool-calling workflows.
# Sessions, model switches, and cron jobs should all enforce this floor.
MINIMUM_CONTEXT_LENGTH = 64_000
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# Thin fallback defaults — only broad model family patterns.
|
|
|
|
|
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
|
|
|
|
# all miss. Replaced the previous 80+ entry dict.
|
|
|
|
|
# For provider-specific context lengths, models.dev is the primary source.
|
2026-02-21 22:31:43 -08:00
|
|
|
DEFAULT_CONTEXT_LENGTHS = {
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# Anthropic Claude 4.6 (1M context) — bare IDs only to avoid
|
|
|
|
|
# fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
|
|
|
|
|
# substring of "anthropic/claude-sonnet-4.6").
|
|
|
|
|
# OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
|
2026-03-20 04:38:59 -07:00
|
|
|
"claude-opus-4-6": 1000000,
|
|
|
|
|
"claude-sonnet-4-6": 1000000,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
"claude-opus-4.6": 1000000,
|
|
|
|
|
"claude-sonnet-4.6": 1000000,
|
|
|
|
|
# Catch-all for older Claude models (must sort after specific entries)
|
|
|
|
|
"claude": 200000,
|
fix: correct GPT-5 family context lengths in fallback defaults (#9309)
The generic 'gpt-5' fallback was set to 128,000 — which is the max
OUTPUT tokens, not the context window. GPT-5 base and most variants
(codex, mini) have 400,000 context. This caused /model to report
128k for models like gpt-5.3-codex when models.dev was unavailable.
Added specific entries for GPT-5 variants with different context sizes:
- gpt-5.4, gpt-5.4-pro: 1,050,000 (1.05M)
- gpt-5.4-mini, gpt-5.4-nano: 400,000
- gpt-5.3-codex-spark: 128,000 (reduced)
- gpt-5.1-chat: 128,000 (chat variant)
- gpt-5 (catch-all): 400,000
Sources: https://developers.openai.com/api/docs/models
2026-04-13 19:22:23 -07:00
|
|
|
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
|
|
|
|
|
# Source: https://developers.openai.com/api/docs/models
|
|
|
|
|
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
|
|
|
|
|
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
|
|
|
|
|
"gpt-5.4": 1050000, # GPT-5.4, GPT-5.4 Pro (1.05M context)
|
|
|
|
|
"gpt-5.3-codex-spark": 128000, # Spark variant has reduced 128k context
|
|
|
|
|
"gpt-5.1-chat": 128000, # Chat variant has 128k context
|
|
|
|
|
"gpt-5": 400000, # GPT-5.x base, mini, codex variants (400k)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
"gpt-4.1": 1047576,
|
|
|
|
|
"gpt-4": 128000,
|
|
|
|
|
# Google
|
|
|
|
|
"gemini": 1048576,
|
2026-04-06 10:14:01 -07:00
|
|
|
# Gemma (open models served via AI Studio)
|
2026-04-06 10:19:19 -07:00
|
|
|
"gemma-4-31b": 256000,
|
|
|
|
|
"gemma-4-26b": 256000,
|
2026-04-06 10:14:01 -07:00
|
|
|
"gemma-3": 131072,
|
|
|
|
|
"gemma": 8192, # fallback for older gemma models
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# DeepSeek
|
|
|
|
|
"deepseek": 128000,
|
|
|
|
|
# Meta
|
|
|
|
|
"llama": 131072,
|
2026-04-11 03:29:09 -07:00
|
|
|
# Qwen — specific model families before the catch-all.
|
|
|
|
|
# Official docs: https://help.aliyun.com/zh/model-studio/developer-reference/
|
|
|
|
|
"qwen3-coder-plus": 1000000, # 1M context
|
|
|
|
|
"qwen3-coder": 262144, # 256K context
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
"qwen": 131072,
|
fix: align MiniMax provider with official API docs
Aligns MiniMax provider with official API documentation. Fixes 6 bugs:
transport mismatch (openai_chat -> anthropic_messages), credential leak
in switch_model(), prompt caching sent to non-Anthropic endpoints,
dot-to-hyphen model name corruption, trajectory compressor URL routing,
and stale doctor health check.
Also corrects context window (204,800), thinking support (manual mode),
max output (131,072), and model catalog (M2 family only on /anthropic).
Source: https://platform.minimax.io/docs/api-reference/text-anthropic-api
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-04-10 03:53:18 -07:00
|
|
|
# MiniMax — official docs: 204,800 context for all models
|
|
|
|
|
# https://platform.minimax.io/docs/api-reference/text-anthropic-api
|
|
|
|
|
"minimax": 204800,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# GLM
|
|
|
|
|
"glm": 202752,
|
fix(model_metadata): add xAI Grok context length fallbacks
xAI /v1/models does not return context_length metadata, so Hermes
probes down to the 128k default whenever a user configures a custom
provider pointing at https://api.x.ai/v1. This forces every xAI user
to manually override model.context_length in config.yaml (2M for
Grok 4.20 / 4.1-fast / 4-fast) or lose most of the usable context
window.
Add DEFAULT_CONTEXT_LENGTHS entries for the Grok family so the
fallback lookup returns the correct value via substring matching.
Values sourced from models.dev (2026-04) and cross-checked against
the xAI /v1/models listing:
- grok-4.20-* 2,000,000 (reasoning, non-reasoning, multi-agent)
- grok-4-1-fast-* 2,000,000
- grok-4-fast-* 2,000,000
- grok-4 / grok-4-0709 256,000
- grok-code-fast-1 256,000
- grok-3* 131,072
- grok-2 / latest 131,072
- grok-2-vision* 8,192
- grok (catch-all) 131,072
Keys are ordered longest-first so that specific variants match before
the catch-all, consistent with the existing Claude/Gemma/MiniMax entries.
Add TestDefaultContextLengths.test_grok_models_context_lengths and
test_grok_substring_matching to pin the values and verify the full
lookup path. All 77 tests in test_model_metadata.py pass.
2026-04-10 12:08:16 +04:00
|
|
|
# xAI Grok — xAI /v1/models does not return context_length metadata,
|
|
|
|
|
# so these hardcoded fallbacks prevent Hermes from probing-down to
|
|
|
|
|
# the default 128k when the user points at https://api.x.ai/v1
|
|
|
|
|
# via a custom provider. Values sourced from models.dev (2026-04).
|
|
|
|
|
# Keys use substring matching (longest-first), so e.g. "grok-4.20"
|
|
|
|
|
# matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
|
|
|
|
|
"grok-code-fast": 256000, # grok-code-fast-1
|
|
|
|
|
"grok-4-1-fast": 2000000, # grok-4-1-fast-(non-)reasoning
|
|
|
|
|
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
|
|
|
|
|
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning
|
|
|
|
|
"grok-4.20": 2000000, # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
|
|
|
|
|
"grok-4": 256000, # grok-4, grok-4-0709
|
|
|
|
|
"grok-3": 131072, # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
|
|
|
|
|
"grok-2": 131072, # grok-2, grok-2-1212, grok-2-latest
|
|
|
|
|
"grok": 131072, # catch-all (grok-beta, unknown grok-*)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# Kimi
|
|
|
|
|
"kimi": 262144,
|
2026-04-03 13:45:16 -07:00
|
|
|
# Arcee
|
|
|
|
|
"trinity": 262144,
|
2026-04-13 21:16:14 -07:00
|
|
|
# OpenRouter
|
|
|
|
|
"elephant": 262144,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
# Hugging Face Inference Providers — model IDs use org/name format
|
|
|
|
|
"Qwen/Qwen3.5-397B-A17B": 131072,
|
2026-03-27 13:54:46 -07:00
|
|
|
"Qwen/Qwen3.5-35B-A3B": 131072,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
"deepseek-ai/DeepSeek-V3.2": 65536,
|
|
|
|
|
"moonshotai/Kimi-K2.5": 262144,
|
|
|
|
|
"moonshotai/Kimi-K2-Thinking": 262144,
|
fix: align MiniMax provider with official API docs
Aligns MiniMax provider with official API documentation. Fixes 6 bugs:
transport mismatch (openai_chat -> anthropic_messages), credential leak
in switch_model(), prompt caching sent to non-Anthropic endpoints,
dot-to-hyphen model name corruption, trajectory compressor URL routing,
and stale doctor health check.
Also corrects context window (204,800), thinking support (manual mode),
max output (131,072), and model catalog (M2 family only on /anthropic).
Source: https://platform.minimax.io/docs/api-reference/text-anthropic-api
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-04-10 03:53:18 -07:00
|
|
|
"MiniMaxAI/MiniMax-M2.5": 204800,
|
2026-04-11 11:02:58 -07:00
|
|
|
"XiaomiMiMo/MiMo-V2-Flash": 256000,
|
feat(xiaomi): add Xiaomi MiMo as first-class provider
Cherry-picked from PR #7702 by kshitijk4poor.
Adds Xiaomi MiMo as a direct provider (XIAOMI_API_KEY) with models:
- mimo-v2-pro (1M context), mimo-v2-omni (256K, multimodal), mimo-v2-flash (256K, cheapest)
Standard OpenAI-compatible provider checklist: auth.py, config.py, models.py,
main.py, providers.py, doctor.py, model_normalize.py, model_metadata.py,
models_dev.py, auxiliary_client.py, .env.example, cli-config.yaml.example.
Follow-up: vision tasks use mimo-v2-omni (multimodal) instead of the user's
main model. Non-vision aux uses the user's selected model. Added
_PROVIDER_VISION_MODELS dict for provider-specific vision model overrides.
On failure, falls back to aggregators (gemini flash) via existing fallback chain.
Corrects pre-existing context lengths: mimo-v2-pro 1048576→1000000,
mimo-v2-omni 1048576→256000, adds mimo-v2-flash 256000.
36 tests covering registry, aliases, auto-detect, credentials, models.dev,
normalization, URL mapping, providers module, doctor, aux client, vision
model override, and agent init.
2026-04-11 10:10:31 -07:00
|
|
|
"mimo-v2-pro": 1000000,
|
|
|
|
|
"mimo-v2-omni": 256000,
|
|
|
|
|
"mimo-v2-flash": 256000,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
"zai-org/GLM-5": 202752,
|
2026-02-21 22:31:43 -08:00
|
|
|
}
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
# Metadata keys under which OpenAI-compatible servers (vLLM, llama.cpp,
# LM Studio, Ollama, cloud APIs) report a model's maximum context window
# in their /models payloads. Presumably probed in tuple order by the
# endpoint-inspection path — confirm at the call site.
_CONTEXT_LENGTH_KEYS: tuple[str, ...] = (
    "context_length",
    "context_window",
    "max_context_length",
    "max_position_embeddings",
    "max_model_len",
    "max_input_tokens",
    "max_sequence_length",
    "max_seq_len",
    # llama.cpp-style fields (training context / configured context)
    "n_ctx_train",
    "n_ctx",
)
|
|
|
|
|
|
|
|
|
|
# Metadata keys under which servers report a model's maximum
# completion/output token budget (companion to _CONTEXT_LENGTH_KEYS).
_MAX_COMPLETION_KEYS: tuple[str, ...] = (
    "max_completion_tokens",
    "max_output_tokens",
    "max_tokens",
)
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
# Local server hostnames / address patterns. Exact-match hosts checked by
# is_local_endpoint() before falling back to IP-range heuristics.
_LOCAL_HOSTS: tuple[str, ...] = ("localhost", "127.0.0.1", "::1", "0.0.0.0")

# Docker / Podman / Lima DNS names that resolve to the host machine
# (e.g. "host.docker.internal"); matched by suffix in is_local_endpoint().
_CONTAINER_LOCAL_SUFFIXES: tuple[str, ...] = (
    ".docker.internal",
    ".containers.internal",
    ".lima.internal",
)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
def _normalize_base_url(base_url: str) -> str:
|
|
|
|
|
return (base_url or "").strip().rstrip("/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_openrouter_base_url(base_url: str) -> bool:
|
|
|
|
|
return "openrouter.ai" in _normalize_base_url(base_url).lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_custom_endpoint(base_url: str) -> bool:
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
return bool(normalized) and not _is_openrouter_base_url(normalized)
|
|
|
|
|
|
|
|
|
|
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
# Hostname fragment -> models.dev provider ID. Matched by substring against
# the parsed host in _infer_provider_from_url(); the first matching entry in
# insertion order wins, so keep more specific fragments before broader ones.
_URL_TO_PROVIDER: Dict[str, str] = {
    # OpenAI (API + ChatGPT-backed endpoints)
    "api.openai.com": "openai",
    "chatgpt.com": "openai",
    "api.anthropic.com": "anthropic",
    "api.z.ai": "zai",
    # Moonshot / Kimi — .cn domain maps to the China-region provider ID
    "api.moonshot.ai": "kimi-coding",
    "api.moonshot.cn": "kimi-coding-cn",
    "api.kimi.com": "kimi-coding",
    "api.arcee.ai": "arcee",
    # Deliberately TLD-less fragment: matches any api.minimax.* host
    "api.minimax": "minimax",
    # Alibaba DashScope (domestic + international endpoints)
    "dashscope.aliyuncs.com": "alibaba",
    "dashscope-intl.aliyuncs.com": "alibaba",
    "portal.qwen.ai": "qwen-oauth",
    "openrouter.ai": "openrouter",
    "generativelanguage.googleapis.com": "gemini",
    "inference-api.nousresearch.com": "nous",
    "api.deepseek.com": "deepseek",
    # GitHub Copilot serves two hostnames
    "api.githubcopilot.com": "copilot",
    "models.github.ai": "copilot",
    "api.fireworks.ai": "fireworks",
    "opencode.ai": "opencode-go",
    "api.x.ai": "xai",
    # Xiaomi MiMo — bare-domain catch-all kept after the api. variant
    "api.xiaomimimo.com": "xiaomi",
    "xiaomimimo.com": "xiaomi",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _infer_provider_from_url(base_url: str) -> Optional[str]:
    """Infer the models.dev provider name from a base URL.

    This allows context length resolution via models.dev for custom endpoints
    like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to
    explicitly set the provider name in config. Returns None when the host
    matches no known provider.
    """
    cleaned = _normalize_base_url(base_url)
    if not cleaned:
        return None
    # urlparse only populates netloc when a scheme is present, so prepend one
    # for scheme-less inputs; fall back to the path component just in case.
    parsed = urlparse(cleaned if "://" in cleaned else f"https://{cleaned}")
    host = (parsed.netloc or parsed.path).lower()
    # First hostname fragment that appears in the host wins.
    return next(
        (name for fragment, name in _URL_TO_PROVIDER.items() if fragment in host),
        None,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_known_provider_base_url(base_url: str) -> bool:
    """True when the base URL's host maps to a known models.dev provider."""
    provider = _infer_provider_from_url(base_url)
    return provider is not None
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
def is_local_endpoint(base_url: str) -> bool:
    """Return True if base_url points to a local machine (localhost / RFC-1918 / WSL)."""
    import ipaddress

    normalized = _normalize_base_url(base_url)
    if not normalized:
        return False

    # urlparse needs a scheme to populate .hostname reliably.
    candidate = normalized if "://" in normalized else f"http://{normalized}"
    try:
        host = urlparse(candidate).hostname or ""
    except Exception:
        return False

    # Well-known local hostnames (localhost and friends).
    if host in _LOCAL_HOSTS:
        return True

    # Docker / Podman / Lima internal DNS names (e.g. host.docker.internal).
    for suffix in _CONTAINER_LOCAL_SUFFIXES:
        if host.endswith(suffix):
            return True

    # Proper IP literals: RFC-1918 private ranges, loopback, link-local.
    try:
        parsed_ip = ipaddress.ip_address(host)
    except ValueError:
        parsed_ip = None
    if parsed_ip is not None:
        return parsed_ip.is_private or parsed_ip.is_loopback or parsed_ip.is_link_local

    # Dotted quads that ipaddress rejected but still look like a private
    # range (e.g. 172.26.x.x for WSL with odd formatting).
    octets = host.split(".")
    if len(octets) == 4:
        try:
            first = int(octets[0])
            second = int(octets[1])
        except ValueError:
            return False
        if first == 10:
            return True
        if first == 172 and 16 <= second <= 31:
            return True
        if first == 192 and second == 168:
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_local_server_type(base_url: str) -> Optional[str]:
    """Detect which local server is running at base_url by probing known endpoints.

    Probe order matters: endpoints are checked from most specific to least,
    because several servers answer 200 on paths they don't really implement.

    Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
    """
    import httpx  # local import keeps httpx optional for callers that never probe

    # Probe the server root, not the OpenAI-compat /v1 prefix.
    normalized = _normalize_base_url(base_url)
    server_url = normalized
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]

    try:
        with httpx.Client(timeout=2.0) as client:
            # LM Studio exposes /api/v1/models — check first (most specific)
            try:
                r = client.get(f"{server_url}/api/v1/models")
                if r.status_code == 200:
                    return "lm-studio"
            except Exception:
                pass
            # Ollama exposes /api/tags and responds with {"models": [...]}
            # LM Studio returns {"error": "Unexpected endpoint"} with status 200
            # on this path, so we must verify the response contains "models".
            try:
                r = client.get(f"{server_url}/api/tags")
                if r.status_code == 200:
                    try:
                        data = r.json()
                        if "models" in data:
                            return "ollama"
                    except Exception:
                        pass
            except Exception:
                pass
            # llama.cpp exposes /v1/props (older builds used /props without the /v1 prefix)
            try:
                r = client.get(f"{server_url}/v1/props")
                if r.status_code != 200:
                    r = client.get(f"{server_url}/props")  # fallback for older builds
                # "default_generation_settings" is llama.cpp-specific; its presence
                # distinguishes a real /props payload from an unrelated 200.
                if r.status_code == 200 and "default_generation_settings" in r.text:
                    return "llamacpp"
            except Exception:
                pass
            # vLLM: /version returns {"version": "..."}
            try:
                r = client.get(f"{server_url}/version")
                if r.status_code == 200:
                    data = r.json()
                    if "version" in data:
                        return "vllm"
            except Exception:
                pass
    except Exception:
        # Best-effort detection: any transport/client failure means "unknown".
        pass

    return None
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
def _iter_nested_dicts(value: Any):
|
|
|
|
|
if isinstance(value, dict):
|
|
|
|
|
yield value
|
|
|
|
|
for nested in value.values():
|
|
|
|
|
yield from _iter_nested_dicts(nested)
|
|
|
|
|
elif isinstance(value, list):
|
|
|
|
|
for item in value:
|
|
|
|
|
yield from _iter_nested_dicts(item)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _coerce_reasonable_int(value: Any, minimum: int = 1024, maximum: int = 10_000_000) -> Optional[int]:
|
|
|
|
|
try:
|
|
|
|
|
if isinstance(value, bool):
|
|
|
|
|
return None
|
|
|
|
|
if isinstance(value, str):
|
|
|
|
|
value = value.strip().replace(",", "")
|
|
|
|
|
result = int(value)
|
|
|
|
|
except (TypeError, ValueError):
|
|
|
|
|
return None
|
|
|
|
|
if minimum <= result <= maximum:
|
|
|
|
|
return result
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_first_int(payload: Dict[str, Any], keys: tuple[str, ...]) -> Optional[int]:
    """Walk *payload*'s nested dicts and return the first plausible int stored under any of *keys*.

    Key comparison is case-insensitive; values are validated via _coerce_reasonable_int.
    """
    wanted = {key.lower() for key in keys}
    for mapping in _iter_nested_dicts(payload):
        for key, value in mapping.items():
            if str(key).lower() in wanted:
                coerced = _coerce_reasonable_int(value)
                if coerced is not None:
                    return coerced
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_context_length(payload: Dict[str, Any]) -> Optional[int]:
    """Return the first plausible context-window size found anywhere in *payload*, or None."""
    return _extract_first_int(payload, _CONTEXT_LENGTH_KEYS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_max_completion_tokens(payload: Dict[str, Any]) -> Optional[int]:
    """Return the first plausible max-completion-tokens value found anywhere in *payload*, or None."""
    return _extract_first_int(payload, _MAX_COMPLETION_KEYS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_pricing(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Find the first nested dict in *payload* that looks like a pricing table.

    Provider APIs name pricing fields differently; each canonical key maps to
    the aliases seen in the wild. Returns a dict with canonical keys (prompt,
    completion, request, cache_read, cache_write), or {} if nothing matches.
    """
    alias_map = {
        "prompt": ("prompt", "input", "input_cost_per_token", "prompt_token_cost"),
        "completion": ("completion", "output", "output_cost_per_token", "completion_token_cost"),
        "request": ("request", "request_cost"),
        "cache_read": ("cache_read", "cached_prompt", "input_cache_read", "cache_read_cost_per_token"),
        "cache_write": ("cache_write", "cache_creation", "input_cache_write", "cache_write_cost_per_token"),
    }
    for mapping in _iter_nested_dicts(payload):
        lowered = {str(key).lower(): val for key, val in mapping.items()}
        # Skip dicts that contain no pricing-like key at all.
        has_pricing_key = any(
            alias in lowered for aliases in alias_map.values() for alias in aliases
        )
        if not has_pricing_key:
            continue
        extracted: Dict[str, Any] = {}
        for target, aliases in alias_map.items():
            for alias in aliases:
                # First alias with a real (non-None, non-empty) value wins.
                if lowered.get(alias) not in (None, ""):
                    extracted[target] = lowered[alias]
                    break
        if extracted:
            return extracted
    return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _add_model_aliases(cache: Dict[str, Dict[str, Any]], model_id: str, entry: Dict[str, Any]) -> None:
|
|
|
|
|
cache[model_id] = entry
|
|
|
|
|
if "/" in model_id:
|
|
|
|
|
bare_model = model_id.split("/", 1)[1]
|
|
|
|
|
cache.setdefault(bare_model, entry)
|
|
|
|
|
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
    """Fetch model metadata from OpenRouter (cached for 1 hour).

    Returns a mapping of model id (plus bare-slug aliases and canonical slugs)
    to a dict with keys: context_length, max_completion_tokens, name, pricing.
    On fetch failure the stale in-memory cache is returned if present, else {}.
    """
    global _model_metadata_cache, _model_metadata_cache_time

    # Serve from the in-memory cache while the TTL has not expired.
    if not force_refresh and _model_metadata_cache and (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL:
        return _model_metadata_cache

    try:
        response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
        response.raise_for_status()
        data = response.json()

        cache = {}
        for model in data.get("data", []):
            model_id = model.get("id", "")
            entry = {
                "context_length": model.get("context_length", 128000),
                "max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096),
                "name": model.get("name", model_id),
                "pricing": model.get("pricing", {}),
            }
            _add_model_aliases(cache, model_id, entry)
            # OpenRouter sometimes exposes a second id; alias it to the same entry.
            canonical = model.get("canonical_slug", "")
            if canonical and canonical != model_id:
                _add_model_aliases(cache, canonical, entry)

        _model_metadata_cache = cache
        _model_metadata_cache_time = time.time()
        logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
        return cache

    except Exception as e:
        # Fix: use the module logger with lazy %-formatting (was logging.warning
        # with an f-string, which bypassed this module's named logger).
        logger.warning("Failed to fetch model metadata from OpenRouter: %s", e)
        return _model_metadata_cache or {}
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
def fetch_endpoint_model_metadata(
    base_url: str,
    api_key: str = "",
    force_refresh: bool = False,
) -> Dict[str, Dict[str, Any]]:
    """Fetch model metadata from an OpenAI-compatible ``/models`` endpoint.

    This is used for explicit custom endpoints where hardcoded global model-name
    defaults are unreliable. Results are cached in memory per base URL.

    Both failure and success are cached (failures as {}) so a dead endpoint is
    not re-probed on every call within the TTL.
    """
    normalized = _normalize_base_url(base_url)
    # OpenRouter is handled by fetch_model_metadata(); skip it here.
    if not normalized or _is_openrouter_base_url(normalized):
        return {}

    if not force_refresh:
        cached = _endpoint_model_metadata_cache.get(normalized)
        cached_at = _endpoint_model_metadata_cache_time.get(normalized, 0)
        if cached is not None and (time.time() - cached_at) < _ENDPOINT_MODEL_CACHE_TTL:
            return cached

    # Try the URL as given, then the /v1-toggled variant (some servers mount
    # /models only at the root, others only under /v1).
    candidates = [normalized]
    if normalized.endswith("/v1"):
        alternate = normalized[:-3].rstrip("/")
    else:
        alternate = normalized + "/v1"
    if alternate and alternate not in candidates:
        candidates.append(alternate)

    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    last_error: Optional[Exception] = None

    for candidate in candidates:
        url = candidate.rstrip("/") + "/models"
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            payload = response.json()
            cache: Dict[str, Dict[str, Any]] = {}
            for model in payload.get("data", []):
                if not isinstance(model, dict):
                    continue
                model_id = model.get("id")
                if not model_id:
                    continue
                # Only record fields the endpoint actually reported; absent keys
                # let later resolution stages supply their own defaults.
                entry: Dict[str, Any] = {"name": model.get("name", model_id)}
                context_length = _extract_context_length(model)
                if context_length is not None:
                    entry["context_length"] = context_length
                max_completion_tokens = _extract_max_completion_tokens(model)
                if max_completion_tokens is not None:
                    entry["max_completion_tokens"] = max_completion_tokens
                pricing = _extract_pricing(model)
                if pricing:
                    entry["pricing"] = pricing
                _add_model_aliases(cache, model_id, entry)

            # If this is a llama.cpp server, query /props for actual allocated context
            is_llamacpp = any(
                m.get("owned_by") == "llamacpp"
                for m in payload.get("data", []) if isinstance(m, dict)
            )
            if is_llamacpp:
                try:
                    # Try /v1/props first (current llama.cpp); fall back to /props for older builds
                    # NOTE(review): .replace("/v1", "") strips every "/v1" in the
                    # URL, not just a trailing one — fine for typical base URLs,
                    # but a path containing "/v1" mid-string would be mangled.
                    base = candidate.rstrip("/").replace("/v1", "")
                    props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5)
                    if not props_resp.ok:
                        props_resp = requests.get(base + "/props", headers=headers, timeout=5)
                    if props_resp.ok:
                        props = props_resp.json()
                        gen_settings = props.get("default_generation_settings", {})
                        n_ctx = gen_settings.get("n_ctx")
                        model_alias = props.get("model_alias", "")
                        # Override with the runtime-allocated context, which may be
                        # smaller than the model's theoretical maximum.
                        if n_ctx and model_alias and model_alias in cache:
                            cache[model_alias]["context_length"] = n_ctx
                except Exception:
                    pass

            _endpoint_model_metadata_cache[normalized] = cache
            _endpoint_model_metadata_cache_time[normalized] = time.time()
            return cache
        except Exception as exc:
            # Remember the failure but keep trying remaining candidates.
            last_error = exc

    if last_error:
        logger.debug("Failed to fetch model metadata from %s/models: %s", normalized, last_error)
    # Negative-cache the failure so we don't hammer a dead endpoint.
    _endpoint_model_metadata_cache[normalized] = {}
    _endpoint_model_metadata_cache_time[normalized] = time.time()
    return {}
|
|
|
|
|
|
|
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
def _get_context_cache_path() -> Path:
    """Return path to the persistent context length cache file."""
    # Imported lazily so the profile-specific HERMES_HOME is resolved at call
    # time, not at module import.
    from hermes_constants import get_hermes_home

    home = get_hermes_home()
    return home / "context_length_cache.yaml"
|
2026-03-05 16:09:57 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_context_cache() -> Dict[str, int]:
    """Load the model+provider -> context_length cache from disk."""
    cache_file = _get_context_cache_path()
    if not cache_file.exists():
        return {}
    try:
        loaded = yaml.safe_load(cache_file.read_text()) or {}
        # .get stays inside the try: a malformed file may not parse to a dict.
        return loaded.get("context_lengths", {})
    except Exception as e:
        logger.debug("Failed to load context length cache: %s", e)
        return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_context_length(model: str, base_url: str, length: int) -> None:
    """Persist a discovered context length for a model+provider combo.

    Cache key is ``model@base_url`` so the same model name served from
    different providers can have different limits.
    """
    cache_key = f"{model}@{base_url}"
    known = _load_context_cache()
    if known.get(cache_key) == length:
        # Nothing changed — skip the disk write.
        return
    known[cache_key] = length

    cache_file = _get_context_cache_path()
    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_file, "w") as fh:
            yaml.dump({"context_lengths": known}, fh, default_flow_style=False)
        logger.info("Cached context length %s -> %s tokens", cache_key, f"{length:,}")
    except Exception as e:
        # Best-effort persistence: a failed write only costs a re-discovery later.
        logger.debug("Failed to save context length cache: %s", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
    """Look up a previously discovered context length for model+provider."""
    return _load_context_cache().get(f"{model}@{base_url}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_next_probe_tier(current_length: int) -> Optional[int]:
    """Return the next lower probe tier, or None if already at minimum.

    CONTEXT_PROBE_TIERS is ordered high-to-low, so the first tier strictly
    below *current_length* is the next step down.
    """
    return next((tier for tier in CONTEXT_PROBE_TIERS if tier < current_length), None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    """Try to extract the actual context limit from an API error message.

    Many providers include the limit in their error text, e.g.:
    - "maximum context length is 32768 tokens"
    - "context_length_exceeded: 131072"
    - "Maximum context size 32768 exceeded"
    - "model's max context length is 65536"
    """
    text = error_msg.lower()
    # Ordered: most specific phrasings first. Each pattern captures a number
    # (>= 4 digits) adjacent to a context-related keyword.
    patterns = (
        r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
        r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
        r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
        r'>\s*(\d{4,})\s*(?:max|limit|token)',  # "250000 tokens > 200000 maximum"
        r'(\d{4,})\s*(?:max(?:imum)?)\b',  # "200000 maximum"
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found is None:
            continue
        candidate = int(found.group(1))
        # Sanity check: must be a reasonable context length; otherwise keep
        # trying the remaining patterns.
        if 1024 <= candidate <= 10_000_000:
            return candidate
    return None
|
|
|
|
|
|
|
|
|
|
|
fix(compaction): don't halve context_length on output-cap-too-large errors
When the API returns "max_tokens too large given prompt" (input tokens
are within the context window, but input + requested output > window),
the old code incorrectly routed through the same handler as "prompt too
long" errors, calling get_next_probe_tier() and permanently halving
context_length. This made things worse: the window was fine, only the
requested output size needed trimming for that one call.
Two distinct error classes now handled separately:
Prompt too long — input itself exceeds context window.
Fix: compress history + halve context_length (existing behaviour,
unchanged).
Output cap too large — input OK, but input + max_tokens > window.
Fix: parse available_tokens from the error message, set a one-shot
_ephemeral_max_output_tokens override for the retry, and leave
context_length completely untouched.
Changes:
- agent/model_metadata.py: add parse_available_output_tokens_from_error()
that detects Anthropic's "available_tokens: N" error format and returns
the available output budget, or None for all other error types.
- run_agent.py: call the new parser first in the is_context_length_error
block; if it fires, set _ephemeral_max_output_tokens (with a 64-token
safety margin) and break to retry without touching context_length.
_build_api_kwargs consumes the ephemeral value exactly once then clears
it so subsequent calls use self.max_tokens normally.
- agent/anthropic_adapter.py: expand build_anthropic_kwargs docstring to
clearly document the max_tokens (output cap) vs context_length (total
window) distinction, which is a persistent source of confusion due to
the OpenAI-inherited "max_tokens" name.
- cli-config.yaml.example: add inline comments explaining both keys side
by side where users are most likely to look.
- website/docs/integrations/providers.md: add a callout box at the top
of "Context Length Detection" and clarify the troubleshooting entry.
- tests/test_ctx_halving_fix.py: 24 tests across four classes covering
the parser, build_anthropic_kwargs clamping, ephemeral one-shot
consumption, and the invariant that context_length is never mutated
on output-cap errors.
2026-04-09 16:54:23 +02:00
|
|
|
def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
    """Detect an "output cap too large" error and return how many output tokens are available.

    Background — two distinct context errors exist:
    1. "Prompt too long" — the INPUT itself exceeds the context window.
       Fix: compress history and/or halve context_length.
    2. "max_tokens too large" — input is fine, but input + requested_output > window.
       Fix: reduce max_tokens (the output cap) for this call.
       Do NOT touch context_length — the window hasn't shrunk.

    Anthropic's API returns errors like:
    "max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000"

    Returns the number of output tokens that would fit (e.g. 10000 above), or None if
    the error does not look like a max_tokens-too-large error.
    """
    text = error_msg.lower()

    # Gate: a genuine output-cap error mentions both max_tokens and an
    # available-tokens figure. Prompt-too-long errors never include the latter.
    if "max_tokens" not in text:
        return None
    if "available_tokens" not in text and "available tokens" not in text:
        return None

    # Extraction patterns, most explicit first; the last is a fallback for
    # arithmetic-style messages like "200000 - 190000 = 10000".
    for pattern in (
        r'available_tokens[:\s]+(\d+)',
        r'available\s+tokens[:\s]+(\d+)',
        r'=\s*(\d+)\s*$',
    ):
        found = re.search(pattern, text)
        if found and (budget := int(found.group(1))) >= 1:
            return budget
    return None
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
def _model_id_matches(candidate_id: str, lookup_model: str) -> bool:
|
|
|
|
|
"""Return True if *candidate_id* (from server) matches *lookup_model* (configured).
|
|
|
|
|
|
|
|
|
|
Supports two forms:
|
|
|
|
|
- Exact match: "nvidia-nemotron-super-49b-v1" == "nvidia-nemotron-super-49b-v1"
|
|
|
|
|
- Slug match: "nvidia/nvidia-nemotron-super-49b-v1" matches "nvidia-nemotron-super-49b-v1"
|
|
|
|
|
(the part after the last "/" equals lookup_model)
|
|
|
|
|
|
|
|
|
|
This covers LM Studio's native API which stores models as "publisher/slug"
|
|
|
|
|
while users typically configure only the slug after the "local:" prefix.
|
|
|
|
|
"""
|
|
|
|
|
if candidate_id == lookup_model:
|
|
|
|
|
return True
|
|
|
|
|
# Slug match: basename of candidate equals the lookup name
|
|
|
|
|
if "/" in candidate_id and candidate_id.rsplit("/", 1)[1] == lookup_model:
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
2026-04-07 22:23:28 -07:00
|
|
|
def query_ollama_num_ctx(model: str, base_url: str) -> Optional[int]:
    """Query an Ollama server for the model's context length.

    Returns the model's maximum context from GGUF metadata via ``/api/show``,
    or the explicit ``num_ctx`` from the Modelfile if set. Returns None if
    the server is unreachable or not Ollama.

    This is the value that should be passed as ``num_ctx`` in Ollama chat
    requests to override the default 2048.
    """
    import httpx  # local import keeps httpx optional for callers that never probe

    # Strip the provider prefix but keep Ollama's "model:tag" colon intact.
    bare_model = _strip_provider_prefix(model)
    # Query the server root, not the OpenAI-compat /v1 prefix.
    server_url = base_url.rstrip("/")
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]

    try:
        server_type = detect_local_server_type(base_url)
    except Exception:
        return None
    # /api/show is Ollama-specific; bail out for any other server type.
    if server_type != "ollama":
        return None

    try:
        with httpx.Client(timeout=3.0) as client:
            resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
            if resp.status_code != 200:
                return None
            data = resp.json()

            # Prefer explicit num_ctx from Modelfile parameters (user override)
            params = data.get("parameters", "")
            if "num_ctx" in params:
                # "parameters" is newline-separated "key value" text; take the
                # last whitespace-separated token on the num_ctx line.
                for line in params.split("\n"):
                    if "num_ctx" in line:
                        parts = line.strip().split()
                        if len(parts) >= 2:
                            try:
                                return int(parts[-1])
                            except ValueError:
                                pass

            # Fall back to GGUF model_info context_length (training max)
            model_info = data.get("model_info", {})
            for key, value in model_info.items():
                # The key is architecture-prefixed (varies by model family), so
                # substring-match on "context_length".
                if "context_length" in key and isinstance(value, (int, float)):
                    return int(value)
    except Exception:
        # Best-effort probe: treat any failure as "unknown".
        pass
    return None
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
    """Query a local server for the model's context length.

    Tries server-specific APIs first (Ollama /api/show, LM Studio
    /api/v1/models), then generic OpenAI-compat fallbacks
    (/v1/models/{model}, then the /v1/models list). Returns None when
    nothing answers with a usable value.
    """
    import httpx  # local import keeps httpx optional for callers that never probe

    # Strip recognised provider prefix (e.g., "local:model-name" → "model-name").
    # Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved.
    model = _strip_provider_prefix(model)

    # Strip /v1 suffix to get the server root
    server_url = base_url.rstrip("/")
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]

    try:
        server_type = detect_local_server_type(base_url)
    except Exception:
        # Unknown server type still allows the generic /v1/models fallbacks below.
        server_type = None

    try:
        with httpx.Client(timeout=3.0) as client:
            # Ollama: /api/show returns model details with context info
            if server_type == "ollama":
                resp = client.post(f"{server_url}/api/show", json={"name": model})
                if resp.status_code == 200:
                    data = resp.json()
                    # Prefer explicit num_ctx from Modelfile parameters: this is
                    # the *runtime* context Ollama will actually allocate KV cache
                    # for. The GGUF model_info.context_length is the training max,
                    # which can be larger than num_ctx — using it here would let
                    # Hermes grow conversations past the runtime limit and Ollama
                    # would silently truncate. Matches query_ollama_num_ctx().
                    params = data.get("parameters", "")
                    if "num_ctx" in params:
                        for line in params.split("\n"):
                            if "num_ctx" in line:
                                parts = line.strip().split()
                                if len(parts) >= 2:
                                    try:
                                        return int(parts[-1])
                                    except ValueError:
                                        pass
                    # Fall back to GGUF model_info context_length (training max)
                    model_info = data.get("model_info", {})
                    for key, value in model_info.items():
                        if "context_length" in key and isinstance(value, (int, float)):
                            return int(value)

            # LM Studio native API: /api/v1/models returns max_context_length.
            # This is more reliable than the OpenAI-compat /v1/models which
            # doesn't include context window information for LM Studio servers.
            # Use _model_id_matches for fuzzy matching: LM Studio stores models as
            # "publisher/slug" but users configure only "slug" after "local:" prefix.
            if server_type == "lm-studio":
                resp = client.get(f"{server_url}/api/v1/models")
                if resp.status_code == 200:
                    data = resp.json()
                    for m in data.get("models", []):
                        if _model_id_matches(m.get("key", ""), model) or _model_id_matches(m.get("id", ""), model):
                            # Prefer loaded instance context (actual runtime value)
                            for inst in m.get("loaded_instances", []):
                                cfg = inst.get("config", {})
                                ctx = cfg.get("context_length")
                                if ctx and isinstance(ctx, (int, float)):
                                    return int(ctx)
                            # Fall back to max_context_length (theoretical model max)
                            ctx = m.get("max_context_length") or m.get("context_length")
                            if ctx and isinstance(ctx, (int, float)):
                                return int(ctx)

            # LM Studio / vLLM / llama.cpp: try /v1/models/{model}
            resp = client.get(f"{server_url}/v1/models/{model}")
            if resp.status_code == 200:
                data = resp.json()
                # vLLM returns max_model_len
                ctx = data.get("max_model_len") or data.get("context_length") or data.get("max_tokens")
                if ctx and isinstance(ctx, (int, float)):
                    return int(ctx)

            # Try /v1/models and find the model in the list.
            # Use _model_id_matches to handle "publisher/slug" vs bare "slug".
            resp = client.get(f"{server_url}/v1/models")
            if resp.status_code == 200:
                data = resp.json()
                models_list = data.get("data", [])
                for m in models_list:
                    if _model_id_matches(m.get("id", ""), model):
                        ctx = m.get("max_model_len") or m.get("context_length") or m.get("max_tokens")
                        if ctx and isinstance(ctx, (int, float)):
                            return int(ctx)
    except Exception:
        # Best-effort probe: treat any failure as "unknown".
        pass

    return None
|
|
|
|
|
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
def _normalize_model_version(model: str) -> str:
|
|
|
|
|
"""Normalize version separators for matching.
|
|
|
|
|
|
|
|
|
|
Nous uses dashes: claude-opus-4-6, claude-sonnet-4-5
|
|
|
|
|
OpenRouter uses dots: claude-opus-4.6, claude-sonnet-4.5
|
|
|
|
|
Normalize both to dashes for comparison.
|
|
|
|
|
"""
|
|
|
|
|
return model.replace(".", "-")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _query_anthropic_context_length(model: str, base_url: str, api_key: str) -> Optional[int]:
|
|
|
|
|
"""Query Anthropic's /v1/models endpoint for context length.
|
|
|
|
|
|
|
|
|
|
Only works with regular ANTHROPIC_API_KEY (sk-ant-api*).
|
|
|
|
|
OAuth tokens (sk-ant-oat*) from Claude Code return 401.
|
|
|
|
|
"""
|
|
|
|
|
if not api_key or api_key.startswith("sk-ant-oat"):
|
|
|
|
|
return None # OAuth tokens can't access /v1/models
|
|
|
|
|
try:
|
|
|
|
|
base = base_url.rstrip("/")
|
|
|
|
|
if base.endswith("/v1"):
|
|
|
|
|
base = base[:-3]
|
|
|
|
|
url = f"{base}/v1/models?limit=1000"
|
|
|
|
|
headers = {
|
|
|
|
|
"x-api-key": api_key,
|
|
|
|
|
"anthropic-version": "2023-06-01",
|
|
|
|
|
}
|
|
|
|
|
resp = requests.get(url, headers=headers, timeout=10)
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
return None
|
|
|
|
|
data = resp.json()
|
|
|
|
|
for m in data.get("data", []):
|
|
|
|
|
if m.get("id") == model:
|
|
|
|
|
ctx = m.get("max_input_tokens")
|
|
|
|
|
if isinstance(ctx, int) and ctx > 0:
|
|
|
|
|
return ctx
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.debug("Anthropic /v1/models query failed: %s", e)
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_nous_context_length(model: str) -> Optional[int]:
    """Resolve a Nous Portal model's context length from OpenRouter metadata.

    Nous model IDs are bare (e.g. 'claude-opus-4-6') while OpenRouter
    prefixes a provider (e.g. 'anthropic/claude-opus-4.6'). Matching is
    attempted in three passes of increasing looseness: exact ID, bare-ID
    equality with dot/dash normalization, then word-boundary prefix match.
    """
    metadata = fetch_model_metadata()  # OpenRouter cache

    # Pass 1: exact match against the full OpenRouter ID.
    if model in metadata:
        return metadata[model].get("context_length")

    wanted = model.lower()
    wanted_norm = _normalize_model_version(model).lower()

    def _bare(openrouter_id: str) -> str:
        # Strip the "provider/" prefix when one is present.
        return openrouter_id.split("/", 1)[1] if "/" in openrouter_id else openrouter_id

    # Pass 2: bare-ID equality, tolerant of dot vs dash version separators.
    for openrouter_id, entry in metadata.items():
        stripped = _bare(openrouter_id).lower()
        if stripped == wanted or _normalize_model_version(stripped) == wanted_norm:
            return entry.get("context_length")

    # Pass 3: prefix match for cases like gemini-3-flash → gemini-3-flash-preview.
    # The prefix must end at a word boundary (followed by -, :, . or end of string).
    for openrouter_id, entry in metadata.items():
        stripped = _bare(openrouter_id).lower()
        for candidate, query in ((stripped, wanted), (_normalize_model_version(stripped), wanted_norm)):
            if not candidate.startswith(query):
                continue
            if len(candidate) == len(query) or candidate[len(query)] in "-:.":
                return entry.get("context_length")

    return None
|
|
|
|
|
|
|
|
|
|
|
2026-03-19 06:01:16 -07:00
|
|
|
def get_model_context_length(
|
|
|
|
|
model: str,
|
|
|
|
|
base_url: str = "",
|
|
|
|
|
api_key: str = "",
|
|
|
|
|
config_context_length: int | None = None,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
provider: str = "",
|
2026-03-19 06:01:16 -07:00
|
|
|
) -> int:
|
2026-03-05 16:09:57 -08:00
|
|
|
"""Get the context length for a model.
|
|
|
|
|
|
|
|
|
|
Resolution order:
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
0. Explicit config override (model.context_length or custom_providers per-model)
|
2026-03-05 16:09:57 -08:00
|
|
|
1. Persistent cache (previously discovered via probing)
|
2026-03-18 03:04:07 -07:00
|
|
|
2. Active endpoint metadata (/models for explicit custom endpoints)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
3. Local server query (for local endpoints)
|
|
|
|
|
4. Anthropic /v1/models API (API-key users only, not OAuth)
|
|
|
|
|
5. OpenRouter live API metadata
|
|
|
|
|
6. Nous suffix-match via OpenRouter cache
|
|
|
|
|
7. models.dev registry lookup (provider-aware)
|
|
|
|
|
8. Thin hardcoded defaults (broad family patterns)
|
|
|
|
|
9. Default fallback (128K)
|
2026-03-05 16:09:57 -08:00
|
|
|
"""
|
2026-03-19 06:01:16 -07:00
|
|
|
# 0. Explicit config override — user knows best
|
|
|
|
|
if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
|
|
|
|
|
return config_context_length
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
# Normalise provider-prefixed model names (e.g. "local:model-name" →
|
|
|
|
|
# "model-name") so cache lookups and server queries use the bare ID that
|
2026-03-20 03:19:31 -07:00
|
|
|
# local servers actually know about. Ollama "model:tag" colons are preserved.
|
|
|
|
|
model = _strip_provider_prefix(model)
|
2026-03-18 22:00:53 +01:00
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
# 1. Check persistent cache (model+provider)
|
|
|
|
|
if base_url:
|
|
|
|
|
cached = get_cached_context_length(model, base_url)
|
|
|
|
|
if cached is not None:
|
|
|
|
|
return cached
|
|
|
|
|
|
2026-03-22 08:15:06 -07:00
|
|
|
# 2. Active endpoint metadata for truly custom/unknown endpoints.
|
|
|
|
|
# Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
|
|
|
|
|
# /models endpoint may report a provider-imposed limit (e.g. Copilot
|
|
|
|
|
# returns 128k) instead of the model's full context (400k). models.dev
|
|
|
|
|
# has the correct per-provider values and is checked at step 5+.
|
|
|
|
|
if _is_custom_endpoint(base_url) and not _is_known_provider_base_url(base_url):
|
2026-03-18 03:04:07 -07:00
|
|
|
endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
|
2026-03-19 06:01:16 -07:00
|
|
|
matched = endpoint_metadata.get(model)
|
|
|
|
|
if not matched:
|
|
|
|
|
# Single-model servers: if only one model is loaded, use it
|
|
|
|
|
if len(endpoint_metadata) == 1:
|
|
|
|
|
matched = next(iter(endpoint_metadata.values()))
|
|
|
|
|
else:
|
|
|
|
|
# Fuzzy match: substring in either direction
|
|
|
|
|
for key, entry in endpoint_metadata.items():
|
|
|
|
|
if model in key or key in model:
|
|
|
|
|
matched = entry
|
|
|
|
|
break
|
|
|
|
|
if matched:
|
|
|
|
|
context_length = matched.get("context_length")
|
2026-03-18 03:04:07 -07:00
|
|
|
if isinstance(context_length, int):
|
|
|
|
|
return context_length
|
|
|
|
|
if not _is_known_provider_base_url(base_url):
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# 3. Try querying local server directly
|
2026-03-18 21:38:41 +01:00
|
|
|
if is_local_endpoint(base_url):
|
|
|
|
|
local_ctx = _query_local_context_length(model, base_url)
|
|
|
|
|
if local_ctx and local_ctx > 0:
|
|
|
|
|
save_context_length(model, base_url, local_ctx)
|
|
|
|
|
return local_ctx
|
2026-03-19 06:01:16 -07:00
|
|
|
logger.info(
|
|
|
|
|
"Could not detect context length for model %r at %s — "
|
|
|
|
|
"defaulting to %s tokens (probe-down). Set model.context_length "
|
|
|
|
|
"in config.yaml to override.",
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}",
|
2026-03-19 06:01:16 -07:00
|
|
|
)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
return DEFAULT_FALLBACK_CONTEXT
|
2026-03-18 03:04:07 -07:00
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# 4. Anthropic /v1/models API (only for regular API keys, not OAuth)
|
|
|
|
|
if provider == "anthropic" or (
|
|
|
|
|
base_url and "api.anthropic.com" in base_url
|
|
|
|
|
):
|
|
|
|
|
ctx = _query_anthropic_context_length(model, base_url or "https://api.anthropic.com", api_key)
|
|
|
|
|
if ctx:
|
|
|
|
|
return ctx
|
|
|
|
|
|
|
|
|
|
# 5. Provider-aware lookups (before generic OpenRouter cache)
|
|
|
|
|
# These are provider-specific and take priority over the generic OR cache,
|
|
|
|
|
# since the same model can have different context limits per provider
|
|
|
|
|
# (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
# If provider is generic (openrouter/custom/empty), try to infer from URL.
|
|
|
|
|
effective_provider = provider
|
|
|
|
|
if not effective_provider or effective_provider in ("openrouter", "custom"):
|
|
|
|
|
if base_url:
|
|
|
|
|
inferred = _infer_provider_from_url(base_url)
|
|
|
|
|
if inferred:
|
|
|
|
|
effective_provider = inferred
|
|
|
|
|
|
|
|
|
|
if effective_provider == "nous":
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
ctx = _resolve_nous_context_length(model)
|
|
|
|
|
if ctx:
|
|
|
|
|
return ctx
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
if effective_provider:
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
from agent.models_dev import lookup_models_dev_context
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
ctx = lookup_models_dev_context(effective_provider, model)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
if ctx:
|
|
|
|
|
return ctx
|
|
|
|
|
|
|
|
|
|
# 6. OpenRouter live API metadata (provider-unaware fallback)
|
2026-02-21 22:31:43 -08:00
|
|
|
metadata = fetch_model_metadata()
|
|
|
|
|
if model in metadata:
|
|
|
|
|
return metadata[model].get("context_length", 128000)
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
|
2026-03-20 08:52:37 -07:00
|
|
|
# Only check `default_model in model` (is the key a substring of the input).
|
|
|
|
|
# The reverse (`model in default_model`) causes shorter names like
|
|
|
|
|
# "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
|
2026-03-21 10:47:44 -07:00
|
|
|
model_lower = model.lower()
|
2026-03-17 04:12:08 -07:00
|
|
|
for default_model, length in sorted(
|
|
|
|
|
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
|
|
|
|
|
):
|
2026-03-21 10:47:44 -07:00
|
|
|
if default_model in model_lower:
|
2026-02-21 22:31:43 -08:00
|
|
|
return length
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# 9. Query local server as last resort
|
2026-03-18 21:38:41 +01:00
|
|
|
if base_url and is_local_endpoint(base_url):
|
|
|
|
|
local_ctx = _query_local_context_length(model, base_url)
|
|
|
|
|
if local_ctx and local_ctx > 0:
|
|
|
|
|
save_context_length(model, base_url, local_ctx)
|
|
|
|
|
return local_ctx
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
# 10. Default fallback — 128K
|
|
|
|
|
return DEFAULT_FALLBACK_CONTEXT
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def estimate_tokens_rough(text: str) -> int:
    """Rough token estimate (~4 chars/token) for pre-flight checks.

    Ceiling division guarantees that non-empty text never rounds down
    to 0 tokens — without it, conversations containing many short tool
    results would be systematically undercounted by the compressor and
    the pre-flight context checks.
    """
    if not text:
        return 0
    # -(-n // 4) is ceiling division: any 1-4 chars count as 1 token.
    return -(-len(text) // 4)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Rough token estimate for a message list (pre-flight only).

    Each message is measured via its ``str()`` form (~4 chars/token,
    ceiling division so the total never rounds down to zero for
    non-empty input).
    """
    char_total = 0
    for entry in messages:
        char_total += len(str(entry))
    return (char_total + 3) // 4
|
2026-03-26 02:00:50 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def estimate_request_tokens_rough(
    messages: List[Dict[str, Any]],
    *,
    system_prompt: str = "",
    tools: Optional[List[Dict[str, Any]]] = None,
) -> int:
    """Rough token estimate for a full chat-completions request.

    Counts every major payload bucket Hermes sends to providers:
    the system prompt, the conversation messages, and the tool
    schemas. Tool schemas matter — with 50+ tools enabled they can
    add 20-30K tokens on their own, a significant blind spot when
    only the messages are counted.
    """
    char_buckets: List[int] = []
    if system_prompt:
        char_buckets.append(len(system_prompt))
    if messages:
        char_buckets.append(sum(len(str(entry)) for entry in messages))
    if tools:
        char_buckets.append(len(str(tools)))
    # ~4 chars/token, rounded up so tiny payloads never estimate as 0.
    return (sum(char_buckets) + 3) // 4
|