"""Model metadata, context lengths, and token estimation utilities.

Pure utility functions with no AIAgent dependency. Used by ContextCompressor
and run_agent.py for pre-flight context checks.
"""

import ipaddress
import logging
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests
import yaml

from utils import base_url_host_matches, base_url_hostname

from hermes_constants import OPENROUTER_MODELS_URL

logger = logging.getLogger(__name__)


# Provider names that can appear as a "provider:" prefix before a model ID.
# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
# are preserved so the full model name reaches cache lookups and server queries.
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
    "gemini", "ollama-cloud", "zai", "kimi-coding", "kimi-coding-cn",
    "stepfun", "minimax", "minimax-cn", "anthropic", "deepseek",
    "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
    "qwen-oauth",
    "xiaomi",
    "arcee",
    "custom", "local",
    # Common aliases
    "google", "google-gemini", "google-ai-studio",
    "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
    "github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn",
    "claude", "deep-seek",
    "ollama",
    "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
    "mimo", "xiaomi-mimo",
    "arcee-ai", "arceeai",
    "xai", "x-ai", "x.ai", "grok",
    "nvidia", "nim", "nvidia-nim", "nemotron",
    "qwen-portal",
})


_OLLAMA_TAG_PATTERN = re.compile(
    r"^(\d+\.?\d*b|latest|stable|q\d|fp?\d|instruct|chat|coder|vision|text)",
    re.IGNORECASE,
)
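
# Illustrative (hedged) behaviour of the pattern above: suffixes such as "7b",
# "0.5b", "latest", "q4", or "instruct" match and are treated as Ollama tags,
# so "qwen:0.5b" survives intact; a suffix like "my-model" does not match,
# so "local:my-model" is stripped to "my-model" by _strip_provider_prefix below.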


# Tailscale's CGNAT range (RFC 6598). `ipaddress.is_private` excludes this
# block, so without an explicit check Ollama reached over Tailscale (e.g.
# `http://100.77.243.5:11434`) wouldn't be treated as local and its stream
# read / stale timeouts wouldn't get auto-bumped. Built once at import time.
_TAILSCALE_CGNAT = ipaddress.IPv4Network("100.64.0.0/10")
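
# For example: ipaddress.IPv4Address("100.77.243.5") in _TAILSCALE_CGNAT -> True,
# while ipaddress.ip_address("100.77.243.5").is_private -> False (RFC 6598
# shared address space is neither private nor global in `ipaddress`).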


def _strip_provider_prefix(model: str) -> str:
    """Strip a recognised provider prefix from a model string.

    ``"local:my-model"``  → ``"my-model"``
    ``"qwen3.5:27b"``     → ``"qwen3.5:27b"``     (unchanged — not a provider prefix)
    ``"qwen:0.5b"``       → ``"qwen:0.5b"``       (unchanged — Ollama model:tag)
    ``"deepseek:latest"`` → ``"deepseek:latest"`` (unchanged — Ollama model:tag)
    """
    if ":" not in model or model.startswith("http"):
        return model
    prefix, suffix = model.split(":", 1)
    prefix_lower = prefix.strip().lower()
    if prefix_lower in _PROVIDER_PREFIXES:
        # Don't strip if suffix looks like an Ollama tag (e.g. "7b", "latest", "q4_0")
        if _OLLAMA_TAG_PATTERN.match(suffix.strip()):
            return model
        return suffix
    return model
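
# Hedged usage examples (mirroring the docstring; comments only, not executed):
#   _strip_provider_prefix("openrouter:qwen/qwen3.5-coder") -> "qwen/qwen3.5-coder"
#   _strip_provider_prefix("qwen:0.5b")                     -> "qwen:0.5b" (tag kept)
#   _strip_provider_prefix("http://localhost:11434")        -> unchanged (URL guard)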


_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
_model_metadata_cache_time: float = 0
_MODEL_CACHE_TTL = 3600  # seconds (1 hour)

_endpoint_model_metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = {}
_endpoint_model_metadata_cache_time: Dict[str, float] = {}
_ENDPOINT_MODEL_CACHE_TTL = 300  # seconds (5 minutes)
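

# Hedged sketch of the TTL gate these globals support. The helper name is
# illustrative only, not part of this module's API; cache reads elsewhere
# are assumed to perform an equivalent freshness check.
def _cache_is_fresh_example(cached_at: float, ttl: float) -> bool:
    """Return True while a cache written at ``cached_at`` is still usable."""
    return bool(cached_at) and (time.time() - cached_at) < ttl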


# Descending tiers for context length probing when the model is unknown.
# We start at 128K (a safe default for most modern models) and step down
# on context-length errors until one works.
CONTEXT_PROBE_TIERS = [
    128_000,
    64_000,
    32_000,
    16_000,
    8_000,
]
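

# Hedged sketch of how these tiers are typically consumed. The real retry
# loop lives with the API-calling code; `try_with_context` is a hypothetical
# callable that raises on a context-length error.
def _probe_context_example(try_with_context) -> int:
    for tier in CONTEXT_PROBE_TIERS:
        try:
            try_with_context(tier)
            return tier
        except Exception:
            continue  # step down to the next smaller tier
    return CONTEXT_PROBE_TIERS[-1]  # smallest tier as the last resort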


# Default context length when no detection method succeeds.
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]


# Minimum context length required to run Hermes Agent. Models with fewer
# tokens cannot maintain enough working memory for tool-calling workflows.
# Sessions, model switches, and cron jobs should reject models below this.
MINIMUM_CONTEXT_LENGTH = 64_000
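

# Hedged sketch of the startup gate described above. The function name is
# illustrative; actual enforcement lives in session/model-switch code.
def _require_minimum_context_example(context_length: int) -> None:
    if context_length < MINIMUM_CONTEXT_LENGTH:
        raise ValueError(
            f"context window of {context_length:,} tokens is below the "
            f"{MINIMUM_CONTEXT_LENGTH:,}-token minimum required by Hermes Agent"
        )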
|
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# Thin fallback defaults — only broad model family patterns.
|
|
|
|
|
|
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
|
|
|
|
|
# all miss. Replaced the previous 80+ entry dict.
|
|
|
|
|
|
# For provider-specific context lengths, models.dev is the primary source.
|
2026-02-21 22:31:43 -08:00
|
|
|
|
DEFAULT_CONTEXT_LENGTHS = {
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# Anthropic Claude 4.6 (1M context) — bare IDs only to avoid
|
|
|
|
|
|
# fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
|
|
|
|
|
|
# substring of "anthropic/claude-sonnet-4.6").
|
|
|
|
|
|
# OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
|
fix(agent): complete Claude Opus 4.7 API migration
Claude Opus 4.7 introduced several breaking API changes that the current
codebase partially handled but not completely. This patch finishes the
migration per the official migration guide at
https://platform.claude.com/docs/en/about-claude/models/migration-guide
Fixes NousResearch/hermes-agent#11137
Breaking-change coverage:
1. Adaptive thinking + output_config.effort — 4.7 is now recognized by
_supports_adaptive_thinking() (extends previous 4.6-only gate).
2. Sampling parameter stripping — 4.7 returns 400 for any non-default
temperature / top_p / top_k. build_anthropic_kwargs drops them as a
safety net; the OpenAI-protocol auxiliary path (_build_call_kwargs)
and AnthropicCompletionsAdapter.create() both early-exit before
setting temperature for 4.7+ models. This keeps flush_memories and
structured-JSON aux paths that hardcode temperature from 400ing
when the aux model is flipped to 4.7.
3. thinking.display = "summarized" — 4.7 defaults display to "omitted",
which silently hides reasoning text from Hermes's CLI activity feed
during long tool runs. Restoring "summarized" preserves 4.6 UX.
4. Effort level mapping — xhigh now maps to xhigh (was xhigh→max, which
silently over-efforted every coding/agentic request). max is now a
distinct ceiling per Anthropic's 5-level effort model.
5. New stop_reason values — refusal and model_context_window_exceeded
were silently collapsed to "stop" (end_turn) by the adapter's
stop_reason_map. Now mapped to "content_filter" and "length"
respectively, matching upstream finish-reason handling already in
bedrock_adapter.
6. Model catalogs — claude-opus-4-7 added to the Anthropic provider
list, anthropic/claude-opus-4.7 added at top of OpenRouter fallback
catalog (recommended), claude-opus-4-7 added to model_metadata
DEFAULT_CONTEXT_LENGTHS (1M, matching 4.6 per migration guide).
7. Prefill docstrings — run_agent.AIAgent and BatchRunner now document
that Anthropic Sonnet/Opus 4.6+ reject a trailing assistant-role
prefill (400).
8. Tests — 4 new tests in test_anthropic_adapter covering display
default, xhigh preservation, max on 4.7, refusal / context-overflow
stop_reason mapping, plus the sampling-param predicate. test_model_metadata
accepts 4.7 at 1M context.
Tested on macOS 15.5 (darwin). 119 tests pass in
tests/agent/test_anthropic_adapter.py, 1320 pass in tests/agent/.
2026-04-16 12:35:43 -05:00
|
|
|
|
"claude-opus-4-7": 1000000,
|
|
|
|
|
|
"claude-opus-4.7": 1000000,
|
2026-03-20 04:38:59 -07:00
|
|
|
|
"claude-opus-4-6": 1000000,
|
|
|
|
|
|
"claude-sonnet-4-6": 1000000,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
"claude-opus-4.6": 1000000,
|
|
|
|
|
|
"claude-sonnet-4.6": 1000000,
|
|
|
|
|
|
# Catch-all for older Claude models (must sort after specific entries)
|
|
|
|
|
|
"claude": 200000,
|
fix: correct GPT-5 family context lengths in fallback defaults (#9309)
The generic 'gpt-5' fallback was set to 128,000 — which is the max
OUTPUT tokens, not the context window. GPT-5 base and most variants
(codex, mini) have 400,000 context. This caused /model to report
128k for models like gpt-5.3-codex when models.dev was unavailable.
Added specific entries for GPT-5 variants with different context sizes:
- gpt-5.4, gpt-5.4-pro: 1,050,000 (1.05M)
- gpt-5.4-mini, gpt-5.4-nano: 400,000
- gpt-5.3-codex-spark: 128,000 (reduced)
- gpt-5.1-chat: 128,000 (chat variant)
- gpt-5 (catch-all): 400,000
Sources: https://developers.openai.com/api/docs/models
2026-04-13 19:22:23 -07:00
|
|
|
|
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
|
|
|
|
|
|
# Source: https://developers.openai.com/api/docs/models
|
2026-04-23 13:32:43 -07:00
|
|
|
|
# GPT-5.5 (launched Apr 23 2026). Verified via live ChatGPT codex/models
|
|
|
|
|
|
# endpoint: bare slug `gpt-5.5`, no -pro/-mini variants. 400k context on Codex.
|
|
|
|
|
|
"gpt-5.5": 400000,
|
fix: correct GPT-5 family context lengths in fallback defaults (#9309)
The generic 'gpt-5' fallback was set to 128,000 — which is the max
OUTPUT tokens, not the context window. GPT-5 base and most variants
(codex, mini) have 400,000 context. This caused /model to report
128k for models like gpt-5.3-codex when models.dev was unavailable.
Added specific entries for GPT-5 variants with different context sizes:
- gpt-5.4, gpt-5.4-pro: 1,050,000 (1.05M)
- gpt-5.4-mini, gpt-5.4-nano: 400,000
- gpt-5.3-codex-spark: 128,000 (reduced)
- gpt-5.1-chat: 128,000 (chat variant)
- gpt-5 (catch-all): 400,000
Sources: https://developers.openai.com/api/docs/models
2026-04-13 19:22:23 -07:00
|
|
|
|
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
|
|
|
|
|
|
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
|
|
|
|
|
|
"gpt-5.4": 1050000, # GPT-5.4, GPT-5.4 Pro (1.05M context)
|
|
|
|
|
|
"gpt-5.1-chat": 128000, # Chat variant has 128k context
|
|
|
|
|
|
"gpt-5": 400000, # GPT-5.x base, mini, codex variants (400k)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
"gpt-4.1": 1047576,
|
|
|
|
|
|
"gpt-4": 128000,
|
|
|
|
|
|
# Google
|
|
|
|
|
|
"gemini": 1048576,
|
2026-04-06 10:14:01 -07:00
|
|
|
|
# Gemma (open models served via AI Studio)
|
2026-04-22 22:56:54 +01:00
|
|
|
|
"gemma-4": 256000, # Gemma 4 family
|
|
|
|
|
|
"gemma4": 256000, # Ollama-style naming (e.g. gemma4:31b-cloud)
|
2026-04-06 10:19:19 -07:00
|
|
|
|
"gemma-4-31b": 256000,
|
2026-04-06 10:14:01 -07:00
|
|
|
|
"gemma-3": 131072,
|
|
|
|
|
|
"gemma": 8192, # fallback for older gemma models
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# DeepSeek
|
|
|
|
|
|
"deepseek": 128000,
|
|
|
|
|
|
# Meta
|
|
|
|
|
|
"llama": 131072,
|
2026-04-11 03:29:09 -07:00
|
|
|
|
# Qwen — specific model families before the catch-all.
|
|
|
|
|
|
# Official docs: https://help.aliyun.com/zh/model-studio/developer-reference/
|
|
|
|
|
|
"qwen3-coder-plus": 1000000, # 1M context
|
|
|
|
|
|
"qwen3-coder": 262144, # 256K context
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
"qwen": 131072,
|
fix: align MiniMax provider with official API docs
Aligns MiniMax provider with official API documentation. Fixes 6 bugs:
transport mismatch (openai_chat -> anthropic_messages), credential leak
in switch_model(), prompt caching sent to non-Anthropic endpoints,
dot-to-hyphen model name corruption, trajectory compressor URL routing,
and stale doctor health check.
Also corrects context window (204,800), thinking support (manual mode),
max output (131,072), and model catalog (M2 family only on /anthropic).
Source: https://platform.minimax.io/docs/api-reference/text-anthropic-api
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-04-10 03:53:18 -07:00
|
|
|
|
# MiniMax — official docs: 204,800 context for all models
|
|
|
|
|
|
# https://platform.minimax.io/docs/api-reference/text-anthropic-api
|
|
|
|
|
|
"minimax": 204800,
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# GLM
|
|
|
|
|
|
"glm": 202752,
|
fix(model_metadata): add xAI Grok context length fallbacks
xAI /v1/models does not return context_length metadata, so Hermes
probes down to the 128k default whenever a user configures a custom
provider pointing at https://api.x.ai/v1. This forces every xAI user
to manually override model.context_length in config.yaml (2M for
Grok 4.20 / 4.1-fast / 4-fast) or lose most of the usable context
window.
Add DEFAULT_CONTEXT_LENGTHS entries for the Grok family so the
fallback lookup returns the correct value via substring matching.
Values sourced from models.dev (2026-04) and cross-checked against
the xAI /v1/models listing:
- grok-4.20-* 2,000,000 (reasoning, non-reasoning, multi-agent)
- grok-4-1-fast-* 2,000,000
- grok-4-fast-* 2,000,000
- grok-4 / grok-4-0709 256,000
- grok-code-fast-1 256,000
- grok-3* 131,072
- grok-2 / latest 131,072
- grok-2-vision* 8,192
- grok (catch-all) 131,072
Keys are ordered longest-first so that specific variants match before
the catch-all, consistent with the existing Claude/Gemma/MiniMax entries.
Add TestDefaultContextLengths.test_grok_models_context_lengths and
test_grok_substring_matching to pin the values and verify the full
lookup path. All 77 tests in test_model_metadata.py pass.
2026-04-10 12:08:16 +04:00
|
|
|
|
# xAI Grok — xAI /v1/models does not return context_length metadata,
|
|
|
|
|
|
# so these hardcoded fallbacks prevent Hermes from probing-down to
|
|
|
|
|
|
# the default 128k when the user points at https://api.x.ai/v1
|
|
|
|
|
|
# via a custom provider. Values sourced from models.dev (2026-04).
|
|
|
|
|
|
# Keys use substring matching (longest-first), so e.g. "grok-4.20"
|
|
|
|
|
|
# matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
|
|
|
|
|
|
"grok-code-fast": 256000, # grok-code-fast-1
|
|
|
|
|
|
"grok-4-1-fast": 2000000, # grok-4-1-fast-(non-)reasoning
|
|
|
|
|
|
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
|
|
|
|
|
|
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning
|
|
|
|
|
|
"grok-4.20": 2000000, # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
|
|
|
|
|
|
"grok-4": 256000, # grok-4, grok-4-0709
|
|
|
|
|
|
"grok-3": 131072, # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
|
|
|
|
|
|
"grok-2": 131072, # grok-2, grok-2-1212, grok-2-latest
|
|
|
|
|
|
"grok": 131072, # catch-all (grok-beta, unknown grok-*)
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
# Kimi
|
|
|
|
|
|
"kimi": 262144,
|
2026-04-17 13:09:14 -07:00
|
|
|
|
# Nemotron — NVIDIA's open-weights series (128K context across all sizes)
|
|
|
|
|
|
"nemotron": 131072,
|
2026-04-03 13:45:16 -07:00
|
|
|
|
# Arcee
|
|
|
|
|
|
"trinity": 262144,
|
2026-04-13 21:16:14 -07:00
|
|
|
|
# OpenRouter
|
|
|
|
|
|
"elephant": 262144,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
|
# Hugging Face Inference Providers — model IDs use org/name format
|
|
|
|
|
|
"Qwen/Qwen3.5-397B-A17B": 131072,
|
2026-03-27 13:54:46 -07:00
|
|
|
|
"Qwen/Qwen3.5-35B-A3B": 131072,
|
feat: add Hugging Face as a first-class inference provider (#3419)
Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main.
Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider:
- hermes chat --provider huggingface (or --provider hf)
- 18 curated open models via hermes model picker
- HF_TOKEN in ~/.hermes/.env
- OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.)
Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests.
Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
|
|
|
|
"deepseek-ai/DeepSeek-V3.2": 65536,
|
|
|
|
|
|
"moonshotai/Kimi-K2.5": 262144,
|
2026-04-20 23:20:33 -07:00
|
|
|
|
"moonshotai/Kimi-K2.6": 262144,
|
2026-03-27 12:41:59 -07:00
|
|
|
|
"moonshotai/Kimi-K2-Thinking": 262144,
|
fix: align MiniMax provider with official API docs
Aligns MiniMax provider with official API documentation. Fixes 6 bugs:
transport mismatch (openai_chat -> anthropic_messages), credential leak
in switch_model(), prompt caching sent to non-Anthropic endpoints,
dot-to-hyphen model name corruption, trajectory compressor URL routing,
and stale doctor health check.
Also corrects context window (204,800), thinking support (manual mode),
max output (131,072), and model catalog (M2 family only on /anthropic).
Source: https://platform.minimax.io/docs/api-reference/text-anthropic-api
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-04-10 03:53:18 -07:00
|
|
|
|
"MiniMaxAI/MiniMax-M2.5": 204800,
|
feat: add Xiaomi MiMo v2.5-pro and v2.5 model support (#14635)
## Merged
Adds MiMo v2.5-pro and v2.5 support to Xiaomi native provider, OpenCode Go, and setup wizard.
### Changes
- Context lengths: added v2.5-pro (1M) and v2.5 (1M), corrected existing MiMo entries to exact values (262144)
- Provider lists: xiaomi, opencode-go, setup wizard
- Vision: upgraded from mimo-v2-omni to mimo-v2.5 (omnimodal)
- Config description updated for XIAOMI_API_KEY
- Tests updated for new vision model preference
### Verification
- 4322 tests passed, 0 new regressions
- Live API tested on Xiaomi portal: basic, reasoning, tool calling, multi-tool, file ops, system prompt, vision — all pass
- Self-review found and fixed 2 issues (redundant vision check, stale HuggingFace context length)
2026-04-23 10:06:25 -07:00
|
|
|
|
"XiaomiMiMo/MiMo-V2-Flash": 262144,
|
|
|
|
|
|
"mimo-v2-pro": 1048576,
|
|
|
|
|
|
"mimo-v2.5-pro": 1048576,
|
|
|
|
|
|
"mimo-v2.5": 1048576,
|
|
|
|
|
|
"mimo-v2-omni": 262144,
|
|
|
|
|
|
"mimo-v2-flash": 262144,
|
2026-03-27 12:41:59 -07:00
|
|
|
|
"zai-org/GLM-5": 202752,
|
2026-02-21 22:31:43 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_CONTEXT_LENGTH_KEYS = (
|
|
|
|
|
|
"context_length",
|
|
|
|
|
|
"context_window",
|
|
|
|
|
|
"max_context_length",
|
|
|
|
|
|
"max_position_embeddings",
|
|
|
|
|
|
"max_model_len",
|
|
|
|
|
|
"max_input_tokens",
|
|
|
|
|
|
"max_sequence_length",
|
|
|
|
|
|
"max_seq_len",
|
2026-03-19 06:01:16 -07:00
|
|
|
|
"n_ctx_train",
|
|
|
|
|
|
"n_ctx",
|
2026-04-12 14:13:02 -04:00
|
|
|
|
"ctx_size",
|
2026-03-18 03:04:07 -07:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
_MAX_COMPLETION_KEYS = (
|
|
|
|
|
|
"max_completion_tokens",
|
|
|
|
|
|
"max_output_tokens",
|
|
|
|
|
|
"max_tokens",
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# Local server hostnames / address patterns
|
|
|
|
|
|
_LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")
|
2026-04-11 14:46:18 -07:00
|
|
|
|
# Docker / Podman / Lima DNS names that resolve to the host machine
|
|
|
|
|
|
_CONTAINER_LOCAL_SUFFIXES = (
|
|
|
|
|
|
".docker.internal",
|
|
|
|
|
|
".containers.internal",
|
|
|
|
|
|
".lima.internal",
|
|
|
|
|
|
)
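# Tailscale CGNAT range (100.64.0.0/10), referenced by is_local_endpoint() below.
# Assumed definition: the original module may declare this elsewhere; redefining
# it with the same value would be harmless.
_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")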
|
2026-03-18 21:38:41 +01:00
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
|
def _normalize_base_url(base_url: str) -> str:
|
|
|
|
|
|
return (base_url or "").strip().rstrip("/")
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def _auth_headers(api_key: str = "") -> Dict[str, str]:
|
|
|
|
|
|
token = str(api_key or "").strip()
|
|
|
|
|
|
if not token:
|
|
|
|
|
|
return {}
|
|
|
|
|
|
return {"Authorization": f"Bearer {token}"}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
def _is_openrouter_base_url(base_url: str) -> bool:
|
2026-04-20 21:17:28 -07:00
|
|
|
|
return base_url_host_matches(base_url, "openrouter.ai")
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_custom_endpoint(base_url: str) -> bool:
|
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
return bool(normalized) and not _is_openrouter_base_url(normalized)
|
|
|
|
|
|
|
|
|
|
|
|
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
|
_URL_TO_PROVIDER: Dict[str, str] = {
|
|
|
|
|
|
"api.openai.com": "openai",
|
|
|
|
|
|
"chatgpt.com": "openai",
|
|
|
|
|
|
"api.anthropic.com": "anthropic",
|
|
|
|
|
|
"api.z.ai": "zai",
|
2026-04-11 23:20:53 +08:00
|
|
|
|
"open.bigmodel.cn": "zai",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"api.moonshot.ai": "kimi-coding",
|
2026-04-13 11:13:09 -07:00
|
|
|
|
"api.moonshot.cn": "kimi-coding-cn",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"api.kimi.com": "kimi-coding",
|
2026-04-22 13:28:01 +05:30
|
|
|
|
"api.stepfun.ai": "stepfun",
|
|
|
|
|
|
"api.stepfun.com": "stepfun",
|
feat(providers): add Arcee AI as direct API provider
Adds Arcee AI as a standard direct provider (ARCEEAI_API_KEY) with
Trinity models: trinity-large-thinking, trinity-large-preview, trinity-mini.
Standard OpenAI-compatible provider checklist: auth.py, config.py,
models.py, main.py, providers.py, doctor.py, model_normalize.py,
model_metadata.py, setup.py, trajectory_compressor.py.
Based on PR #9274 by arthurbr11, simplified to a standard direct
provider without dual-endpoint OpenRouter routing.
2026-04-13 17:16:43 -07:00
|
|
|
|
"api.arcee.ai": "arcee",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"api.minimax": "minimax",
|
|
|
|
|
|
"dashscope.aliyuncs.com": "alibaba",
|
2026-03-20 12:51:39 -07:00
|
|
|
|
"dashscope-intl.aliyuncs.com": "alibaba",
|
feat(qwen): add Qwen OAuth provider with portal request support
Based on #6079 by @tunamitom with critical fixes and comprehensive tests.
Changes from #6079:
- Fix: sanitization overwrite bug — Qwen message prep now runs AFTER codex
field sanitization, not before (was silently discarding Qwen transforms)
- Fix: missing try/except AuthError in runtime_provider.py — stale Qwen
credentials now fall through to next provider on auto-detect
- Fix: 'qwen' alias conflict — bare 'qwen' stays mapped to 'alibaba'
(DashScope); use 'qwen-portal' or 'qwen-cli' for the OAuth provider
- Fix: hardcoded ['coder-model'] replaced with live API fetch + curated
fallback list (qwen3-coder-plus, qwen3-coder)
- Fix: extract _is_qwen_portal() helper + _qwen_portal_headers() to replace
5 inline 'portal.qwen.ai' string checks and share headers between init
and credential swap
- Fix: add Qwen branch to _apply_client_headers_for_base_url for mid-session
credential swaps
- Fix: remove suspicious TypeError catch blocks around _prompt_provider_choice
- Fix: handle bare string items in content lists (were silently dropped)
- Fix: remove redundant dict() copies after deepcopy in message prep
- Revert: unrelated ai-gateway test mock removal and model_switch.py comment deletion
New tests (30 test functions):
- _qwen_cli_auth_path, _read_qwen_cli_tokens (success + 3 error paths)
- _save_qwen_cli_tokens (roundtrip, parent creation, permissions)
- _qwen_access_token_is_expiring (5 edge cases: fresh, expired, within skew,
None, non-numeric)
- _refresh_qwen_cli_tokens (success, preserve old refresh, 4 error paths,
default expires_in, disk persistence)
- resolve_qwen_runtime_credentials (fresh, auto-refresh, force-refresh,
missing token, env override)
- get_qwen_auth_status (logged in, not logged in)
- Runtime provider resolution (direct, pool entry, alias)
- _build_api_kwargs (metadata, vl_high_resolution_images, message formatting,
max_tokens suppression)
2026-04-08 20:48:21 +05:30
|
|
|
|
"portal.qwen.ai": "qwen-oauth",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"openrouter.ai": "openrouter",
|
2026-04-06 10:14:01 -07:00
|
|
|
|
"generativelanguage.googleapis.com": "gemini",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
"inference-api.nousresearch.com": "nous",
|
|
|
|
|
|
"api.deepseek.com": "deepseek",
|
2026-03-22 08:15:06 -07:00
|
|
|
|
"api.githubcopilot.com": "copilot",
|
|
|
|
|
|
"models.github.ai": "copilot",
|
2026-03-30 20:37:08 -07:00
|
|
|
|
"api.fireworks.ai": "fireworks",
|
2026-04-02 19:59:19 -05:00
|
|
|
|
"opencode.ai": "opencode-go",
|
2026-04-10 12:51:30 +04:00
|
|
|
|
"api.x.ai": "xai",
|
feat(providers): add native NVIDIA NIM provider
Adds NVIDIA NIM as a first-class provider: ProviderConfig in
auth.py, HermesOverlay in providers.py, curated models
(Nemotron plus other open source models hosted on
build.nvidia.com), URL mapping in model_metadata.py, aliases
(nim, nvidia-nim, build-nvidia, nemotron), and env var tests.
Docs updated: providers page, quickstart table, fallback
providers table, and README provider list.
2026-04-17 09:55:58 -07:00
|
|
|
|
"integrate.api.nvidia.com": "nvidia",
|
feat(xiaomi): add Xiaomi MiMo as first-class provider
Cherry-picked from PR #7702 by kshitijk4poor.
Adds Xiaomi MiMo as a direct provider (XIAOMI_API_KEY) with models:
- mimo-v2-pro (1M context), mimo-v2-omni (256K, multimodal), mimo-v2-flash (256K, cheapest)
Standard OpenAI-compatible provider checklist: auth.py, config.py, models.py,
main.py, providers.py, doctor.py, model_normalize.py, model_metadata.py,
models_dev.py, auxiliary_client.py, .env.example, cli-config.yaml.example.
Follow-up: vision tasks use mimo-v2-omni (multimodal) instead of the user's
main model. Non-vision aux uses the user's selected model. Added
_PROVIDER_VISION_MODELS dict for provider-specific vision model overrides.
On failure, falls back to aggregators (gemini flash) via existing fallback chain.
Corrects pre-existing context lengths: mimo-v2-pro 1048576→1000000,
mimo-v2-omni 1048576→256000, adds mimo-v2-flash 256000.
36 tests covering registry, aliases, auto-detect, credentials, models.dev,
normalization, URL mapping, providers module, doctor, aux client, vision
model override, and agent init.
2026-04-11 10:10:31 -07:00
|
|
|
|
"api.xiaomimimo.com": "xiaomi",
|
|
|
|
|
|
"xiaomimimo.com": "xiaomi",
|
2026-04-15 22:32:05 -07:00
|
|
|
|
"ollama.com": "ollama-cloud",
|
2026-03-20 11:57:24 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _infer_provider_from_url(base_url: str) -> Optional[str]:
|
|
|
|
|
|
"""Infer the models.dev provider name from a base URL.
|
|
|
|
|
|
|
|
|
|
|
|
This allows context length resolution via models.dev for custom endpoints
|
|
|
|
|
|
like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to
|
|
|
|
|
|
explicitly set the provider name in config.
|
|
|
|
|
|
"""
|
2026-03-18 03:04:07 -07:00
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
if not normalized:
|
2026-03-20 11:57:24 -07:00
|
|
|
|
return None
|
2026-03-18 03:04:07 -07:00
|
|
|
|
parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}")
|
|
|
|
|
|
host = parsed.netloc.lower() or parsed.path.lower()
|
2026-03-20 11:57:24 -07:00
|
|
|
|
for url_part, provider in _URL_TO_PROVIDER.items():
|
|
|
|
|
|
if url_part in host:
|
|
|
|
|
|
return provider
|
|
|
|
|
|
return None
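# Illustrative behaviour (hypothetical inputs, not from the test suite):
#   _infer_provider_from_url("https://dashscope.aliyuncs.com/compatible-mode/v1") -> "alibaba"
#   _infer_provider_from_url("https://llm.internal.example.com/v1")               -> None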
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_known_provider_base_url(base_url: str) -> bool:
|
|
|
|
|
|
return _infer_provider_from_url(base_url) is not None
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
def is_local_endpoint(base_url: str) -> bool:
|
2026-04-13 06:17:13 +02:00
|
|
|
|
"""Return True if base_url points to a local machine.
|
|
|
|
|
|
|
|
|
|
|
|
Recognises loopback (``localhost``, ``127.0.0.0/8``, ``::1``),
|
|
|
|
|
|
container-internal DNS names (``host.docker.internal`` et al.),
|
|
|
|
|
|
RFC-1918 private ranges (``10/8``, ``172.16/12``, ``192.168/16``),
|
|
|
|
|
|
link-local, and Tailscale CGNAT (``100.64.0.0/10``). Tailscale CGNAT
|
|
|
|
|
|
is included so remote-but-trusted Ollama boxes reached over a
|
|
|
|
|
|
Tailscale mesh get the same timeout auto-bumps as localhost Ollama.
|
|
|
|
|
|
"""
|
2026-03-18 21:38:41 +01:00
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return False
|
|
|
|
|
|
url = normalized if "://" in normalized else f"http://{normalized}"
|
|
|
|
|
|
try:
|
|
|
|
|
|
parsed = urlparse(url)
|
|
|
|
|
|
host = parsed.hostname or ""
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
return False
|
|
|
|
|
|
if host in _LOCAL_HOSTS:
|
|
|
|
|
|
return True
|
2026-04-11 14:46:18 -07:00
|
|
|
|
# Docker / Podman / Lima internal DNS names (e.g. host.docker.internal)
|
|
|
|
|
|
if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES):
|
|
|
|
|
|
return True
|
2026-04-13 06:17:13 +02:00
|
|
|
|
# RFC-1918 private ranges, link-local, and Tailscale CGNAT
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
|
|
|
|
|
addr = ipaddress.ip_address(host)
|
2026-04-13 06:17:13 +02:00
|
|
|
|
if addr.is_private or addr.is_loopback or addr.is_link_local:
|
|
|
|
|
|
return True
|
|
|
|
|
|
if isinstance(addr, ipaddress.IPv4Address) and addr in _TAILSCALE_CGNAT:
|
|
|
|
|
|
return True
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
# Bare IP that looks like a private range (e.g. 172.26.x.x for WSL)
|
2026-04-13 06:17:13 +02:00
|
|
|
|
# or Tailscale CGNAT (100.64.x.x–100.127.x.x).
|
2026-03-18 21:38:41 +01:00
|
|
|
|
parts = host.split(".")
|
|
|
|
|
|
if len(parts) == 4:
|
|
|
|
|
|
try:
|
|
|
|
|
|
first, second = int(parts[0]), int(parts[1])
|
|
|
|
|
|
if first == 10:
|
|
|
|
|
|
return True
|
|
|
|
|
|
if first == 172 and 16 <= second <= 31:
|
|
|
|
|
|
return True
|
|
|
|
|
|
if first == 192 and second == 168:
|
|
|
|
|
|
return True
|
2026-04-13 06:17:13 +02:00
|
|
|
|
if first == 100 and 64 <= second <= 127:
|
|
|
|
|
|
return True
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
return False
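# Illustrative behaviour (hypothetical inputs):
#   is_local_endpoint("http://localhost:11434/v1")        -> True   (loopback)
#   is_local_endpoint("http://host.docker.internal:8080") -> True   (container DNS)
#   is_local_endpoint("http://100.72.0.9:11434")          -> True   (Tailscale CGNAT)
#   is_local_endpoint("https://api.openai.com/v1")        -> False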
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""Detect which local server is running at base_url by probing known endpoints.
|
|
|
|
|
|
|
2026-03-19 21:32:04 +01:00
|
|
|
|
Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
|
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
server_url = normalized
|
|
|
|
|
|
if server_url.endswith("/v1"):
|
|
|
|
|
|
server_url = server_url[:-3]
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
headers = _auth_headers(api_key)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
with httpx.Client(timeout=2.0, headers=headers) as client:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
# LM Studio exposes /api/v1/models — check first (most specific)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
r = client.get(f"{server_url}/api/v1/models")
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if r.status_code == 200:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
return "lm-studio"
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
2026-03-19 21:32:04 +01:00
|
|
|
|
# Ollama exposes /api/tags and responds with {"models": [...]}
|
|
|
|
|
|
# LM Studio returns {"error": "Unexpected endpoint"} with status 200
|
|
|
|
|
|
# on this path, so we must verify the response contains "models".
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
r = client.get(f"{server_url}/api/tags")
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if r.status_code == 200:
|
2026-03-19 21:32:04 +01:00
|
|
|
|
try:
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
if "models" in data:
|
|
|
|
|
|
return "ollama"
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
2026-03-21 18:07:18 -07:00
|
|
|
|
# llama.cpp exposes /v1/props (older builds used /props without the /v1 prefix)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-03-21 18:07:18 -07:00
|
|
|
|
r = client.get(f"{server_url}/v1/props")
|
|
|
|
|
|
if r.status_code != 200:
|
|
|
|
|
|
r = client.get(f"{server_url}/props") # fallback for older builds
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if r.status_code == 200 and "default_generation_settings" in r.text:
|
|
|
|
|
|
return "llamacpp"
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
# vLLM: /version
|
|
|
|
|
|
try:
|
|
|
|
|
|
r = client.get(f"{server_url}/version")
|
|
|
|
|
|
if r.status_code == 200:
|
|
|
|
|
|
data = r.json()
|
|
|
|
|
|
if "version" in data:
|
|
|
|
|
|
return "vllm"
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
return None
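# Illustrative usage (assumed local setup):
#   detect_local_server_type("http://localhost:11434")   -> "ollama"    (via /api/tags)
#   detect_local_server_type("http://localhost:1234/v1") -> "lm-studio" (via /api/v1/models)
# A non-responsive or unrecognised server yields None.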
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
def _iter_nested_dicts(value: Any):
|
|
|
|
|
|
if isinstance(value, dict):
|
|
|
|
|
|
yield value
|
|
|
|
|
|
for nested in value.values():
|
|
|
|
|
|
yield from _iter_nested_dicts(nested)
|
|
|
|
|
|
elif isinstance(value, list):
|
|
|
|
|
|
for item in value:
|
|
|
|
|
|
yield from _iter_nested_dicts(item)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _coerce_reasonable_int(value: Any, minimum: int = 1024, maximum: int = 10_000_000) -> Optional[int]:
|
|
|
|
|
|
try:
|
|
|
|
|
|
if isinstance(value, bool):
|
|
|
|
|
|
return None
|
|
|
|
|
|
if isinstance(value, str):
|
|
|
|
|
|
value = value.strip().replace(",", "")
|
|
|
|
|
|
result = int(value)
|
|
|
|
|
|
except (TypeError, ValueError):
|
|
|
|
|
|
return None
|
|
|
|
|
|
if minimum <= result <= maximum:
|
|
|
|
|
|
return result
|
|
|
|
|
|
return None
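# Illustrative behaviour:
#   _coerce_reasonable_int("131,072") -> 131072  (comma-separated strings accepted)
#   _coerce_reasonable_int(True)      -> None    (bools are ints, but never lengths)
#   _coerce_reasonable_int(512)       -> None    (below the 1024 floor)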
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_first_int(payload: Dict[str, Any], keys: tuple[str, ...]) -> Optional[int]:
|
|
|
|
|
|
keyset = {key.lower() for key in keys}
|
|
|
|
|
|
for mapping in _iter_nested_dicts(payload):
|
|
|
|
|
|
for key, value in mapping.items():
|
|
|
|
|
|
if str(key).lower() not in keyset:
|
|
|
|
|
|
continue
|
|
|
|
|
|
coerced = _coerce_reasonable_int(value)
|
|
|
|
|
|
if coerced is not None:
|
|
|
|
|
|
return coerced
|
|
|
|
|
|
return None
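# Illustrative behaviour: nested payloads are searched depth-first, e.g.
#   _extract_first_int({"meta": {"max_model_len": 32768}}, _CONTEXT_LENGTH_KEYS) -> 32768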
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_context_length(payload: Dict[str, Any]) -> Optional[int]:
|
|
|
|
|
|
return _extract_first_int(payload, _CONTEXT_LENGTH_KEYS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_max_completion_tokens(payload: Dict[str, Any]) -> Optional[int]:
|
|
|
|
|
|
return _extract_first_int(payload, _MAX_COMPLETION_KEYS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_pricing(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
alias_map = {
|
|
|
|
|
|
"prompt": ("prompt", "input", "input_cost_per_token", "prompt_token_cost"),
|
|
|
|
|
|
"completion": ("completion", "output", "output_cost_per_token", "completion_token_cost"),
|
|
|
|
|
|
"request": ("request", "request_cost"),
|
|
|
|
|
|
"cache_read": ("cache_read", "cached_prompt", "input_cache_read", "cache_read_cost_per_token"),
|
|
|
|
|
|
"cache_write": ("cache_write", "cache_creation", "input_cache_write", "cache_write_cost_per_token"),
|
|
|
|
|
|
}
|
|
|
|
|
|
for mapping in _iter_nested_dicts(payload):
|
|
|
|
|
|
normalized = {str(key).lower(): value for key, value in mapping.items()}
|
|
|
|
|
|
if not any(any(alias in normalized for alias in aliases) for aliases in alias_map.values()):
|
|
|
|
|
|
continue
|
|
|
|
|
|
pricing: Dict[str, Any] = {}
|
|
|
|
|
|
for target, aliases in alias_map.items():
|
|
|
|
|
|
for alias in aliases:
|
|
|
|
|
|
if alias in normalized and normalized[alias] not in (None, ""):
|
|
|
|
|
|
pricing[target] = normalized[alias]
|
|
|
|
|
|
break
|
|
|
|
|
|
if pricing:
|
|
|
|
|
|
return pricing
|
|
|
|
|
|
return {}
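# Illustrative behaviour with an OpenRouter-style pricing block; note the
# alias mapping ("output" is normalised to "completion"):
#   _extract_pricing({"pricing": {"prompt": "0.000001", "output": "0.000002"}})
#   -> {"prompt": "0.000001", "completion": "0.000002"}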
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _add_model_aliases(cache: Dict[str, Dict[str, Any]], model_id: str, entry: Dict[str, Any]) -> None:
|
|
|
|
|
|
cache[model_id] = entry
|
|
|
|
|
|
if "/" in model_id:
|
|
|
|
|
|
bare_model = model_id.split("/", 1)[1]
|
|
|
|
|
|
cache.setdefault(bare_model, entry)
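# Illustrative behaviour: "anthropic/claude-opus-4.6" also becomes reachable as
# "claude-opus-4.6"; setdefault() means a later prefixed model never clobbers an
# existing bare-name entry.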
|
|
|
|
|
|
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
|
|
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
|
|
|
|
|
|
"""Fetch model metadata from OpenRouter (cached for 1 hour)."""
|
|
|
|
|
|
global _model_metadata_cache, _model_metadata_cache_time
|
|
|
|
|
|
|
|
|
|
|
|
if not force_refresh and _model_metadata_cache and (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL:
|
|
|
|
|
|
return _model_metadata_cache
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
|
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
data = response.json()
|
|
|
|
|
|
|
|
|
|
|
|
cache = {}
|
|
|
|
|
|
for model in data.get("data", []):
|
|
|
|
|
|
model_id = model.get("id", "")
|
2026-03-18 03:04:07 -07:00
|
|
|
|
entry = {
|
2026-02-21 22:31:43 -08:00
|
|
|
|
"context_length": model.get("context_length", 128000),
|
|
|
|
|
|
"max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096),
|
|
|
|
|
|
"name": model.get("name", model_id),
|
|
|
|
|
|
"pricing": model.get("pricing", {}),
|
|
|
|
|
|
}
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_add_model_aliases(cache, model_id, entry)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
canonical = model.get("canonical_slug", "")
|
|
|
|
|
|
if canonical and canonical != model_id:
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_add_model_aliases(cache, canonical, entry)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
|
|
|
|
|
|
_model_metadata_cache = cache
|
|
|
|
|
|
_model_metadata_cache_time = time.time()
|
|
|
|
|
|
logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
|
|
|
|
|
|
return cache
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logging.warning(f"Failed to fetch model metadata from OpenRouter: {e}")
|
|
|
|
|
|
return _model_metadata_cache or {}
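# Illustrative usage (hypothetical model id and value):
#   meta = fetch_model_metadata()
#   meta.get("anthropic/claude-opus-4.6", {}).get("context_length")  # e.g. 200000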
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
def fetch_endpoint_model_metadata(
|
|
|
|
|
|
base_url: str,
|
|
|
|
|
|
api_key: str = "",
|
|
|
|
|
|
force_refresh: bool = False,
|
|
|
|
|
|
) -> Dict[str, Dict[str, Any]]:
|
|
|
|
|
|
"""Fetch model metadata from an OpenAI-compatible ``/models`` endpoint.
|
|
|
|
|
|
|
|
|
|
|
|
This is used for explicit custom endpoints where hardcoded global model-name
|
|
|
|
|
|
defaults are unreliable. Results are cached in memory per base URL.
|
|
|
|
|
|
"""
|
|
|
|
|
|
normalized = _normalize_base_url(base_url)
|
|
|
|
|
|
if not normalized or _is_openrouter_base_url(normalized):
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
if not force_refresh:
|
|
|
|
|
|
cached = _endpoint_model_metadata_cache.get(normalized)
|
|
|
|
|
|
cached_at = _endpoint_model_metadata_cache_time.get(normalized, 0)
|
|
|
|
|
|
if cached is not None and (time.time() - cached_at) < _ENDPOINT_MODEL_CACHE_TTL:
|
|
|
|
|
|
return cached
|
|
|
|
|
|
|
|
|
|
|
|
candidates = [normalized]
|
|
|
|
|
|
if normalized.endswith("/v1"):
|
|
|
|
|
|
alternate = normalized[:-3].rstrip("/")
|
|
|
|
|
|
else:
|
|
|
|
|
|
alternate = normalized + "/v1"
|
|
|
|
|
|
if alternate and alternate not in candidates:
|
|
|
|
|
|
candidates.append(alternate)
|
|
|
|
|
|
|
|
|
|
|
|
headers = _auth_headers(api_key)  # reuse the shared bearer-header helper
|
|
|
|
|
|
last_error: Optional[Exception] = None
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
if is_local_endpoint(normalized):
|
|
|
|
|
|
try:
|
|
|
|
|
|
if detect_local_server_type(normalized, api_key=api_key) == "lm-studio":
|
|
|
|
|
|
server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized
|
|
|
|
|
|
response = requests.get(
|
|
|
|
|
|
server_url.rstrip("/") + "/api/v1/models",
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
timeout=10,
|
|
|
|
|
|
)
|
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
payload = response.json()
|
|
|
|
|
|
cache: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
for model in payload.get("models", []):
|
|
|
|
|
|
if not isinstance(model, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
model_id = model.get("key") or model.get("id")
|
|
|
|
|
|
if not model_id:
|
|
|
|
|
|
continue
|
|
|
|
|
|
entry: Dict[str, Any] = {"name": model.get("name", model_id)}
|
|
|
|
|
|
|
|
|
|
|
|
context_length = None
|
|
|
|
|
|
for inst in model.get("loaded_instances", []) or []:
|
|
|
|
|
|
if not isinstance(inst, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
cfg = inst.get("config", {})
|
|
|
|
|
|
ctx = cfg.get("context_length") if isinstance(cfg, dict) else None
|
|
|
|
|
|
if isinstance(ctx, int) and ctx > 0:
|
|
|
|
|
|
context_length = ctx
|
|
|
|
|
|
break
|
|
|
|
|
|
if context_length is None:
|
|
|
|
|
|
context_length = _extract_context_length(model)
|
|
|
|
|
|
if context_length is not None:
|
|
|
|
|
|
entry["context_length"] = context_length
|
|
|
|
|
|
|
|
|
|
|
|
max_completion_tokens = _extract_max_completion_tokens(model)
|
|
|
|
|
|
if max_completion_tokens is not None:
|
|
|
|
|
|
entry["max_completion_tokens"] = max_completion_tokens
|
|
|
|
|
|
|
|
|
|
|
|
pricing = _extract_pricing(model)
|
|
|
|
|
|
if pricing:
|
|
|
|
|
|
entry["pricing"] = pricing
|
|
|
|
|
|
|
|
|
|
|
|
_add_model_aliases(cache, model_id, entry)
|
|
|
|
|
|
alt_id = model.get("id")
|
|
|
|
|
|
if isinstance(alt_id, str) and alt_id and alt_id != model_id:
|
|
|
|
|
|
_add_model_aliases(cache, alt_id, entry)
|
|
|
|
|
|
|
|
|
|
|
|
_endpoint_model_metadata_cache[normalized] = cache
|
|
|
|
|
|
_endpoint_model_metadata_cache_time[normalized] = time.time()
|
|
|
|
|
|
return cache
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
last_error = exc
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
for candidate in candidates:
|
|
|
|
|
|
url = candidate.rstrip("/") + "/models"
|
|
|
|
|
|
try:
|
|
|
|
|
|
response = requests.get(url, headers=headers, timeout=10)
|
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
payload = response.json()
|
|
|
|
|
|
cache: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
for model in payload.get("data", []):
|
|
|
|
|
|
if not isinstance(model, dict):
|
|
|
|
|
|
continue
|
|
|
|
|
|
model_id = model.get("id")
|
|
|
|
|
|
if not model_id:
|
|
|
|
|
|
continue
|
|
|
|
|
|
entry: Dict[str, Any] = {"name": model.get("name", model_id)}
|
|
|
|
|
|
context_length = _extract_context_length(model)
|
|
|
|
|
|
if context_length is not None:
|
|
|
|
|
|
entry["context_length"] = context_length
|
|
|
|
|
|
max_completion_tokens = _extract_max_completion_tokens(model)
|
|
|
|
|
|
if max_completion_tokens is not None:
|
|
|
|
|
|
entry["max_completion_tokens"] = max_completion_tokens
|
|
|
|
|
|
pricing = _extract_pricing(model)
|
|
|
|
|
|
if pricing:
|
|
|
|
|
|
entry["pricing"] = pricing
|
|
|
|
|
|
_add_model_aliases(cache, model_id, entry)
|
|
|
|
|
|
|
2026-03-19 06:01:16 -07:00
|
|
|
|
# If this is a llama.cpp server, query /props for actual allocated context
|
|
|
|
|
|
is_llamacpp = any(
|
|
|
|
|
|
m.get("owned_by") == "llamacpp"
|
|
|
|
|
|
for m in payload.get("data", []) if isinstance(m, dict)
|
|
|
|
|
|
)
|
|
|
|
|
|
if is_llamacpp:
|
|
|
|
|
|
try:
|
2026-03-21 18:07:18 -07:00
|
|
|
|
# Try /v1/props first (current llama.cpp); fall back to /props for older builds
|
|
|
|
|
|
base = candidate.rstrip("/").removesuffix("/v1")  # strip only a trailing /v1, not every occurrence
|
|
|
|
|
|
props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5)
|
|
|
|
|
|
if not props_resp.ok:
|
|
|
|
|
|
props_resp = requests.get(base + "/props", headers=headers, timeout=5)
|
2026-03-19 06:01:16 -07:00
|
|
|
|
if props_resp.ok:
|
|
|
|
|
|
props = props_resp.json()
|
|
|
|
|
|
gen_settings = props.get("default_generation_settings", {})
|
|
|
|
|
|
n_ctx = gen_settings.get("n_ctx")
|
|
|
|
|
|
model_alias = props.get("model_alias", "")
|
|
|
|
|
|
if n_ctx and model_alias and model_alias in cache:
|
|
|
|
|
|
cache[model_alias]["context_length"] = n_ctx
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
2026-03-18 03:04:07 -07:00
|
|
|
|
_endpoint_model_metadata_cache[normalized] = cache
|
|
|
|
|
|
_endpoint_model_metadata_cache_time[normalized] = time.time()
|
|
|
|
|
|
return cache
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
last_error = exc
|
|
|
|
|
|
|
|
|
|
|
|
if last_error:
|
|
|
|
|
|
logger.debug("Failed to fetch model metadata from %s/models: %s", normalized, last_error)
|
|
|
|
|
|
_endpoint_model_metadata_cache[normalized] = {}
|
|
|
|
|
|
_endpoint_model_metadata_cache_time[normalized] = time.time()
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
|
def _get_context_cache_path() -> Path:
|
|
|
|
|
|
"""Return path to the persistent context length cache file."""
|
refactor: replace inline HERMES_HOME re-implementations with get_hermes_home()
16 callsites across 14 files were re-deriving the hermes home path
via os.environ.get('HERMES_HOME', ...) instead of using the canonical
get_hermes_home() from hermes_constants. This breaks profiles — each
profile has its own HERMES_HOME, and the inline fallback defaults to
~/.hermes regardless.
Fixed by importing and calling get_hermes_home() at each site. For
files already inside the hermes process (agent/, hermes_cli/, tools/,
gateway/, plugins/), this is always safe. Files that run outside the
process context (mcp_serve.py, mcp_oauth.py) already had correct
try/except ImportError fallbacks and were left alone.
Skipped: hermes_constants.py (IS the implementation), env_loader.py
(bootstrap), profiles.py (intentionally manipulates the env var),
standalone scripts (optional-skills/, skills/), and tests.
2026-04-07 10:40:34 -07:00
|
|
|
|
from hermes_constants import get_hermes_home
|
|
|
|
|
|
return get_hermes_home() / "context_length_cache.yaml"
|
2026-03-05 16:09:57 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_context_cache() -> Dict[str, int]:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""Load the model+provider -> context_length cache from disk."""
|
2026-03-05 16:09:57 -08:00
|
|
|
|
path = _get_context_cache_path()
|
|
|
|
|
|
if not path.exists():
|
|
|
|
|
|
return {}
|
|
|
|
|
|
try:
|
|
|
|
|
|
with open(path) as f:
|
|
|
|
|
|
data = yaml.safe_load(f) or {}
|
|
|
|
|
|
return data.get("context_lengths", {})
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.debug("Failed to load context length cache: %s", e)
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_context_length(model: str, base_url: str, length: int) -> None:
|
|
|
|
|
|
"""Persist a discovered context length for a model+provider combo.
|
|
|
|
|
|
|
|
|
|
|
|
Cache key is ``model@base_url`` so the same model name served from
|
|
|
|
|
|
different providers can have different limits.
|
|
|
|
|
|
"""
|
|
|
|
|
|
key = f"{model}@{base_url}"
|
|
|
|
|
|
cache = _load_context_cache()
|
|
|
|
|
|
if cache.get(key) == length:
|
|
|
|
|
|
return # already stored
|
|
|
|
|
|
cache[key] = length
|
|
|
|
|
|
path = _get_context_cache_path()
|
|
|
|
|
|
try:
|
|
|
|
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
with open(path, "w") as f:
|
|
|
|
|
|
yaml.dump({"context_lengths": cache}, f, default_flow_style=False)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
logger.info("Cached context length %s -> %s tokens", key, f"{length:,}")
|
2026-03-05 16:09:57 -08:00
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.debug("Failed to save context length cache: %s", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
|
|
|
|
|
|
"""Look up a previously discovered context length for model+provider."""
|
|
|
|
|
|
key = f"{model}@{base_url}"
|
|
|
|
|
|
cache = _load_context_cache()
|
|
|
|
|
|
return cache.get(key)
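# Illustrative round trip (hypothetical values):
#   save_context_length("qwen3.5-plus", "https://dashscope.aliyuncs.com/v1", 1_000_000)
#   get_cached_context_length("qwen3.5-plus", "https://dashscope.aliyuncs.com/v1") -> 1000000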
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_next_probe_tier(current_length: int) -> Optional[int]:
|
|
|
|
|
|
"""Return the next lower probe tier, or None if already at minimum."""
|
|
|
|
|
|
for tier in CONTEXT_PROBE_TIERS:
|
|
|
|
|
|
if tier < current_length:
|
|
|
|
|
|
return tier
|
|
|
|
|
|
return None
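# Illustrative behaviour, assuming the descending 128K/64K/32K/16K/8K tiers
# from the overhaul commit:
#   get_next_probe_tier(200_000)                  -> the 128K tier (largest tier strictly below)
#   get_next_probe_tier(min(CONTEXT_PROBE_TIERS)) -> None (nothing lower remains)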
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
|
|
|
|
|
|
"""Try to extract the actual context limit from an API error message.
|
|
|
|
|
|
|
|
|
|
|
|
Many providers include the limit in their error text, e.g.:
|
|
|
|
|
|
- "maximum context length is 32768 tokens"
|
|
|
|
|
|
- "context_length_exceeded: 131072"
|
|
|
|
|
|
- "Maximum context size 32768 exceeded"
|
|
|
|
|
|
- "model's max context length is 65536"
|
|
|
|
|
|
"""
|
|
|
|
|
|
error_lower = error_msg.lower()
|
|
|
|
|
|
# Pattern: look for numbers near context-related keywords
|
|
|
|
|
|
patterns = [
|
|
|
|
|
|
r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
|
|
|
|
|
|
r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
|
|
|
|
|
|
r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
|
test: comprehensive tests for model metadata + firecrawl config
model_metadata tests (61 tests, was 39):
- Token estimation: concrete value assertions, unicode, tool_call messages,
vision multimodal content, additive verification
- Context length resolution: cache-over-API priority, no-base_url skips cache,
missing context_length key in API response
- API metadata fetch: canonical_slug aliasing, TTL expiry with time mock,
stale cache fallback on API failure, malformed JSON resilience
- Probe tiers: above-max returns 2M, zero returns None
- Error parsing: Anthropic format ('X > Y maximum'), LM Studio, empty string,
unreasonably large numbers — also fixed parser to handle Anthropic format
- Cache: corruption resilience (garbage YAML, wrong structure), value updates,
special chars in model names
Firecrawl config tests (8 tests, was 4):
- Singleton caching (core purpose — verified constructor called once)
- Constructor failure recovery (retry after exception)
- Return value actually asserted (not just constructor args)
- Empty string env vars treated as absent
- Proper setup/teardown for env var isolation
2026-03-05 18:22:39 -08:00
|
|
|
|
r'>\s*(\d{4,})\s*(?:max|limit|token)', # "250000 tokens > 200000 maximum"
|
|
|
|
|
|
r'(\d{4,})\s*(?:max(?:imum)?)\b', # "200000 maximum"
|
2026-03-05 16:09:57 -08:00
|
|
|
|
]
|
|
|
|
|
|
for pattern in patterns:
|
|
|
|
|
|
match = re.search(pattern, error_lower)
|
|
|
|
|
|
if match:
|
|
|
|
|
|
limit = int(match.group(1))
|
|
|
|
|
|
# Sanity check: must be a reasonable context length
|
|
|
|
|
|
if 1024 <= limit <= 10_000_000:
|
|
|
|
|
|
return limit
|
|
|
|
|
|
return None
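# Illustrative behaviour against the documented provider formats:
#   parse_context_limit_from_error("maximum context length is 32768 tokens") -> 32768
#   parse_context_limit_from_error("250000 tokens > 200000 maximum")         -> 200000
#   parse_context_limit_from_error("unrelated failure")                      -> None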
|
|
|
|
|
|
|
|
|
|
|
|
|
fix(compaction): don't halve context_length on output-cap-too-large errors
When the API returns "max_tokens too large given prompt" (input tokens
are within the context window, but input + requested output > window),
the old code incorrectly routed through the same handler as "prompt too
long" errors, calling get_next_probe_tier() and permanently halving
context_length. This made things worse: the window was fine, only the
requested output size needed trimming for that one call.
Two distinct error classes now handled separately:
Prompt too long — input itself exceeds context window.
Fix: compress history + halve context_length (existing behaviour,
unchanged).
Output cap too large — input OK, but input + max_tokens > window.
Fix: parse available_tokens from the error message, set a one-shot
_ephemeral_max_output_tokens override for the retry, and leave
context_length completely untouched.
Changes:
- agent/model_metadata.py: add parse_available_output_tokens_from_error()
that detects Anthropic's "available_tokens: N" error format and returns
the available output budget, or None for all other error types.
- run_agent.py: call the new parser first in the is_context_length_error
block; if it fires, set _ephemeral_max_output_tokens (with a 64-token
safety margin) and break to retry without touching context_length.
_build_api_kwargs consumes the ephemeral value exactly once then clears
it so subsequent calls use self.max_tokens normally.
- agent/anthropic_adapter.py: expand build_anthropic_kwargs docstring to
clearly document the max_tokens (output cap) vs context_length (total
window) distinction, which is a persistent source of confusion due to
the OpenAI-inherited "max_tokens" name.
- cli-config.yaml.example: add inline comments explaining both keys side
by side where users are most likely to look.
- website/docs/integrations/providers.md: add a callout box at the top
of "Context Length Detection" and clarify the troubleshooting entry.
- tests/test_ctx_halving_fix.py: 24 tests across four classes covering
the parser, build_anthropic_kwargs clamping, ephemeral one-shot
consumption, and the invariant that context_length is never mutated
on output-cap errors.
2026-04-09 16:54:23 +02:00
|
|
|
|
def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
|
|
|
|
|
|
"""Detect an "output cap too large" error and return how many output tokens are available.
|
|
|
|
|
|
|
|
|
|
|
|
Background — two distinct context errors exist:
|
|
|
|
|
|
1. "Prompt too long" — the INPUT itself exceeds the context window.
|
|
|
|
|
|
Fix: compress history and/or halve context_length.
|
|
|
|
|
|
2. "max_tokens too large" — input is fine, but input + requested_output > window.
|
|
|
|
|
|
Fix: reduce max_tokens (the output cap) for this call.
|
|
|
|
|
|
Do NOT touch context_length — the window hasn't shrunk.
|
|
|
|
|
|
|
|
|
|
|
|
Anthropic's API returns errors like:
|
|
|
|
|
|
"max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000"
|
|
|
|
|
|
|
|
|
|
|
|
Returns the number of output tokens that would fit (e.g. 10000 above), or None if
|
|
|
|
|
|
the error does not look like a max_tokens-too-large error.
|
|
|
|
|
|
"""
|
|
|
|
|
|
error_lower = error_msg.lower()
|
|
|
|
|
|
|
|
|
|
|
|
# Must look like an output-cap error, not a prompt-length error.
|
|
|
|
|
|
is_output_cap_error = (
|
|
|
|
|
|
"max_tokens" in error_lower
|
|
|
|
|
|
and ("available_tokens" in error_lower or "available tokens" in error_lower)
|
|
|
|
|
|
)
|
|
|
|
|
|
if not is_output_cap_error:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
# Extract the available_tokens figure.
|
|
|
|
|
|
# Anthropic format: "… = available_tokens: 10000"
|
|
|
|
|
|
patterns = [
|
|
|
|
|
|
r'available_tokens[:\s]+(\d+)',
|
|
|
|
|
|
r'available\s+tokens[:\s]+(\d+)',
|
|
|
|
|
|
# fallback: last number after "=" in expressions like "200000 - 190000 = 10000"
|
|
|
|
|
|
r'=\s*(\d+)\s*$',
|
|
|
|
|
|
]
|
|
|
|
|
|
for pattern in patterns:
|
|
|
|
|
|
match = re.search(pattern, error_lower)
|
|
|
|
|
|
if match:
|
|
|
|
|
|
tokens = int(match.group(1))
|
|
|
|
|
|
if tokens >= 1:
|
|
|
|
|
|
return tokens
|
|
|
|
|
|
return None
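# Illustrative behaviour with the Anthropic format from the docstring:
#   msg = ("max_tokens: 32768 > context_window: 200000 - "
#          "input_tokens: 190000 = available_tokens: 10000")
#   parse_available_output_tokens_from_error(msg)                -> 10000
#   parse_available_output_tokens_from_error("prompt is too long") -> None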
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
def _model_id_matches(candidate_id: str, lookup_model: str) -> bool:
|
|
|
|
|
|
"""Return True if *candidate_id* (from server) matches *lookup_model* (configured).
|
|
|
|
|
|
|
|
|
|
|
|
Supports two forms:
|
|
|
|
|
|
- Exact match: "nvidia-nemotron-super-49b-v1" == "nvidia-nemotron-super-49b-v1"
|
|
|
|
|
|
- Slug match: "nvidia/nvidia-nemotron-super-49b-v1" matches "nvidia-nemotron-super-49b-v1"
|
|
|
|
|
|
(the part after the last "/" equals lookup_model)
|
|
|
|
|
|
|
|
|
|
|
|
This covers LM Studio's native API which stores models as "publisher/slug"
|
|
|
|
|
|
while users typically configure only the slug after the "local:" prefix.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if candidate_id == lookup_model:
|
|
|
|
|
|
return True
|
|
|
|
|
|
# Slug match: basename of candidate equals the lookup name
|
|
|
|
|
|
if "/" in candidate_id and candidate_id.rsplit("/", 1)[1] == lookup_model:
|
|
|
|
|
|
return True
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Optional[int]:
|
2026-04-07 22:23:28 -07:00
|
|
|
|
"""Query an Ollama server for the model's context length.
|
|
|
|
|
|
|
|
|
|
|
|
Returns the model's maximum context from GGUF metadata via ``/api/show``,
|
|
|
|
|
|
or the explicit ``num_ctx`` from the Modelfile if set. Returns None if
|
|
|
|
|
|
the server is unreachable or not Ollama.
|
|
|
|
|
|
|
|
|
|
|
|
This is the value that should be passed as ``num_ctx`` in Ollama chat
|
|
|
|
|
|
requests to override the default 2048.
|
|
|
|
|
|
"""
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
|
|
|
|
|
|
bare_model = _strip_provider_prefix(model)
|
|
|
|
|
|
server_url = base_url.rstrip("/")
|
|
|
|
|
|
if server_url.endswith("/v1"):
|
|
|
|
|
|
server_url = server_url[:-3]
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
server_type = detect_local_server_type(base_url, api_key=api_key)
|
2026-04-07 22:23:28 -07:00
|
|
|
|
except Exception:
|
|
|
|
|
|
return None
|
|
|
|
|
|
if server_type != "ollama":
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
headers = _auth_headers(api_key)
|
|
|
|
|
|
|
2026-04-07 22:23:28 -07:00
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
with httpx.Client(timeout=3.0, headers=headers) as client:
|
2026-04-07 22:23:28 -07:00
|
|
|
|
resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
return None
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
|
|
|
|
|
|
# Prefer explicit num_ctx from Modelfile parameters (user override)
|
|
|
|
|
|
params = data.get("parameters", "")
|
|
|
|
|
|
if "num_ctx" in params:
|
|
|
|
|
|
for line in params.split("\n"):
|
|
|
|
|
|
if "num_ctx" in line:
|
|
|
|
|
|
parts = line.strip().split()
|
|
|
|
|
|
if len(parts) >= 2:
|
|
|
|
|
|
try:
|
|
|
|
|
|
return int(parts[-1])
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
# Fall back to GGUF model_info context_length (training max)
|
|
|
|
|
|
model_info = data.get("model_info", {})
|
|
|
|
|
|
for key, value in model_info.items():
|
|
|
|
|
|
if "context_length" in key and isinstance(value, (int, float)):
|
|
|
|
|
|
return int(value)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
return None
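# Illustrative usage (assumed local Ollama whose Modelfile contains
# "PARAMETER num_ctx 16384"; the "local:" prefix is stripped, tag colons kept):
#   query_ollama_num_ctx("local:qwen3.5:27b", "http://localhost:11434/v1") -> 16384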
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
"""Query a local server for the model's context length."""
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
|
2026-03-20 03:19:31 -07:00
|
|
|
|
# Strip recognised provider prefix (e.g., "local:model-name" → "model-name").
|
|
|
|
|
|
# Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved.
|
|
|
|
|
|
model = _strip_provider_prefix(model)
|
2026-03-18 22:00:53 +01:00
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# Strip /v1 suffix to get the server root
|
|
|
|
|
|
server_url = base_url.rstrip("/")
|
|
|
|
|
|
if server_url.endswith("/v1"):
|
|
|
|
|
|
server_url = server_url[:-3]
|
|
|
|
|
|
|
2026-04-20 20:49:44 -07:00
|
|
|
|
headers = _auth_headers(api_key)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
server_type = detect_local_server_type(base_url, api_key=api_key)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
except Exception:
|
|
|
|
|
|
server_type = None
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
2026-04-20 20:49:44 -07:00
|
|
|
|
with httpx.Client(timeout=3.0, headers=headers) as client:
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# Ollama: /api/show returns model details with context info
|
|
|
|
|
|
if server_type == "ollama":
|
|
|
|
|
|
resp = client.post(f"{server_url}/api/show", json={"name": model})
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
2026-04-13 11:41:45 +02:00
|
|
|
|
# Prefer explicit num_ctx from Modelfile parameters: this is
|
|
|
|
|
|
# the *runtime* context Ollama will actually allocate KV cache
|
|
|
|
|
|
# for. The GGUF model_info.context_length is the training max,
|
|
|
|
|
|
# which can be larger than num_ctx — using it here would let
|
|
|
|
|
|
# Hermes grow conversations past the runtime limit and Ollama
|
|
|
|
|
|
# would silently truncate. Matches query_ollama_num_ctx().
|
2026-03-18 21:38:41 +01:00
|
|
|
|
params = data.get("parameters", "")
|
|
|
|
|
|
if "num_ctx" in params:
|
|
|
|
|
|
for line in params.split("\n"):
|
|
|
|
|
|
if "num_ctx" in line:
|
|
|
|
|
|
parts = line.strip().split()
|
|
|
|
|
|
if len(parts) >= 2:
|
|
|
|
|
|
try:
|
|
|
|
|
|
return int(parts[-1])
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
pass
|
2026-04-13 11:41:45 +02:00
|
|
|
|
# Fall back to GGUF model_info context_length (training max)
|
|
|
|
|
|
model_info = data.get("model_info", {})
|
|
|
|
|
|
for key, value in model_info.items():
|
|
|
|
|
|
if "context_length" in key and isinstance(value, (int, float)):
|
|
|
|
|
|
return int(value)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
# LM Studio native API: /api/v1/models returns max_context_length.
|
|
|
|
|
|
# This is more reliable than the OpenAI-compat /v1/models which
|
|
|
|
|
|
# doesn't include context window information for LM Studio servers.
|
|
|
|
|
|
# Use _model_id_matches for fuzzy matching: LM Studio stores models as
|
|
|
|
|
|
# "publisher/slug" but users configure only "slug" after "local:" prefix.
|
|
|
|
|
|
if server_type == "lm-studio":
|
|
|
|
|
|
resp = client.get(f"{server_url}/api/v1/models")
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
for m in data.get("models", []):
|
|
|
|
|
|
if _model_id_matches(m.get("key", ""), model) or _model_id_matches(m.get("id", ""), model):
|
|
|
|
|
|
# Prefer loaded instance context (actual runtime value)
|
|
|
|
|
|
for inst in m.get("loaded_instances", []):
|
|
|
|
|
|
cfg = inst.get("config", {})
|
|
|
|
|
|
ctx = cfg.get("context_length")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
# Fall back to max_context_length (theoretical model max)
|
|
|
|
|
|
ctx = m.get("max_context_length") or m.get("context_length")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
|
2026-03-18 21:38:41 +01:00
|
|
|
|
# LM Studio / vLLM / llama.cpp: try /v1/models/{model}
|
|
|
|
|
|
resp = client.get(f"{server_url}/v1/models/{model}")
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
# vLLM returns max_model_len
|
|
|
|
|
|
ctx = data.get("max_model_len") or data.get("context_length") or data.get("max_tokens")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
# Try /v1/models and find the model in the list.
|
|
|
|
|
|
# Use _model_id_matches to handle "publisher/slug" vs bare "slug".
|
2026-03-18 21:38:41 +01:00
|
|
|
|
resp = client.get(f"{server_url}/v1/models")
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
models_list = data.get("data", [])
|
|
|
|
|
|
for m in models_list:
|
2026-03-18 22:00:53 +01:00
|
|
|
|
if _model_id_matches(m.get("id", ""), model):
|
2026-03-18 21:38:41 +01:00
|
|
|
|
ctx = m.get("max_model_len") or m.get("context_length") or m.get("max_tokens")
|
|
|
|
|
|
if ctx and isinstance(ctx, (int, float)):
|
|
|
|
|
|
return int(ctx)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
feat: overhaul context length detection with models.dev and provider-aware resolution (#2158)
Replace the fragile hardcoded context length system with a multi-source
resolution chain that correctly identifies context windows per provider.
Key changes:
- New agent/models_dev.py: Fetches and caches the models.dev registry
(3800+ models across 100+ providers with per-provider context windows).
In-memory cache (1hr TTL) + disk cache for cold starts.
- Rewritten get_model_context_length() resolution chain:
0. Config override (model.context_length)
1. Custom providers per-model context_length
2. Persistent disk cache
3. Endpoint /models (local servers)
4. Anthropic /v1/models API (max_input_tokens, API-key only)
5. OpenRouter live API (existing, unchanged)
6. Nous suffix-match via OpenRouter (dot/dash normalization)
7. models.dev registry lookup (provider-aware)
8. Thin hardcoded defaults (broad family patterns)
9. 128K fallback (was 2M)
- Provider-aware context: same model now correctly resolves to different
context windows per provider (e.g. claude-opus-4.6: 1M on Anthropic,
128K on GitHub Copilot). Provider name flows through ContextCompressor.
- DEFAULT_CONTEXT_LENGTHS shrunk from 80+ entries to ~16 broad patterns.
models.dev replaces the per-model hardcoding.
- CONTEXT_PROBE_TIERS changed from [2M, 1M, 512K, 200K, 128K, 64K, 32K]
to [128K, 64K, 32K, 16K, 8K]. Unknown models no longer start at 2M.
- hermes model: prompts for context_length when configuring custom
endpoints. Supports shorthand (32k, 128K). Saved to custom_providers
per-model config.
- custom_providers schema extended with optional models dict for
per-model context_length (backward compatible).
- Nous Portal: suffix-matches bare IDs (claude-opus-4-6) against
OpenRouter's prefixed IDs (anthropic/claude-opus-4.6) with dot/dash
normalization. Handles all 15 current Nous models.
- Anthropic direct: queries /v1/models for max_input_tokens. Only works
with regular API keys (sk-ant-api*), not OAuth tokens. Falls through
to models.dev for OAuth users.
Tests: 5574 passed (18 new tests for models_dev + updated probe tiers)
Docs: Updated configuration.md context length section, AGENTS.md
Co-authored-by: Test <test@test.com>
2026-03-20 06:04:33 -07:00
|
|
|
|
def _normalize_model_version(model: str) -> str:
|
|
|
|
|
|
"""Normalize version separators for matching.
|
|
|
|
|
|
|
|
|
|
|
|
Nous uses dashes: claude-opus-4-6, claude-sonnet-4-5
|
|
|
|
|
|
OpenRouter uses dots: claude-opus-4.6, claude-sonnet-4.5
|
|
|
|
|
|
Normalize both to dashes for comparison.
|
|
|
|
|
|
"""
|
|
|
|
|
|
return model.replace(".", "-")
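# Example: _normalize_model_version("claude-opus-4.6") == "claude-opus-4-6",
# so Nous's dashed IDs and OpenRouter's dotted IDs compare equal once both
# sides are normalized.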
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _query_anthropic_context_length(model: str, base_url: str, api_key: str) -> Optional[int]:
|
|
|
|
|
|
"""Query Anthropic's /v1/models endpoint for context length.
|
|
|
|
|
|
|
|
|
|
|
|
Only works with regular ANTHROPIC_API_KEY (sk-ant-api*).
|
|
|
|
|
|
OAuth tokens (sk-ant-oat*) from Claude Code return 401.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if not api_key or api_key.startswith("sk-ant-oat"):
|
|
|
|
|
|
return None # OAuth tokens can't access /v1/models
|
|
|
|
|
|
try:
|
|
|
|
|
|
base = base_url.rstrip("/")
|
|
|
|
|
|
if base.endswith("/v1"):
|
|
|
|
|
|
base = base[:-3]
|
|
|
|
|
|
url = f"{base}/v1/models?limit=1000"
|
|
|
|
|
|
headers = {
|
|
|
|
|
|
"x-api-key": api_key,
|
|
|
|
|
|
"anthropic-version": "2023-06-01",
|
|
|
|
|
|
}
|
|
|
|
|
|
resp = requests.get(url, headers=headers, timeout=10)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
return None
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
for m in data.get("data", []):
|
|
|
|
|
|
if m.get("id") == model:
|
|
|
|
|
|
ctx = m.get("max_input_tokens")
|
|
|
|
|
|
if isinstance(ctx, int) and ctx > 0:
|
|
|
|
|
|
return ctx
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.debug("Anthropic /v1/models query failed: %s", e)
|
|
|
|
|
|
return None
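# Illustrative /v1/models response item (fields assumed from the parsing
# above, values hypothetical):
#   {"id": "claude-opus-4-6", "max_input_tokens": 1000000, ...}
# Only an exact "id" match is accepted here; fuzzy matching is deliberately
# left to the Nous/models.dev steps.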
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_nous_context_length(model: str) -> Optional[int]:
|
|
|
|
|
|
"""Resolve Nous Portal model context length via OpenRouter metadata.
|
|
|
|
|
|
|
|
|
|
|
|
Nous model IDs are bare (e.g. 'claude-opus-4-6') while OpenRouter uses
|
|
|
|
|
|
prefixed IDs (e.g. 'anthropic/claude-opus-4.6'). Try suffix matching
|
|
|
|
|
|
with version normalization (dot↔dash).
|
|
|
|
|
|
"""
|
|
|
|
|
|
metadata = fetch_model_metadata() # OpenRouter cache
|
|
|
|
|
|
# Exact match first
|
|
|
|
|
|
if model in metadata:
|
|
|
|
|
|
return metadata[model].get("context_length")
|
|
|
|
|
|
|
|
|
|
|
|
normalized = _normalize_model_version(model).lower()
|
|
|
|
|
|
|
|
|
|
|
|
for or_id, entry in metadata.items():
|
|
|
|
|
|
bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
|
|
|
|
|
|
if bare.lower() == model.lower() or _normalize_model_version(bare).lower() == normalized:
|
|
|
|
|
|
return entry.get("context_length")
|
|
|
|
|
|
|
|
|
|
|
|
# Partial prefix match for cases like gemini-3-flash → gemini-3-flash-preview
|
|
|
|
|
|
# Require match to be at a word boundary (followed by -, :, or end of string)
|
|
|
|
|
|
model_lower = model.lower()
|
|
|
|
|
|
for or_id, entry in metadata.items():
|
|
|
|
|
|
bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
|
|
|
|
|
|
for candidate, query in [(bare.lower(), model_lower), (_normalize_model_version(bare).lower(), normalized)]:
|
|
|
|
|
|
if candidate.startswith(query) and (
|
|
|
|
|
|
len(candidate) == len(query) or candidate[len(query)] in "-:."
|
|
|
|
|
|
):
|
|
|
|
|
|
return entry.get("context_length")
|
|
|
|
|
|
|
|
|
|
|
|
return None
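# Example walk-through (IDs from the docstring above): "claude-opus-4-6"
# misses the exact lookup, then suffix-matches "anthropic/claude-opus-4.6"
# because both normalize to "claude-opus-4-6", and that entry's
# context_length is returned.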
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-19 06:01:16 -07:00
|
|
|
|
def get_model_context_length(
|
|
|
|
|
|
model: str,
|
|
|
|
|
|
base_url: str = "",
|
|
|
|
|
|
api_key: str = "",
|
|
|
|
|
|
config_context_length: int | None = None,
|
2026-03-20 06:04:33 -07:00
|
|
|
|
provider: str = "",
|
2026-03-19 06:01:16 -07:00
|
|
|
|
) -> int:
|
2026-03-05 16:09:57 -08:00
|
|
|
|
"""Get the context length for a model.
|
|
|
|
|
|
|
|
|
|
|
|
Resolution order:
|
2026-03-20 06:04:33 -07:00
|
|
|
|
0. Explicit config override (model.context_length or custom_providers per-model)
|
2026-03-05 16:09:57 -08:00
|
|
|
|
1. Persistent cache (previously discovered via probing)
|
2026-03-18 03:04:07 -07:00
|
|
|
|
2. Active endpoint metadata (/models for explicit custom endpoints)
|
2026-03-20 06:04:33 -07:00
|
|
|
|
3. Local server query (for local endpoints)
|
|
|
|
|
|
4. Anthropic /v1/models API (API-key users only, not OAuth)
|
4b. AWS Bedrock static context-length table (bedrock_adapter.py)
|
5. Provider-aware lookups: Nous suffix-match via OpenRouter cache, then
models.dev registry (per-provider context windows)
|
6. OpenRouter live API metadata (provider-unaware fallback)
|
7. Thin hardcoded defaults (broad family patterns)
|
8. Local server query (last-resort retry for local endpoints)
|
9. Default fallback (128K)
|
2026-03-05 16:09:57 -08:00
|
|
|
|
"""
|
2026-03-19 06:01:16 -07:00
|
|
|
|
# 0. Explicit config override — user knows best
|
|
|
|
|
|
if isinstance(config_context_length, int) and config_context_length > 0:
|
|
|
|
|
|
return config_context_length
|
|
|
|
|
|
|
2026-03-18 22:00:53 +01:00
|
|
|
|
# Normalise provider-prefixed model names (e.g. "local:model-name" →
|
|
|
|
|
|
# "model-name") so cache lookups and server queries use the bare ID that
|
2026-03-20 03:19:31 -07:00
|
|
|
|
# local servers actually know about. Ollama "model:tag" colons are preserved.
|
|
|
|
|
|
model = _strip_provider_prefix(model)
|
2026-03-18 22:00:53 +01:00
|
|
|
|
|
2026-03-05 16:09:57 -08:00
|
|
|
|
# 1. Check persistent cache (model+provider)
|
|
|
|
|
|
if base_url:
|
|
|
|
|
|
cached = get_cached_context_length(model, base_url)
|
|
|
|
|
|
if cached is not None:
|
|
|
|
|
|
return cached
|
|
|
|
|
|
|
2026-03-22 08:15:06 -07:00
|
|
|
|
# 2. Active endpoint metadata for truly custom/unknown endpoints.
|
|
|
|
|
|
# Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
|
|
|
|
|
|
# /models endpoint may report a provider-imposed limit (e.g. Copilot
|
|
|
|
|
|
# returns 128k) instead of the model's full context (400k). models.dev
|
|
|
|
|
|
# has the correct per-provider values and is checked at step 5+.
|
|
|
|
|
|
if _is_custom_endpoint(base_url) and not _is_known_provider_base_url(base_url):
|
2026-03-18 03:04:07 -07:00
|
|
|
|
endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
|
2026-03-19 06:01:16 -07:00
|
|
|
|
matched = endpoint_metadata.get(model)
|
|
|
|
|
|
if not matched:
|
|
|
|
|
|
# Single-model servers: if only one model is loaded, use it
|
|
|
|
|
|
if len(endpoint_metadata) == 1:
|
|
|
|
|
|
matched = next(iter(endpoint_metadata.values()))
|
|
|
|
|
|
else:
|
|
|
|
|
|
# Fuzzy match: substring in either direction
|
|
|
|
|
|
for key, entry in endpoint_metadata.items():
|
|
|
|
|
|
if model in key or key in model:
|
|
|
|
|
|
matched = entry
|
|
|
|
|
|
break
|
|
|
|
|
|
if matched:
|
|
|
|
|
|
context_length = matched.get("context_length")
|
2026-03-18 03:04:07 -07:00
|
|
|
|
if isinstance(context_length, int):
|
|
|
|
|
|
return context_length
|
|
|
|
|
|
if not _is_known_provider_base_url(base_url):
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 3. Try querying local server directly
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if is_local_endpoint(base_url):
|
2026-04-20 20:49:44 -07:00
|
|
|
|
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if local_ctx and local_ctx > 0:
|
|
|
|
|
|
save_context_length(model, base_url, local_ctx)
|
|
|
|
|
|
return local_ctx
|
2026-03-19 06:01:16 -07:00
|
|
|
|
logger.info(
|
|
|
|
|
|
"Could not detect context length for model %r at %s — "
|
|
|
|
|
|
"defaulting to %s tokens (probe-down). Set model.context_length "
|
|
|
|
|
|
"in config.yaml to override.",
|
2026-03-20 06:04:33 -07:00
|
|
|
|
model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}",
|
2026-03-19 06:01:16 -07:00
|
|
|
|
)
|
2026-03-20 06:04:33 -07:00
|
|
|
|
return DEFAULT_FALLBACK_CONTEXT
|
2026-03-18 03:04:07 -07:00
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 4. Anthropic /v1/models API (only for regular API keys, not OAuth)
|
|
|
|
|
|
if provider == "anthropic" or (
|
fix: extend hostname-match provider detection across remaining call sites
Aslaaen's fix in the original PR covered _detect_api_mode_for_url and the
two openai/xai sites in run_agent.py. This finishes the sweep: the same
substring-match false-positive class (e.g. https://api.openai.com.evil/v1,
https://proxy/api.openai.com/v1, https://api.anthropic.com.example/v1)
existed in eight more call sites, and the hostname helper was duplicated
in two modules.
- utils: add shared base_url_hostname() (single source of truth).
- hermes_cli/runtime_provider, run_agent: drop local duplicates, import
from utils. Reuse the cached AIAgent._base_url_hostname attribute
everywhere it's already populated.
- agent/auxiliary_client: switch codex-wrap auto-detect, max_completion_tokens
gate (auxiliary_max_tokens_param), and custom-endpoint max_tokens kwarg
selection to hostname equality.
- run_agent: native-anthropic check in the Claude-style model branch
and in the AIAgent init provider-auto-detect branch.
- agent/model_metadata: Anthropic /v1/models context-length lookup.
- hermes_cli/providers.determine_api_mode: anthropic / openai URL
heuristics for custom/unknown providers (the /anthropic path-suffix
convention for third-party gateways is preserved).
- tools/delegate_tool: anthropic detection for delegated subagent
runtimes.
- hermes_cli/setup, hermes_cli/tools_config: setup-wizard vision-endpoint
native-OpenAI detection (paired with deduping the repeated check into
a single is_native_openai boolean per branch).
Tests:
- tests/test_base_url_hostname.py covers the helper directly
(path-containing-host, host-suffix, trailing dot, port, case).
- tests/hermes_cli/test_determine_api_mode_hostname.py adds the same
regression class for determine_api_mode, plus a test that the
/anthropic third-party gateway convention still wins.
Also: add asslaenn5@gmail.com → Aslaaen to scripts/release.py AUTHOR_MAP.
2026-04-20 20:58:01 -07:00
|
|
|
|
base_url and base_url_hostname(base_url) == "api.anthropic.com"
|
2026-03-20 06:04:33 -07:00
|
|
|
|
):
|
|
|
|
|
|
ctx = _query_anthropic_context_length(model, base_url or "https://api.anthropic.com", api_key)
|
|
|
|
|
|
if ctx:
|
|
|
|
|
|
return ctx
|
|
|
|
|
|
|
feat: native AWS Bedrock provider via Converse API
Salvaged from PR #7920 by JiaDe-Wu — cherry-picked Bedrock-specific
additions onto current main, skipping stale-branch reverts (293 commits
behind).
Dual-path architecture:
- Claude models → AnthropicBedrock SDK (prompt caching, thinking budgets)
- Non-Claude models → Converse API via boto3 (Nova, DeepSeek, Llama, Mistral)
Includes:
- Core adapter (agent/bedrock_adapter.py, 1098 lines)
- Full provider registration (auth, models, providers, config, runtime, main)
- IAM credential chain + Bedrock API Key auth modes
- Dynamic model discovery via ListFoundationModels + ListInferenceProfiles
- Streaming with delta callbacks, error classification, guardrails
- hermes doctor + hermes auth integration
- /usage pricing for 7 Bedrock models
- 130 automated tests (79 unit + 28 integration + follow-up fixes)
- Documentation (website/docs/guides/aws-bedrock.md)
- boto3 optional dependency (pip install hermes-agent[bedrock])
Co-authored-by: JiaDe WU <40445668+JiaDe-Wu@users.noreply.github.com>
2026-04-15 15:18:01 -07:00
|
|
|
|
# 4b. AWS Bedrock — use static context length table.
|
|
|
|
|
|
# Bedrock's ListFoundationModels doesn't expose context window sizes,
|
|
|
|
|
|
# so we maintain a curated table in bedrock_adapter.py.
|
2026-04-20 21:17:28 -07:00
|
|
|
|
if provider == "bedrock" or (
|
|
|
|
|
|
base_url
|
|
|
|
|
|
and base_url_hostname(base_url).startswith("bedrock-runtime.")
|
|
|
|
|
|
and base_url_host_matches(base_url, "amazonaws.com")
|
|
|
|
|
|
):
|
2026-04-15 15:18:01 -07:00
|
|
|
|
try:
|
|
|
|
|
|
from agent.bedrock_adapter import get_bedrock_context_length
|
|
|
|
|
|
return get_bedrock_context_length(model)
|
|
|
|
|
|
except ImportError:
|
|
|
|
|
|
pass # boto3 not installed — fall through to generic resolution
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 5. Provider-aware lookups (before generic OpenRouter cache)
|
|
|
|
|
|
# These are provider-specific and take priority over the generic OR cache,
|
|
|
|
|
|
# since the same model can have different context limits per provider
|
|
|
|
|
|
# (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
|
fix: infer provider from base URL for models.dev context length lookup
Custom endpoint users (DashScope/Alibaba, Z.AI, Kimi, DeepSeek, etc.)
get wrong context lengths because their provider resolves as "openrouter"
or "custom", skipping the models.dev lookup entirely. For example,
qwen3.5-plus on DashScope falls to the generic "qwen" hardcoded default
(131K) instead of the correct 1M.
Add _infer_provider_from_url() that maps known API hostnames to their
models.dev provider IDs. When the explicit provider is generic
(openrouter/custom/empty), infer from the base URL before the models.dev
lookup. This resolves context lengths correctly for DashScope, Z.AI,
Kimi, MiniMax, DeepSeek, and Nous endpoints without requiring users to
manually set context_length in config.
Also refactors _is_known_provider_base_url() to use the same URL mapping,
removing the duplicated hostname list.
2026-03-20 11:57:24 -07:00
|
|
|
|
# If provider is generic (openrouter/custom/empty), try to infer from URL.
|
|
|
|
|
|
effective_provider = provider
|
|
|
|
|
|
if not effective_provider or effective_provider in ("openrouter", "custom"):
|
|
|
|
|
|
if base_url:
|
|
|
|
|
|
inferred = _infer_provider_from_url(base_url)
|
|
|
|
|
|
if inferred:
|
|
|
|
|
|
effective_provider = inferred
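# e.g. (hostname-to-provider mapping illustrative): a DashScope base_url with
# provider "custom" infers an Alibaba provider ID, so qwen3.5-plus resolves to
# its 1M models.dev entry instead of the generic 131K "qwen" hardcoded default.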
|
|
|
|
|
|
|
|
|
|
|
|
if effective_provider == "nous":
|
2026-03-20 06:04:33 -07:00
|
|
|
|
ctx = _resolve_nous_context_length(model)
|
|
|
|
|
|
if ctx:
|
|
|
|
|
|
return ctx
|
2026-03-20 11:57:24 -07:00
|
|
|
|
if effective_provider:
|
2026-03-20 06:04:33 -07:00
|
|
|
|
from agent.models_dev import lookup_models_dev_context
|
2026-03-20 11:57:24 -07:00
|
|
|
|
ctx = lookup_models_dev_context(effective_provider, model)
|
2026-03-20 06:04:33 -07:00
|
|
|
|
if ctx:
|
|
|
|
|
|
return ctx
|
|
|
|
|
|
|
|
|
|
|
|
# 6. OpenRouter live API metadata (provider-unaware fallback)
|
2026-02-21 22:31:43 -08:00
|
|
|
|
metadata = fetch_model_metadata()
|
|
|
|
|
|
if model in metadata:
|
|
|
|
|
|
return metadata[model].get("context_length", 128000)
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 7. Hardcoded defaults (fuzzy match — longest key first for specificity)
|
2026-03-20 08:52:37 -07:00
|
|
|
|
# Only check `default_model in model` (is the key a substring of the input).
|
|
|
|
|
|
# The reverse (`model in default_model`) causes shorter names like
|
|
|
|
|
|
# "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
|
2026-03-21 10:47:44 -07:00
|
|
|
|
model_lower = model.lower()
|
2026-03-17 04:12:08 -07:00
|
|
|
|
for default_model, length in sorted(
|
|
|
|
|
|
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
|
|
|
|
|
|
):
|
2026-03-21 10:47:44 -07:00
|
|
|
|
if default_model in model_lower:
|
2026-02-21 22:31:43 -08:00
|
|
|
|
return length
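# Hypothetical-key example: if DEFAULT_CONTEXT_LENGTHS held both
# "claude-sonnet-4" and "claude-sonnet-4-6", the longest-first sort tries
# "claude-sonnet-4-6" first, so "claude-sonnet-4-6-20260115" hits the more
# specific entry rather than the broader family pattern.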
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|
# 8. Query local server as last resort
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if base_url and is_local_endpoint(base_url):
|
2026-04-20 20:49:44 -07:00
|
|
|
|
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
2026-03-18 21:38:41 +01:00
|
|
|
|
if local_ctx and local_ctx > 0:
|
|
|
|
|
|
save_context_length(model, base_url, local_ctx)
|
|
|
|
|
|
return local_ctx
|
|
|
|
|
|
|
2026-03-20 06:04:33 -07:00
|
|
|
|

    # 10. Default fallback — 128K
    return DEFAULT_FALLBACK_CONTEXT
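

# Illustrative resolution outcomes (hypothetical keyword name — the provider
# argument's exact spelling may differ from the real signature):
#
#     get_model_context_length("claude-opus-4.6", provider="anthropic")
#     -> 1_000_000   # provider-aware models.dev entry
#     get_model_context_length("claude-opus-4.6", provider="github-copilot")
#     -> 128_000     # same model, different provider window
#     get_model_context_length("totally-unknown-model")
#     -> DEFAULT_FALLBACK_CONTEXT  # 128K fallback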


def estimate_tokens_rough(text: str) -> int:
    """Rough token estimate (~4 chars/token) for pre-flight checks.

    Uses ceiling division so short texts (1-3 chars) never estimate as
    0 tokens, which would cause the compressor and pre-flight checks to
    systematically undercount when many short tool results are present.
    """
    if not text:
        return 0
    return (len(text) + 3) // 4
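
# Worked examples of the ceiling rule:
#
#     estimate_tokens_rough("")      -> 0  (empty short-circuit)
#     estimate_tokens_rough("ok")    -> 1  ((2 + 3) // 4)
#     estimate_tokens_rough("abcd")  -> 1  ((4 + 3) // 4)
#     estimate_tokens_rough("abcde") -> 2  ((5 + 3) // 4)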


def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Rough token estimate for a message list (pre-flight only)."""
    total_chars = sum(len(str(msg)) for msg in messages)
    return (total_chars + 3) // 4
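
# len(str(msg)) counts the dict's keys, quotes, and punctuation along with
# the content, so the estimate carries a little structural slack:
#
#     estimate_messages_tokens_rough([{"role": "user", "content": "hi"}])
#     -> 9   # str(...) is 33 chars, and (33 + 3) // 4 == 9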


def estimate_request_tokens_rough(
    messages: List[Dict[str, Any]],
    *,
    system_prompt: str = "",
    tools: Optional[List[Dict[str, Any]]] = None,
) -> int:
    """Rough token estimate for a full chat-completions request.

    Includes the major payload buckets Hermes sends to providers:
    system prompt, conversation messages, and tool schemas. With 50+
    tools enabled, schemas alone can add 20-30K tokens — a significant
    blind spot when only counting messages.
    """
    total_chars = 0
    if system_prompt:
        total_chars += len(system_prompt)
    if messages:
        total_chars += sum(len(str(msg)) for msg in messages)
    if tools:
        total_chars += len(str(tools))
    return (total_chars + 3) // 4
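
# The ceiling is applied once to the combined character count rather than per
# bucket, so this can come out up to two tokens below the sum of the three
# helpers called separately. Hypothetical usage sketch (the tool-schema shape
# here is illustrative, not Hermes's real schema):
#
#     estimate_request_tokens_rough(
#         [{"role": "user", "content": "hello"}],
#         system_prompt="You are a helpful assistant.",
#         tools=[{"type": "function", "function": {"name": "read_file"}}],
#     )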