diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 359b05c426..6ba9355053 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -46,6 +46,7 @@ class ContextCompressor:
         summary_model_override: str = None,
         base_url: str = "",
         api_key: str = "",
+        config_context_length: int | None = None,
     ):
         self.model = model
         self.base_url = base_url
@@ -56,7 +57,10 @@ class ContextCompressor:
         self.summary_target_tokens = summary_target_tokens
         self.quiet_mode = quiet_mode
 
-        self.context_length = get_model_context_length(model, base_url=base_url, api_key=api_key)
+        self.context_length = get_model_context_length(
+            model, base_url=base_url, api_key=api_key,
+            config_context_length=config_context_length,
+        )
         self.threshold_tokens = int(self.context_length * threshold_percent)
         self.compression_count = 0
         self._context_probed = False  # True after a step-down from context error
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 8283e8d32f..0a448990d3 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -136,6 +136,8 @@ _CONTEXT_LENGTH_KEYS = (
     "max_input_tokens",
     "max_sequence_length",
     "max_seq_len",
+    "n_ctx_train",
+    "n_ctx",
 )
 
 _MAX_COMPLETION_KEYS = (
@@ -342,6 +344,25 @@ def fetch_endpoint_model_metadata(
             entry["pricing"] = pricing
         _add_model_aliases(cache, model_id, entry)
 
+    # If this is a llama.cpp server, query /props for actual allocated context
+    is_llamacpp = any(
+        m.get("owned_by") == "llamacpp"
+        for m in payload.get("data", []) if isinstance(m, dict)
+    )
+    if is_llamacpp:
+        try:
+            props_url = candidate.rstrip("/").replace("/v1", "") + "/props"
+            props_resp = requests.get(props_url, headers=headers, timeout=5)
+            if props_resp.ok:
+                props = props_resp.json()
+                gen_settings = props.get("default_generation_settings", {})
+                n_ctx = gen_settings.get("n_ctx")
+                model_alias = props.get("model_alias", "")
+                if n_ctx and model_alias and model_alias in cache:
+                    cache[model_alias]["context_length"] = n_ctx
+        except Exception:
+            pass
+
     _endpoint_model_metadata_cache[normalized] = cache
     _endpoint_model_metadata_cache_time[normalized] = time.time()
     return cache
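The `/props` probe above reads `default_generation_settings.n_ctx`, which on a llama.cpp server reports the context actually allocated at startup rather than the model's trained maximum (`n_ctx_train`, also newly accepted in `_CONTEXT_LENGTH_KEYS`). A minimal standalone sketch of the same lookup, assuming llama.cpp's response shape and an illustrative local URL:

```python
import requests

def probe_llamacpp_context(base_url: str) -> int | None:
    """Standalone sketch of the /props lookup in the hunk above.

    Assumes a llama.cpp server response shaped roughly like:
      {"default_generation_settings": {"n_ctx": 32768},
       "model_alias": "Qwen3.5-9B-Q4_K_M.gguf", ...}
    """
    # /props is served at the root, not under /v1
    props_url = base_url.rstrip("/").replace("/v1", "") + "/props"
    try:
        resp = requests.get(props_url, timeout=5)
    except requests.RequestException:
        return None
    if not resp.ok:
        return None
    return resp.json().get("default_generation_settings", {}).get("n_ctx")

# Illustrative: probe_llamacpp_context("http://localhost:8080/v1") -> 32768
```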
@@ -439,16 +460,26 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     return None
 
 
-def get_model_context_length(model: str, base_url: str = "", api_key: str = "") -> int:
+def get_model_context_length(
+    model: str,
+    base_url: str = "",
+    api_key: str = "",
+    config_context_length: int | None = None,
+) -> int:
     """Get the context length for a model.
 
     Resolution order:
+    0. Explicit config override (model.context_length in config.yaml)
     1. Persistent cache (previously discovered via probing)
     2. Active endpoint metadata (/models for explicit custom endpoints)
     3. OpenRouter API metadata
     4. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match for hosted routes only)
     5. First probe tier (2M) — will be narrowed on first context error
     """
+    # 0. Explicit config override — user knows best
+    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
+        return config_context_length
+
     # 1. Check persistent cache (model+provider)
     if base_url:
         cached = get_cached_context_length(model, base_url)
@@ -458,13 +489,30 @@ def get_model_context_length(model: str, base_url: str = "", api_key: str = "")
     # 2. Active endpoint metadata for explicit custom routes
     if _is_custom_endpoint(base_url):
         endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
-        if model in endpoint_metadata:
-            context_length = endpoint_metadata[model].get("context_length")
+        matched = endpoint_metadata.get(model)
+        if not matched:
+            # Single-model servers: if only one model is loaded, use it
+            if len(endpoint_metadata) == 1:
+                matched = next(iter(endpoint_metadata.values()))
+            else:
+                # Fuzzy match: substring in either direction
+                for key, entry in endpoint_metadata.items():
+                    if model in key or key in model:
+                        matched = entry
+                        break
+        if matched:
+            context_length = matched.get("context_length")
             if isinstance(context_length, int):
                 return context_length
         if not _is_known_provider_base_url(base_url):
             # Explicit third-party endpoints should not borrow fuzzy global
             # defaults from unrelated providers with similarly named models.
+            logger.info(
+                "Could not detect context length for model %r at %s — "
+                "defaulting to %s tokens (probe-down). Set model.context_length "
+                "in config.yaml to override.",
+                model, base_url, f"{CONTEXT_PROBE_TIERS[0]:,}",
+            )
             return CONTEXT_PROBE_TIERS[0]
 
     # 3. OpenRouter API metadata
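The reworked step 2 is easiest to follow in isolation: exact id first, then the single-model fallback, then a substring match in either direction. A sketch with illustrative entries mirroring the new tests (here the fuzzy pass simply runs whenever the earlier steps fail, which is behaviourally the same as the `else` branch above):

```python
def match_endpoint_entry(model: str, metadata: dict) -> dict | None:
    """Sketch of the endpoint-metadata lookup chain from the hunk above."""
    matched = metadata.get(model)              # exact id
    if not matched and len(metadata) == 1:     # single-model server
        matched = next(iter(metadata.values()))
    if not matched:                            # substring, either direction
        for key, entry in metadata.items():
            if model in key or key in model:
                matched = entry
                break
    return matched

# Single-model fallback: the names never substring-match, so only the
# "one model loaded" rule can resolve this.
assert match_endpoint_entry(
    "qwen3.5:9b",
    {"Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072}},
) == {"context_length": 131072}

# Fuzzy match: the configured name is a substring of the served id.
assert match_endpoint_entry(
    "llama-3.3-70b-instruct",
    {"org/llama-3.3-70b-instruct-fp8": {"context_length": 131072},
     "org/qwen-2.5-72b": {"context_length": 32768}},
) == {"context_length": 131072}
```

The substring pass takes the first hit, so on servers exposing several similarly named variants the exact served id in config remains the safe choice.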
ctx_str = f" [dim {dim}]·[/] [dim {dim}]{_format_context_length(context_length)} context[/]" if context_length else "" diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py index 6bfa5ae8a6..ed9e285404 100644 --- a/hermes_cli/runtime_provider.py +++ b/hermes_cli/runtime_provider.py @@ -24,11 +24,41 @@ def _normalize_custom_provider_name(value: str) -> str: return value.strip().lower().replace(" ", "-") +def _auto_detect_local_model(base_url: str) -> str: + """Query a local server for its model name when only one model is loaded.""" + if not base_url: + return "" + try: + import requests + url = base_url.rstrip("/") + if not url.endswith("/v1"): + url += "/v1" + resp = requests.get(url + "/models", timeout=5) + if resp.ok: + models = resp.json().get("data", []) + if len(models) == 1: + model_id = models[0].get("id", "") + if model_id: + return model_id + except Exception: + pass + return "" + + def _get_model_config() -> Dict[str, Any]: config = load_config() model_cfg = config.get("model") if isinstance(model_cfg, dict): - return dict(model_cfg) + cfg = dict(model_cfg) + default = cfg.get("default", "").strip() + base_url = cfg.get("base_url", "").strip() + is_local = "localhost" in base_url or "127.0.0.1" in base_url + is_fallback = not default or default == "anthropic/claude-opus-4.6" + if is_local and is_fallback and base_url: + detected = _auto_detect_local_model(base_url) + if detected: + cfg["default"] = detected + return cfg if isinstance(model_cfg, str) and model_cfg.strip(): return {"default": model_cfg.strip()} return {} diff --git a/run_agent.py b/run_agent.py index 58d0cdeeda..76d4ffcf45 100644 --- a/run_agent.py +++ b/run_agent.py @@ -969,6 +969,18 @@ class AIAgent: compression_threshold = float(_compression_cfg.get("threshold", 0.50)) compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes") compression_summary_model = _compression_cfg.get("summary_model") or None + + # Read explicit context_length override from model config + _model_cfg = _agent_cfg.get("model", {}) + if isinstance(_model_cfg, dict): + _config_context_length = _model_cfg.get("context_length") + else: + _config_context_length = None + if _config_context_length is not None: + try: + _config_context_length = int(_config_context_length) + except (TypeError, ValueError): + _config_context_length = None self.context_compressor = ContextCompressor( model=self.model, @@ -980,6 +992,7 @@ class AIAgent: quiet_mode=self.quiet_mode, base_url=self.base_url, api_key=getattr(self, "api_key", ""), + config_context_length=_config_context_length, ) self.compression_enabled = compression_enabled self._user_turn_count = 0 diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py index aa35be9b93..a733a03c6e 100644 --- a/tests/agent/test_model_metadata.py +++ b/tests/agent/test_model_metadata.py @@ -218,6 +218,79 @@ class TestGetModelContextLength: assert result == CONTEXT_PROBE_TIERS[0] + @patch("agent.model_metadata.fetch_model_metadata") + @patch("agent.model_metadata.fetch_endpoint_model_metadata") + def test_custom_endpoint_single_model_fallback(self, mock_endpoint_fetch, mock_fetch): + """Single-model servers: use the only model even if name doesn't match.""" + mock_fetch.return_value = {} + mock_endpoint_fetch.return_value = { + "Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072} + } + + result = get_model_context_length( + "qwen3.5:9b", + base_url="http://myserver.example.com:8080/v1", + api_key="test-key", + ) + + assert result == 
diff --git a/run_agent.py b/run_agent.py
index 58d0cdeeda..76d4ffcf45 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -969,6 +969,18 @@ class AIAgent:
         compression_threshold = float(_compression_cfg.get("threshold", 0.50))
         compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
         compression_summary_model = _compression_cfg.get("summary_model") or None
+
+        # Read explicit context_length override from model config
+        _model_cfg = _agent_cfg.get("model", {})
+        if isinstance(_model_cfg, dict):
+            _config_context_length = _model_cfg.get("context_length")
+        else:
+            _config_context_length = None
+        if _config_context_length is not None:
+            try:
+                _config_context_length = int(_config_context_length)
+            except (TypeError, ValueError):
+                _config_context_length = None
 
         self.context_compressor = ContextCompressor(
             model=self.model,
@@ -980,6 +992,7 @@ class AIAgent:
             quiet_mode=self.quiet_mode,
             base_url=self.base_url,
             api_key=getattr(self, "api_key", ""),
+            config_context_length=_config_context_length,
         )
         self.compression_enabled = compression_enabled
         self._user_turn_count = 0
diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py
index aa35be9b93..a733a03c6e 100644
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@@ -218,6 +218,79 @@ class TestGetModelContextLength:
 
         assert result == CONTEXT_PROBE_TIERS[0]
 
+    @patch("agent.model_metadata.fetch_model_metadata")
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    def test_custom_endpoint_single_model_fallback(self, mock_endpoint_fetch, mock_fetch):
+        """Single-model servers: use the only model even if name doesn't match."""
+        mock_fetch.return_value = {}
+        mock_endpoint_fetch.return_value = {
+            "Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072}
+        }
+
+        result = get_model_context_length(
+            "qwen3.5:9b",
+            base_url="http://myserver.example.com:8080/v1",
+            api_key="test-key",
+        )
+
+        assert result == 131072
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    def test_custom_endpoint_fuzzy_substring_match(self, mock_endpoint_fetch, mock_fetch):
+        """Fuzzy match: configured model name is substring of endpoint model."""
+        mock_fetch.return_value = {}
+        mock_endpoint_fetch.return_value = {
+            "org/llama-3.3-70b-instruct-fp8": {"context_length": 131072},
+            "org/qwen-2.5-72b": {"context_length": 32768},
+        }
+
+        result = get_model_context_length(
+            "llama-3.3-70b-instruct",
+            base_url="http://myserver.example.com:8080/v1",
+            api_key="test-key",
+        )
+
+        assert result == 131072
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_overrides_all(self, mock_fetch):
+        """Explicit config_context_length takes priority over everything."""
+        mock_fetch.return_value = {
+            "test/model": {"context_length": 200000}
+        }
+
+        result = get_model_context_length(
+            "test/model",
+            config_context_length=65536,
+        )
+
+        assert result == 65536
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_zero_is_ignored(self, mock_fetch):
+        """config_context_length=0 should be treated as unset."""
+        mock_fetch.return_value = {}
+
+        result = get_model_context_length(
+            "anthropic/claude-sonnet-4",
+            config_context_length=0,
+        )
+
+        assert result == 200000
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_none_is_ignored(self, mock_fetch):
+        """config_context_length=None should be treated as unset."""
+        mock_fetch.return_value = {}
+
+        result = get_model_context_length(
+            "anthropic/claude-sonnet-4",
+            config_context_length=None,
+        )
+
+        assert result == 200000
+
 
 # =========================================================================
 # fetch_model_metadata — caching, TTL, slugs, failures
diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md
index 02396556a5..1d47f19229 100644
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -414,6 +414,29 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo
 
 ---
 
+### Context Length Detection
+
+Hermes automatically detects your model's context length by querying the endpoint's `/v1/models` response. For most setups this works out of the box. If detection fails (the model name doesn't match, the endpoint doesn't expose `/v1/models`, etc.), Hermes falls back to a high default and probes downward on context-length errors.
+
+To set the context length explicitly, add `context_length` to your model config:
+
+```yaml
+model:
+  default: "qwen3.5:9b"
+  base_url: "http://localhost:8080/v1"
+  context_length: 131072  # tokens
+```
+
+This takes highest priority — it overrides auto-detection, cached values, and hardcoded defaults.
+
+:::tip When to set this manually
+- Your model shows "2M context" in the status bar (detection failed)
+- You want to limit context below the model's maximum (e.g., 8k on a 128k model to save VRAM)
+- You're running behind a proxy that doesn't expose `/v1/models`
+:::
+
+---
+
 ### Choosing the Right Setup
 
 | Use Case | Recommended |
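Taken together, a short sanity check of the documented resolution order; model names and the endpoint are illustrative, and the second call assumes a reachable local server:

```python
from agent.model_metadata import get_model_context_length

# 0. An explicit, positive integer override wins unconditionally.
assert get_model_context_length("qwen3.5:9b", config_context_length=65536) == 65536

# Zero/None overrides are ignored; detection proceeds through the cache,
# endpoint metadata, OpenRouter, hardcoded defaults, then the probe tier.
length = get_model_context_length(
    "qwen3.5:9b",
    base_url="http://localhost:8080/v1",  # illustrative local endpoint
    config_context_length=0,              # treated as unset
)
print(f"resolved context length: {length:,} tokens")
```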