fix: detect context length for custom model endpoints via fuzzy matching + config override (#2051)

* fix: detect context length for custom model endpoints via fuzzy matching + config override

Custom model endpoints (non-OpenRouter, non-known-provider) were silently
falling back to a 2M-token context length when the model name didn't exactly
match what the endpoint's /v1/models reported. This happened because:

1. Endpoint metadata lookup used exact match only — model name mismatches
   (e.g. 'qwen3.5:9b' vs 'Qwen3.5-9B-Q4_K_M.gguf') caused a miss
2. Single-model servers (common for local inference) required exact name
   match even though only one model was loaded
3. No user escape hatch to manually set context length
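
A tiny illustration of the failure mode in (1), with hypothetical metadata:

    # Previous lookup was an exact dict hit, so any name variant missed:
    metadata = {"Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 32768}}
    metadata.get("qwen3.5:9b")  # -> None, so we fell through to the 2M probe tier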

Changes:
- Add fuzzy matching for endpoint model metadata: single-model servers
  use the only available model regardless of name; multi-model servers
  try substring matching in both directions
- Add model.context_length config override (highest priority) so users
  can explicitly set their model's context length in config.yaml
- Log an informative message when falling back to the 2M probe tier,
  telling users about the config override option
- Thread config_context_length through ContextCompressor and AIAgent init

Tests: 6 new tests covering fuzzy matching, single-model fallback, and the
config override (including zero/None edge cases).
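
For reference, the override is a single key in config.yaml; the nesting
below follows the dotted model.context_length path, and the value is just
an example:

    model:
      context_length: 32768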

* fix: auto-detect local model name and context length for local servers

Cherry-picked from PR #2043 by sudoingX.

- Auto-detect model name from local server's /v1/models when only one
  model is loaded (no manual model name config needed)
- Add n_ctx_train and n_ctx to context length detection keys for llama.cpp
- Query llama.cpp /props endpoint for actual allocated context (not just
  training context from GGUF metadata)
- Strip .gguf suffix from display in banner and status bar
- Add _auto_detect_local_model() in runtime_provider.py for CLI init
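
A minimal sketch of the single-model auto-detection idea, assuming an
OpenAI-compatible /v1/models route (not the exact runtime_provider.py
implementation; the signature and headers here are illustrative):

    import requests

    def _auto_detect_local_model(base_url: str, api_key: str = "") -> str | None:
        # Return the lone model id served by a local /v1/models endpoint, else None.
        headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
        try:
            resp = requests.get(base_url.rstrip("/") + "/models", headers=headers, timeout=5)
            resp.raise_for_status()
            models = [m.get("id") for m in resp.json().get("data", []) if isinstance(m, dict)]
        except Exception:
            return None
        # Only safe to auto-pick when exactly one model is loaded
        return models[0] if len(models) == 1 and models[0] else None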

Co-authored-by: sudo <sudoingx@users.noreply.github.com>

* fix: revert accidental summary_target_tokens change + add docs for context_length config

- Revert summary_target_tokens from 2500 back to 500 (accidental change
  during patching)
- Add 'Context Length Detection' section to Custom & Self-Hosted docs
  explaining model.context_length config override

---------

Co-authored-by: Test <test@test.com>
Co-authored-by: sudo <sudoingx@users.noreply.github.com>
Author: Teknium
Date: 2026-03-19 06:01:16 -07:00 (committed by GitHub)
Parent: 7b6d14e62a
Commit: d76fa7fc37
8 changed files with 208 additions and 5 deletions


@@ -46,6 +46,7 @@ class ContextCompressor:
         summary_model_override: str = None,
         base_url: str = "",
         api_key: str = "",
+        config_context_length: int | None = None,
     ):
         self.model = model
         self.base_url = base_url
@@ -56,7 +57,10 @@ class ContextCompressor:
         self.summary_target_tokens = summary_target_tokens
         self.quiet_mode = quiet_mode
-        self.context_length = get_model_context_length(model, base_url=base_url, api_key=api_key)
+        self.context_length = get_model_context_length(
+            model, base_url=base_url, api_key=api_key,
+            config_context_length=config_context_length,
+        )
         self.threshold_tokens = int(self.context_length * threshold_percent)
         self.compression_count = 0
         self._context_probed = False  # True after a step-down from context error

@@ -136,6 +136,8 @@ _CONTEXT_LENGTH_KEYS = (
"max_input_tokens",
"max_sequence_length",
"max_seq_len",
"n_ctx_train",
"n_ctx",
)
_MAX_COMPLETION_KEYS = (
@@ -342,6 +344,25 @@ def fetch_endpoint_model_metadata(
entry["pricing"] = pricing
_add_model_aliases(cache, model_id, entry)
# If this is a llama.cpp server, query /props for actual allocated context
is_llamacpp = any(
m.get("owned_by") == "llamacpp"
for m in payload.get("data", []) if isinstance(m, dict)
)
if is_llamacpp:
try:
props_url = candidate.rstrip("/").replace("/v1", "") + "/props"
props_resp = requests.get(props_url, headers=headers, timeout=5)
if props_resp.ok:
props = props_resp.json()
gen_settings = props.get("default_generation_settings", {})
n_ctx = gen_settings.get("n_ctx")
model_alias = props.get("model_alias", "")
if n_ctx and model_alias and model_alias in cache:
cache[model_alias]["context_length"] = n_ctx
except Exception:
pass
_endpoint_model_metadata_cache[normalized] = cache
_endpoint_model_metadata_cache_time[normalized] = time.time()
return cache
@@ -439,16 +460,26 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     return None
-def get_model_context_length(model: str, base_url: str = "", api_key: str = "") -> int:
+def get_model_context_length(
+    model: str,
+    base_url: str = "",
+    api_key: str = "",
+    config_context_length: int | None = None,
+) -> int:
     """Get the context length for a model.
     Resolution order:
+    0. Explicit config override (model.context_length in config.yaml)
     1. Persistent cache (previously discovered via probing)
     2. Active endpoint metadata (/models for explicit custom endpoints)
     3. OpenRouter API metadata
     4. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match for hosted routes only)
     5. First probe tier (2M) — will be narrowed on first context error
     """
+    # 0. Explicit config override — user knows best
+    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
+        return config_context_length
     # 1. Check persistent cache (model+provider)
     if base_url:
         cached = get_cached_context_length(model, base_url)
@@ -458,13 +489,30 @@ def get_model_context_length(model: str, base_url: str = "", api_key: str = "")
     # 2. Active endpoint metadata for explicit custom routes
     if _is_custom_endpoint(base_url):
         endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
-        if model in endpoint_metadata:
-            context_length = endpoint_metadata[model].get("context_length")
+        matched = endpoint_metadata.get(model)
+        if not matched:
+            # Single-model servers: if only one model is loaded, use it
+            if len(endpoint_metadata) == 1:
+                matched = next(iter(endpoint_metadata.values()))
+            else:
+                # Fuzzy match: substring in either direction
+                for key, entry in endpoint_metadata.items():
+                    if model in key or key in model:
+                        matched = entry
+                        break
+        if matched:
+            context_length = matched.get("context_length")
             if isinstance(context_length, int):
                 return context_length
         if not _is_known_provider_base_url(base_url):
             # Explicit third-party endpoints should not borrow fuzzy global
             # defaults from unrelated providers with similarly named models.
+            logger.info(
+                "Could not detect context length for model %r at %s; "
+                "defaulting to %s tokens (probe-down). Set model.context_length "
+                "in config.yaml to override.",
+                model, base_url, f"{CONTEXT_PROBE_TIERS[0]:,}",
+            )
             return CONTEXT_PROBE_TIERS[0]
     # 3. OpenRouter API metadata