mirror of https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 15:01:34 +08:00

Compare commits: skill/gith... → fix/custom... (3 commits)

- d7bc0e1d03
- 2a3a374c78
- 0cee97c500
@@ -46,6 +46,7 @@ class ContextCompressor:
         summary_model_override: str = None,
         base_url: str = "",
         api_key: str = "",
+        config_context_length: int | None = None,
     ):
         self.model = model
         self.base_url = base_url

@@ -56,7 +57,10 @@ class ContextCompressor:
         self.summary_target_tokens = summary_target_tokens
         self.quiet_mode = quiet_mode

-        self.context_length = get_model_context_length(model, base_url=base_url, api_key=api_key)
+        self.context_length = get_model_context_length(
+            model, base_url=base_url, api_key=api_key,
+            config_context_length=config_context_length,
+        )
         self.threshold_tokens = int(self.context_length * threshold_percent)
         self.compression_count = 0
         self._context_probed = False  # True after a step-down from context error
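A minimal usage sketch of the new parameter. The import path and the constructor's other arguments are assumptions, not shown in this hunk; only `config_context_length` and its effect on `context_length` come from the diff:

```python
# Hypothetical import path; the hunk above does not name the module.
from agent.context_compressor import ContextCompressor

# With an explicit override, get_model_context_length() returns it directly,
# so no endpoint probing or metadata lookup is needed.
compressor = ContextCompressor(
    model="qwen3.5:9b",
    base_url="http://localhost:8080/v1",  # illustrative local server
    config_context_length=131072,         # forwarded to get_model_context_length
)
assert compressor.context_length == 131072
```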
@@ -136,6 +136,8 @@ _CONTEXT_LENGTH_KEYS = (
     "max_input_tokens",
     "max_sequence_length",
     "max_seq_len",
+    "n_ctx_train",
+    "n_ctx",
 )

 _MAX_COMPLETION_KEYS = (
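The two new keys cover llama.cpp-style metadata (`n_ctx_train` is the model's trained context, `n_ctx` the allocated one). A runnable sketch of how such a key tuple is typically scanned; the helper name is hypothetical, the real consumer lives elsewhere in the module:

```python
_CONTEXT_LENGTH_KEYS = (
    "max_input_tokens",
    "max_sequence_length",
    "max_seq_len",
    "n_ctx_train",  # llama.cpp: context length the model was trained with
    "n_ctx",        # llama.cpp: context actually allocated by the server
)

def first_context_length(entry: dict) -> int | None:
    """Hypothetical consumer: first positive int among the known keys."""
    for key in _CONTEXT_LENGTH_KEYS:
        value = entry.get(key)
        if isinstance(value, int) and value > 0:
            return value
    return None

print(first_context_length({"n_ctx_train": 131072}))  # 131072
```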
@@ -342,6 +344,25 @@ def fetch_endpoint_model_metadata(
             entry["pricing"] = pricing
             _add_model_aliases(cache, model_id, entry)

+        # If this is a llama.cpp server, query /props for actual allocated context
+        is_llamacpp = any(
+            m.get("owned_by") == "llamacpp"
+            for m in payload.get("data", []) if isinstance(m, dict)
+        )
+        if is_llamacpp:
+            try:
+                props_url = candidate.rstrip("/").replace("/v1", "") + "/props"
+                props_resp = requests.get(props_url, headers=headers, timeout=5)
+                if props_resp.ok:
+                    props = props_resp.json()
+                    gen_settings = props.get("default_generation_settings", {})
+                    n_ctx = gen_settings.get("n_ctx")
+                    model_alias = props.get("model_alias", "")
+                    if n_ctx and model_alias and model_alias in cache:
+                        cache[model_alias]["context_length"] = n_ctx
+            except Exception:
+                pass
+
        _endpoint_model_metadata_cache[normalized] = cache
        _endpoint_model_metadata_cache_time[normalized] = time.time()
        return cache
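A standalone sketch of the same probe: llama.cpp's `/props` endpoint does expose `default_generation_settings.n_ctx` and `model_alias`, but the URL, return shape, and error handling here are illustrative rather than the repo's code:

```python
import requests

def probe_llamacpp_context(base_url: str) -> tuple[str, int] | None:
    """Return (model_alias, n_ctx) from a llama.cpp server, or None."""
    # /props lives at the server root, not under /v1
    props_url = base_url.rstrip("/").replace("/v1", "") + "/props"
    try:
        resp = requests.get(props_url, timeout=5)
        if resp.ok:
            props = resp.json()
            n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
            alias = props.get("model_alias", "")
            if n_ctx and alias:
                return alias, n_ctx
    except requests.RequestException:
        pass
    return None

# e.g. probe_llamacpp_context("http://localhost:8080/v1")
# -> ("Qwen3.5-9B-Q4_K_M", 131072) for a server started with `-c 131072`
# (alias and size illustrative)
```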
@@ -439,16 +460,26 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     return None


-def get_model_context_length(model: str, base_url: str = "", api_key: str = "") -> int:
+def get_model_context_length(
+    model: str,
+    base_url: str = "",
+    api_key: str = "",
+    config_context_length: int | None = None,
+) -> int:
     """Get the context length for a model.

     Resolution order:
+    0. Explicit config override (model.context_length in config.yaml)
     1. Persistent cache (previously discovered via probing)
     2. Active endpoint metadata (/models for explicit custom endpoints)
     3. OpenRouter API metadata
     4. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match for hosted routes only)
     5. First probe tier (2M) — will be narrowed on first context error
     """
+    # 0. Explicit config override — user knows best
+    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
+        return config_context_length
+
     # 1. Check persistent cache (model+provider)
     if base_url:
         cached = get_cached_context_length(model, base_url)

@@ -458,13 +489,30 @@ def get_model_context_length(model: str, base_url: str = "", api_key: str = "")
     # 2. Active endpoint metadata for explicit custom routes
     if _is_custom_endpoint(base_url):
         endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
-        if model in endpoint_metadata:
-            context_length = endpoint_metadata[model].get("context_length")
+        matched = endpoint_metadata.get(model)
+        if not matched:
+            # Single-model servers: if only one model is loaded, use it
+            if len(endpoint_metadata) == 1:
+                matched = next(iter(endpoint_metadata.values()))
+            else:
+                # Fuzzy match: substring in either direction
+                for key, entry in endpoint_metadata.items():
+                    if model in key or key in model:
+                        matched = entry
+                        break
+        if matched:
+            context_length = matched.get("context_length")
             if isinstance(context_length, int):
                 return context_length
         if not _is_known_provider_base_url(base_url):
             # Explicit third-party endpoints should not borrow fuzzy global
             # defaults from unrelated providers with similarly named models.
+            logger.info(
+                "Could not detect context length for model %r at %s — "
+                "defaulting to %s tokens (probe-down). Set model.context_length "
+                "in config.yaml to override.",
+                model, base_url, f"{CONTEXT_PROBE_TIERS[0]:,}",
+            )
             return CONTEXT_PROBE_TIERS[0]

     # 3. OpenRouter API metadata
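The new matching cascade reads in three steps: exact key, single-model fallback, substring fuzzy match in either direction. Restated as a self-contained function so it can be run in isolation (the function name is mine; the logic mirrors the hunk above):

```python
def match_endpoint_model(model: str, metadata: dict) -> dict | None:
    # 1. Exact match on the configured name
    matched = metadata.get(model)
    if not matched:
        if len(metadata) == 1:
            # 2. Single-model server: the only entry wins regardless of name
            matched = next(iter(metadata.values()))
        else:
            # 3. Substring match in either direction (insertion order decides ties)
            for key, entry in metadata.items():
                if model in key or key in model:
                    matched = entry
                    break
    return matched

meta = {
    "org/llama-3.3-70b-instruct-fp8": {"context_length": 131072},
    "org/qwen-2.5-72b": {"context_length": 32768},
}
print(match_endpoint_model("llama-3.3-70b-instruct", meta))
# {'context_length': 131072}
```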
cli.py (+10)

@@ -1046,6 +1046,14 @@ class HermesCLI:
         _config_model = _model_config.get("default", "") if isinstance(_model_config, dict) else (_model_config or "")
         _FALLBACK_MODEL = "anthropic/claude-opus-4.6"
         self.model = model or _config_model or _FALLBACK_MODEL
+        # Auto-detect model from local server if still on fallback
+        if self.model == _FALLBACK_MODEL:
+            _base_url = _model_config.get("base_url", "") if isinstance(_model_config, dict) else ""
+            if "localhost" in _base_url or "127.0.0.1" in _base_url:
+                from hermes_cli.runtime_provider import _auto_detect_local_model
+                _detected = _auto_detect_local_model(_base_url)
+                if _detected:
+                    self.model = _detected
         # Track whether model was explicitly chosen by the user or fell back
         # to the global default. Provider-specific normalisation may override
         # the default silently but should warn when overriding an explicit choice.

@@ -1251,6 +1259,8 @@ class HermesCLI:
     def _get_status_bar_snapshot(self) -> Dict[str, Any]:
         model_name = self.model or "unknown"
         model_short = model_name.split("/")[-1] if "/" in model_name else model_name
+        if model_short.endswith(".gguf"):
+            model_short = model_short[:-5]
         if len(model_short) > 26:
             model_short = f"{model_short[:23]}..."
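A hedged restatement of the selection order the first hunk implements. `_auto_detect_local_model` is the helper added in runtime_provider.py further down; it is stubbed here so the sketch runs standalone, and the argument values are illustrative:

```python
FALLBACK = "anthropic/claude-opus-4.6"

def _auto_detect_local_model(base_url: str) -> str:
    # Stub for the real helper (defined in runtime_provider.py below).
    return "Qwen3.5-9B-Q4_K_M.gguf"

def resolve_model(cli_arg: str, config_default: str, base_url: str) -> str:
    model = cli_arg or config_default or FALLBACK
    # Auto-detection only fires when the user gave no model at all
    # and the endpoint is clearly local.
    if model == FALLBACK and ("localhost" in base_url or "127.0.0.1" in base_url):
        detected = _auto_detect_local_model(base_url)
        if detected:
            model = detected
    return model

print(resolve_model("", "", "http://localhost:8080/v1"))             # auto-detected
print(resolve_model("org/some-model", "", "http://localhost:8080"))  # explicit wins
```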
@@ -289,6 +289,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
     _hero = HERMES_CADUCEUS
     left_lines = ["", _hero, ""]
     model_short = model.split("/")[-1] if "/" in model else model
+    if model_short.endswith(".gguf"):
+        model_short = model_short[:-5]
     if len(model_short) > 28:
         model_short = model_short[:25] + "..."
     ctx_str = f" [dim {dim}]·[/] [dim {dim}]{_format_context_length(context_length)} context[/]" if context_length else ""
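Both UI call sites (status bar, 26 chars; welcome banner, 28 chars) now strip a trailing `.gguf` before truncating. As a single helper for illustration; the repo keeps the two inline copies, so this function is hypothetical:

```python
def shorten_model_name(name: str, limit: int) -> str:
    short = name.split("/")[-1] if "/" in name else name
    if short.endswith(".gguf"):
        short = short[:-5]  # drop the file extension, len(".gguf") == 5
    if len(short) > limit:
        short = short[:limit - 3] + "..."
    return short

print(shorten_model_name("Qwen3.5-9B-Q4_K_M.gguf", 26))  # Qwen3.5-9B-Q4_K_M
```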
@@ -24,11 +24,41 @@ def _normalize_custom_provider_name(value: str) -> str:
     return value.strip().lower().replace(" ", "-")


+def _auto_detect_local_model(base_url: str) -> str:
+    """Query a local server for its model name when only one model is loaded."""
+    if not base_url:
+        return ""
+    try:
+        import requests
+        url = base_url.rstrip("/")
+        if not url.endswith("/v1"):
+            url += "/v1"
+        resp = requests.get(url + "/models", timeout=5)
+        if resp.ok:
+            models = resp.json().get("data", [])
+            if len(models) == 1:
+                model_id = models[0].get("id", "")
+                if model_id:
+                    return model_id
+    except Exception:
+        pass
+    return ""
+
+
 def _get_model_config() -> Dict[str, Any]:
     config = load_config()
     model_cfg = config.get("model")
     if isinstance(model_cfg, dict):
-        return dict(model_cfg)
+        cfg = dict(model_cfg)
+        default = cfg.get("default", "").strip()
+        base_url = cfg.get("base_url", "").strip()
+        is_local = "localhost" in base_url or "127.0.0.1" in base_url
+        is_fallback = not default or default == "anthropic/claude-opus-4.6"
+        if is_local and is_fallback and base_url:
+            detected = _auto_detect_local_model(base_url)
+            if detected:
+                cfg["default"] = detected
+        return cfg
     if isinstance(model_cfg, str) and model_cfg.strip():
         return {"default": model_cfg.strip()}
     return {}
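What `_auto_detect_local_model` sees on the wire is an OpenAI-style `/v1/models` listing; the payload below is illustrative of a single-model llama.cpp server. Detection deliberately refuses to guess when more than one model is listed:

```python
payload = {
    "data": [
        {"id": "Qwen3.5-9B-Q4_K_M.gguf", "object": "model", "owned_by": "llamacpp"},
    ]
}

models = payload.get("data", [])
if len(models) == 1:                  # the guard used above
    print(models[0].get("id", ""))    # Qwen3.5-9B-Q4_K_M.gguf
else:
    print("")                         # ambiguous: leave the default alone
```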
run_agent.py (+13)

@@ -969,6 +969,18 @@ class AIAgent:
         compression_threshold = float(_compression_cfg.get("threshold", 0.50))
         compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
         compression_summary_model = _compression_cfg.get("summary_model") or None
+
+        # Read explicit context_length override from model config
+        _model_cfg = _agent_cfg.get("model", {})
+        if isinstance(_model_cfg, dict):
+            _config_context_length = _model_cfg.get("context_length")
+        else:
+            _config_context_length = None
+        if _config_context_length is not None:
+            try:
+                _config_context_length = int(_config_context_length)
+            except (TypeError, ValueError):
+                _config_context_length = None

         self.context_compressor = ContextCompressor(
             model=self.model,

@@ -980,6 +992,7 @@ class AIAgent:
             quiet_mode=self.quiet_mode,
             base_url=self.base_url,
             api_key=getattr(self, "api_key", ""),
+            config_context_length=_config_context_length,
         )
         self.compression_enabled = compression_enabled
         self._user_turn_count = 0
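Since YAML users may write `context_length: "131072"` or something non-numeric, the hunk coerces with `int()` and discards failures. The same behavior in isolation, with illustrative inputs:

```python
for raw in (131072, "131072", "128k", None):
    if raw is None:
        value = None  # unset stays unset
    else:
        try:
            value = int(raw)
        except (TypeError, ValueError):
            value = None  # junk values fall back to auto-detection
    print(repr(raw), "->", value)
# 131072 -> 131072, '131072' -> 131072, '128k' -> None, None -> None
```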
@@ -218,6 +218,79 @@ class TestGetModelContextLength:

         assert result == CONTEXT_PROBE_TIERS[0]

+    @patch("agent.model_metadata.fetch_model_metadata")
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    def test_custom_endpoint_single_model_fallback(self, mock_endpoint_fetch, mock_fetch):
+        """Single-model servers: use the only model even if name doesn't match."""
+        mock_fetch.return_value = {}
+        mock_endpoint_fetch.return_value = {
+            "Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072}
+        }
+
+        result = get_model_context_length(
+            "qwen3.5:9b",
+            base_url="http://myserver.example.com:8080/v1",
+            api_key="test-key",
+        )
+
+        assert result == 131072
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    def test_custom_endpoint_fuzzy_substring_match(self, mock_endpoint_fetch, mock_fetch):
+        """Fuzzy match: configured model name is substring of endpoint model."""
+        mock_fetch.return_value = {}
+        mock_endpoint_fetch.return_value = {
+            "org/llama-3.3-70b-instruct-fp8": {"context_length": 131072},
+            "org/qwen-2.5-72b": {"context_length": 32768},
+        }
+
+        result = get_model_context_length(
+            "llama-3.3-70b-instruct",
+            base_url="http://myserver.example.com:8080/v1",
+            api_key="test-key",
+        )
+
+        assert result == 131072
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_overrides_all(self, mock_fetch):
+        """Explicit config_context_length takes priority over everything."""
+        mock_fetch.return_value = {
+            "test/model": {"context_length": 200000}
+        }
+
+        result = get_model_context_length(
+            "test/model",
+            config_context_length=65536,
+        )
+
+        assert result == 65536
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_zero_is_ignored(self, mock_fetch):
+        """config_context_length=0 should be treated as unset."""
+        mock_fetch.return_value = {}
+
+        result = get_model_context_length(
+            "anthropic/claude-sonnet-4",
+            config_context_length=0,
+        )
+
+        assert result == 200000
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_none_is_ignored(self, mock_fetch):
+        """config_context_length=None should be treated as unset."""
+        mock_fetch.return_value = {}
+
+        result = get_model_context_length(
+            "anthropic/claude-sonnet-4",
+            config_context_length=None,
+        )
+
+        assert result == 200000
+

 # =========================================================================
 # fetch_model_metadata — caching, TTL, slugs, failures
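The zero/None tests pass because the override guard in `get_model_context_length` requires a positive `int`; anything else falls through to normal resolution (in these tests, the hardcoded 200,000-token default matched for `anthropic/claude-sonnet-4`). The guard in isolation:

```python
def uses_override(value) -> bool:
    # Mirrors the step-0 check added to get_model_context_length
    return value is not None and isinstance(value, int) and value > 0

for v in (65536, 0, None, "131072"):
    print(repr(v), uses_override(v))
# 65536 True, 0 False, None False, '131072' False
```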
@@ -414,6 +414,29 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo

 ---

+### Context Length Detection
+
+Hermes automatically detects your model's context length by querying the endpoint's `/v1/models` response. For most setups this works out of the box. If detection fails (the model name doesn't match, the endpoint doesn't expose `/v1/models`, etc.), Hermes falls back to a high default and probes downward on context-length errors.
+
+To set the context length explicitly, add `context_length` to your model config:
+
+```yaml
+model:
+  default: "qwen3.5:9b"
+  base_url: "http://localhost:8080/v1"
+  context_length: 131072  # tokens
+```
+
+This takes highest priority — it overrides auto-detection, cached values, and hardcoded defaults.
+
+:::tip When to set this manually
+- Your model shows "2M context" in the status bar (detection failed)
+- You want to limit context below the model's maximum (e.g., 8k on a 128k model to save VRAM)
+- You're running behind a proxy that doesn't expose `/v1/models`
+:::
+
+---
+
 ### Choosing the Right Setup

 | Use Case | Recommended |
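A hedged companion to the VRAM tip in the docs hunk above; the model name and port are illustrative:

```yaml
model:
  default: "org/llama-3.3-70b-instruct"
  base_url: "http://localhost:8000/v1"
  context_length: 8192  # cap well below the model's 131072 maximum to save VRAM
```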