mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 23:11:37 +08:00
Compare commits
11 Commits
update-iss
...
docs/hooks
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a18884a3d0 | ||
|
|
29d3f1216b | ||
|
|
fe37a53b75 | ||
|
|
b6ef1deafd | ||
|
|
0f3c191ef1 | ||
|
|
7cdf4efe05 | ||
|
|
adee8d1b5f | ||
|
|
f5b84dddfd | ||
|
|
4549a2f51a | ||
|
|
466720c2f3 | ||
|
|
fccd7a2ab4 |
@@ -40,7 +40,7 @@ _MIN_SUMMARY_TOKENS = 2000
|
|||||||
# Proportion of compressed content to allocate for summary
|
# Proportion of compressed content to allocate for summary
|
||||||
_SUMMARY_RATIO = 0.20
|
_SUMMARY_RATIO = 0.20
|
||||||
# Absolute ceiling for summary tokens (even on very large context windows)
|
# Absolute ceiling for summary tokens (even on very large context windows)
|
||||||
_SUMMARY_TOKENS_CEILING = 32_000
|
_SUMMARY_TOKENS_CEILING = 12_000
|
||||||
|
|
||||||
# Placeholder used when pruning old tool results
|
# Placeholder used when pruning old tool results
|
||||||
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||||
@@ -63,10 +63,10 @@ class ContextCompressor:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model: str,
|
model: str,
|
||||||
threshold_percent: float = 0.80,
|
threshold_percent: float = 0.50,
|
||||||
protect_first_n: int = 3,
|
protect_first_n: int = 3,
|
||||||
protect_last_n: int = 20,
|
protect_last_n: int = 20,
|
||||||
summary_target_ratio: float = 0.40,
|
summary_target_ratio: float = 0.20,
|
||||||
quiet_mode: bool = False,
|
quiet_mode: bool = False,
|
||||||
summary_model_override: str = None,
|
summary_model_override: str = None,
|
||||||
base_url: str = "",
|
base_url: str = "",
|
||||||
@@ -92,8 +92,8 @@ class ContextCompressor:
|
|||||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||||
self.compression_count = 0
|
self.compression_count = 0
|
||||||
|
|
||||||
# Derive token budgets from the target ratio and context length
|
# Derive token budgets: ratio is relative to the threshold, not total context
|
||||||
target_tokens = int(self.context_length * self.summary_target_ratio)
|
target_tokens = int(self.threshold_tokens * self.summary_target_ratio)
|
||||||
self.tail_token_budget = target_tokens
|
self.tail_token_budget = target_tokens
|
||||||
self.max_summary_tokens = min(
|
self.max_summary_tokens = min(
|
||||||
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
|
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
|
||||||
|
|||||||
@@ -236,23 +236,24 @@ browser:
|
|||||||
# 5. Summarizes middle turns using a fast/cheap model
|
# 5. Summarizes middle turns using a fast/cheap model
|
||||||
# 6. Inserts summary as a user message, continues conversation seamlessly
|
# 6. Inserts summary as a user message, continues conversation seamlessly
|
||||||
#
|
#
|
||||||
# Post-compression size scales with the model's context window via target_ratio:
|
# Post-compression tail budget is target_ratio × threshold × context_length:
|
||||||
# MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
|
# 200K context, threshold 0.50, ratio 0.20 → 20K tokens of recent tail preserved
|
||||||
# GPT-5 1M context → ~400K post-compression (at 0.40 ratio)
|
# 1M context, threshold 0.50, ratio 0.20 → 100K tokens of recent tail preserved
|
||||||
#
|
#
|
||||||
compression:
|
compression:
|
||||||
# Enable automatic context compression (default: true)
|
# Enable automatic context compression (default: true)
|
||||||
# Set to false if you prefer to manage context manually or want errors on overflow
|
# Set to false if you prefer to manage context manually or want errors on overflow
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
# Trigger compression at this % of model's context limit (default: 0.80 = 80%)
|
# Trigger compression at this fraction of the model's context limit (default: 0.50 = 50%)
|
||||||
# Lower values = more aggressive compression, higher values = compress later
|
# Lower values = more aggressive compression, higher values = compress later
|
||||||
threshold: 0.80
|
threshold: 0.50
|
||||||
|
|
||||||
# Target post-compression size as a fraction of context window (default: 0.40 = 40%)
|
# Fraction of the threshold to preserve as recent tail (default: 0.20 = 20%)
|
||||||
# Controls how much context survives compression. Tail token budget and summary
|
# e.g. 20% of 50% threshold = 10% of total context kept as recent messages.
|
||||||
# cap scale with this value. Range: 0.10 - 0.80
|
# Summary output is separately capped at 12K tokens (Gemini output limit).
|
||||||
target_ratio: 0.40
|
# Range: 0.10 - 0.80
|
||||||
|
target_ratio: 0.20
|
||||||
|
|
||||||
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
|
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
|
||||||
# Higher values keep more recent conversation intact at the cost of more aggressive
|
# Higher values keep more recent conversation intact at the cost of more aggressive
|
||||||
|
|||||||
6
cli.py
6
cli.py
@@ -1509,10 +1509,14 @@ class HermesCLI:
|
|||||||
|
|
||||||
self._reasoning_buf = getattr(self, "_reasoning_buf", "") + text
|
self._reasoning_buf = getattr(self, "_reasoning_buf", "") + text
|
||||||
|
|
||||||
# Emit complete lines
|
# Emit complete lines, and force-flush long partial lines so
|
||||||
|
# reasoning is visible in real-time even without newlines.
|
||||||
while "\n" in self._reasoning_buf:
|
while "\n" in self._reasoning_buf:
|
||||||
line, self._reasoning_buf = self._reasoning_buf.split("\n", 1)
|
line, self._reasoning_buf = self._reasoning_buf.split("\n", 1)
|
||||||
_cprint(f"{_DIM}{line}{_RST}")
|
_cprint(f"{_DIM}{line}{_RST}")
|
||||||
|
if len(self._reasoning_buf) > 80:
|
||||||
|
_cprint(f"{_DIM}{self._reasoning_buf}{_RST}")
|
||||||
|
self._reasoning_buf = ""
|
||||||
|
|
||||||
def _close_reasoning_box(self) -> None:
|
def _close_reasoning_box(self) -> None:
|
||||||
"""Close the live reasoning box if it's open."""
|
"""Close the live reasoning box if it's open."""
|
||||||
|
|||||||
@@ -163,8 +163,8 @@ DEFAULT_CONFIG = {
|
|||||||
|
|
||||||
"compression": {
|
"compression": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"threshold": 0.80, # compress when context usage exceeds this ratio
|
"threshold": 0.50, # compress when context usage exceeds this ratio
|
||||||
"target_ratio": 0.40, # fraction of context to preserve as recent tail
|
"target_ratio": 0.20, # fraction of threshold to preserve as recent tail
|
||||||
"protect_last_n": 20, # minimum recent messages to keep uncompressed
|
"protect_last_n": 20, # minimum recent messages to keep uncompressed
|
||||||
"summary_model": "", # empty = use main configured model
|
"summary_model": "", # empty = use main configured model
|
||||||
"summary_provider": "auto",
|
"summary_provider": "auto",
|
||||||
@@ -1686,8 +1686,8 @@ def show_config():
|
|||||||
enabled = compression.get('enabled', True)
|
enabled = compression.get('enabled', True)
|
||||||
print(f" Enabled: {'yes' if enabled else 'no'}")
|
print(f" Enabled: {'yes' if enabled else 'no'}")
|
||||||
if enabled:
|
if enabled:
|
||||||
print(f" Threshold: {compression.get('threshold', 0.80) * 100:.0f}%")
|
print(f" Threshold: {compression.get('threshold', 0.50) * 100:.0f}%")
|
||||||
print(f" Target ratio: {compression.get('target_ratio', 0.40) * 100:.0f}% of context preserved")
|
print(f" Target ratio: {compression.get('target_ratio', 0.20) * 100:.0f}% of threshold preserved")
|
||||||
print(f" Protect last: {compression.get('protect_last_n', 20)} messages")
|
print(f" Protect last: {compression.get('protect_last_n', 20)} messages")
|
||||||
_sm = compression.get('summary_model', '') or '(main model)'
|
_sm = compression.get('summary_model', '') or '(main model)'
|
||||||
print(f" Model: {_sm}")
|
print(f" Model: {_sm}")
|
||||||
|
|||||||
118
run_agent.py
118
run_agent.py
@@ -1009,10 +1009,10 @@ class AIAgent:
|
|||||||
_compression_cfg = _agent_cfg.get("compression", {})
|
_compression_cfg = _agent_cfg.get("compression", {})
|
||||||
if not isinstance(_compression_cfg, dict):
|
if not isinstance(_compression_cfg, dict):
|
||||||
_compression_cfg = {}
|
_compression_cfg = {}
|
||||||
compression_threshold = float(_compression_cfg.get("threshold", 0.80))
|
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
|
||||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
||||||
compression_summary_model = _compression_cfg.get("summary_model") or None
|
compression_summary_model = _compression_cfg.get("summary_model") or None
|
||||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
|
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20))
|
||||||
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
||||||
|
|
||||||
# Read explicit context_length override from model config
|
# Read explicit context_length override from model config
|
||||||
@@ -3585,7 +3585,20 @@ class AIAgent:
|
|||||||
|
|
||||||
def _call_chat_completions():
|
def _call_chat_completions():
|
||||||
"""Stream a chat completions response."""
|
"""Stream a chat completions response."""
|
||||||
stream_kwargs = {**api_kwargs, "stream": True, "stream_options": {"include_usage": True}}
|
import httpx as _httpx
|
||||||
|
_base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 900.0))
|
||||||
|
_stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 60.0))
|
||||||
|
stream_kwargs = {
|
||||||
|
**api_kwargs,
|
||||||
|
"stream": True,
|
||||||
|
"stream_options": {"include_usage": True},
|
||||||
|
"timeout": _httpx.Timeout(
|
||||||
|
connect=30.0,
|
||||||
|
read=_stream_read_timeout,
|
||||||
|
write=_base_timeout,
|
||||||
|
pool=30.0,
|
||||||
|
),
|
||||||
|
}
|
||||||
request_client_holder["client"] = self._create_request_openai_client(
|
request_client_holder["client"] = self._create_request_openai_client(
|
||||||
reason="chat_completion_stream_request"
|
reason="chat_completion_stream_request"
|
||||||
)
|
)
|
||||||
@@ -3653,6 +3666,7 @@ class AIAgent:
|
|||||||
name = entry["function"]["name"]
|
name = entry["function"]["name"]
|
||||||
if name and idx not in tool_gen_notified:
|
if name and idx not in tool_gen_notified:
|
||||||
tool_gen_notified.add(idx)
|
tool_gen_notified.add(idx)
|
||||||
|
_fire_first_delta()
|
||||||
self._fire_tool_gen_started(name)
|
self._fire_tool_gen_started(name)
|
||||||
|
|
||||||
if chunk.choices[0].finish_reason:
|
if chunk.choices[0].finish_reason:
|
||||||
@@ -3721,6 +3735,7 @@ class AIAgent:
|
|||||||
has_tool_use = True
|
has_tool_use = True
|
||||||
tool_name = getattr(block, "name", None)
|
tool_name = getattr(block, "name", None)
|
||||||
if tool_name:
|
if tool_name:
|
||||||
|
_fire_first_delta()
|
||||||
self._fire_tool_gen_started(tool_name)
|
self._fire_tool_gen_started(tool_name)
|
||||||
|
|
||||||
elif event_type == "content_block_delta":
|
elif event_type == "content_block_delta":
|
||||||
@@ -3742,29 +3757,84 @@ class AIAgent:
|
|||||||
return stream.get_final_message()
|
return stream.get_final_message()
|
||||||
|
|
||||||
def _call():
|
def _call():
|
||||||
|
import httpx as _httpx
|
||||||
|
|
||||||
|
_max_stream_retries = int(os.getenv("HERMES_STREAM_RETRIES", 2))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.api_mode == "anthropic_messages":
|
for _stream_attempt in range(_max_stream_retries + 1):
|
||||||
self._try_refresh_anthropic_client_credentials()
|
|
||||||
result["response"] = _call_anthropic()
|
|
||||||
else:
|
|
||||||
result["response"] = _call_chat_completions()
|
|
||||||
except Exception as e:
|
|
||||||
if deltas_were_sent["yes"]:
|
|
||||||
# Streaming failed AFTER some tokens were already delivered
|
|
||||||
# to consumers. Don't fall back — that would cause
|
|
||||||
# double-delivery (partial streamed + full non-streamed).
|
|
||||||
# Let the error propagate; the partial content already
|
|
||||||
# reached the user via the stream.
|
|
||||||
logger.warning("Streaming failed after partial delivery, not falling back: %s", e)
|
|
||||||
result["error"] = e
|
|
||||||
else:
|
|
||||||
# Streaming failed before any tokens reached consumers.
|
|
||||||
# Safe to fall back to the standard non-streaming path.
|
|
||||||
logger.info("Streaming failed before delivery, falling back to non-streaming: %s", e)
|
|
||||||
try:
|
try:
|
||||||
result["response"] = self._interruptible_api_call(api_kwargs)
|
if self.api_mode == "anthropic_messages":
|
||||||
except Exception as fallback_err:
|
self._try_refresh_anthropic_client_credentials()
|
||||||
result["error"] = fallback_err
|
result["response"] = _call_anthropic()
|
||||||
|
else:
|
||||||
|
result["response"] = _call_chat_completions()
|
||||||
|
return # success
|
||||||
|
except Exception as e:
|
||||||
|
if deltas_were_sent["yes"]:
|
||||||
|
# Streaming failed AFTER some tokens were already
|
||||||
|
# delivered. Don't retry or fall back — partial
|
||||||
|
# content already reached the user.
|
||||||
|
logger.warning(
|
||||||
|
"Streaming failed after partial delivery, not retrying: %s", e
|
||||||
|
)
|
||||||
|
result["error"] = e
|
||||||
|
return
|
||||||
|
|
||||||
|
_is_timeout = isinstance(
|
||||||
|
e, (_httpx.ReadTimeout, _httpx.ConnectTimeout, _httpx.PoolTimeout)
|
||||||
|
)
|
||||||
|
_is_conn_err = isinstance(
|
||||||
|
e, (_httpx.ConnectError, _httpx.RemoteProtocolError, ConnectionError)
|
||||||
|
)
|
||||||
|
|
||||||
|
if _is_timeout or _is_conn_err:
|
||||||
|
# Transient network / timeout error. Retry the
|
||||||
|
# streaming request with a fresh connection rather
|
||||||
|
# than falling back to non-streaming (which would
|
||||||
|
# hang for up to 15 min on the same dead server).
|
||||||
|
if _stream_attempt < _max_stream_retries:
|
||||||
|
logger.info(
|
||||||
|
"Streaming attempt %s/%s failed (%s: %s), "
|
||||||
|
"retrying with fresh connection...",
|
||||||
|
_stream_attempt + 1,
|
||||||
|
_max_stream_retries + 1,
|
||||||
|
type(e).__name__,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
# Close the stale request client before retry
|
||||||
|
stale = request_client_holder.get("client")
|
||||||
|
if stale is not None:
|
||||||
|
self._close_request_openai_client(
|
||||||
|
stale, reason="stream_retry_cleanup"
|
||||||
|
)
|
||||||
|
request_client_holder["client"] = None
|
||||||
|
continue
|
||||||
|
# Exhausted retries — propagate to outer loop
|
||||||
|
logger.warning(
|
||||||
|
"Streaming exhausted %s retries on transient error: %s",
|
||||||
|
_max_stream_retries + 1,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
result["error"] = e
|
||||||
|
return
|
||||||
|
|
||||||
|
# Non-transient error (e.g. "streaming not supported",
|
||||||
|
# auth error, 4xx). Fall back to non-streaming once, but only when the error says streaming is not supported; anything else is propagated.
|
||||||
|
err_msg = str(e).lower()
|
||||||
|
if "stream" in err_msg and "not supported" in err_msg:
|
||||||
|
logger.info(
|
||||||
|
"Streaming not supported, falling back to non-streaming: %s", e
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
result["response"] = self._interruptible_api_call(api_kwargs)
|
||||||
|
except Exception as fallback_err:
|
||||||
|
result["error"] = fallback_err
|
||||||
|
return
|
||||||
|
|
||||||
|
# Unknown error — propagate to outer retry loop
|
||||||
|
result["error"] = e
|
||||||
|
return
|
||||||
finally:
|
finally:
|
||||||
request_client = request_client_holder.get("client")
|
request_client = request_client_holder.get("client")
|
||||||
if request_client is not None:
|
if request_client is not None:
|
||||||
|
|||||||
@@ -519,24 +519,26 @@ class TestSummaryTargetRatio:
|
|||||||
"""Verify that summary_target_ratio properly scales budgets with context window."""
|
"""Verify that summary_target_ratio properly scales budgets with context window."""
|
||||||
|
|
||||||
def test_tail_budget_scales_with_context(self):
|
def test_tail_budget_scales_with_context(self):
|
||||||
"""Tail token budget should be context_length * summary_target_ratio."""
|
"""Tail token budget should be threshold_tokens * summary_target_ratio."""
|
||||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||||
assert c.tail_token_budget == 80_000
|
# 200K * 0.50 threshold * 0.40 ratio = 40K
|
||||||
|
assert c.tail_token_budget == 40_000
|
||||||
|
|
||||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||||
assert c.tail_token_budget == 400_000
|
# 1M * 0.50 threshold * 0.40 ratio = 200K
|
||||||
|
assert c.tail_token_budget == 200_000
|
||||||
|
|
||||||
def test_summary_cap_scales_with_context(self):
|
def test_summary_cap_scales_with_context(self):
|
||||||
"""Max summary tokens should be 5% of context, capped at 32K."""
|
"""Max summary tokens should be 5% of context, capped at 12K."""
|
||||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||||
c = ContextCompressor(model="test", quiet_mode=True)
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
assert c.max_summary_tokens == 10_000 # 200K * 0.05
|
assert c.max_summary_tokens == 10_000 # 200K * 0.05
|
||||||
|
|
||||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||||
c = ContextCompressor(model="test", quiet_mode=True)
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
assert c.max_summary_tokens == 32_000 # capped at ceiling
|
assert c.max_summary_tokens == 12_000 # capped at 12K ceiling
|
||||||
|
|
||||||
def test_ratio_clamped(self):
|
def test_ratio_clamped(self):
|
||||||
"""Ratio should be clamped to [0.10, 0.80]."""
|
"""Ratio should be clamped to [0.10, 0.80]."""
|
||||||
@@ -548,12 +550,12 @@ class TestSummaryTargetRatio:
|
|||||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95)
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95)
|
||||||
assert c.summary_target_ratio == 0.80
|
assert c.summary_target_ratio == 0.80
|
||||||
|
|
||||||
def test_default_threshold_is_80_percent(self):
|
def test_default_threshold_is_50_percent(self):
|
||||||
"""Default compression threshold should be 80%."""
|
"""Default compression threshold should be 50%."""
|
||||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||||
c = ContextCompressor(model="test", quiet_mode=True)
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
assert c.threshold_percent == 0.80
|
assert c.threshold_percent == 0.50
|
||||||
assert c.threshold_tokens == 80_000
|
assert c.threshold_tokens == 50_000
|
||||||
|
|
||||||
def test_default_protect_last_n_is_20(self):
|
def test_default_protect_last_n_is_20(self):
|
||||||
"""Default protect_last_n should be 20."""
|
"""Default protect_last_n should be 20."""
|
||||||
|
|||||||
@@ -1567,6 +1567,20 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||||||
vision_model = _get_vision_model()
|
vision_model = _get_vision_model()
|
||||||
logger.debug("browser_vision: analysing screenshot (%d bytes)",
|
logger.debug("browser_vision: analysing screenshot (%d bytes)",
|
||||||
len(image_data))
|
len(image_data))
|
||||||
|
|
||||||
|
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
|
||||||
|
# Local vision models (llama.cpp, ollama) can take well over 30s for
|
||||||
|
# screenshot analysis, so the default must be generous.
|
||||||
|
vision_timeout = 120.0
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
_cfg = load_config()
|
||||||
|
_vt = _cfg.get("auxiliary", {}).get("vision", {}).get("timeout")
|
||||||
|
if _vt is not None:
|
||||||
|
vision_timeout = float(_vt)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
call_kwargs = {
|
call_kwargs = {
|
||||||
"task": "vision",
|
"task": "vision",
|
||||||
"messages": [
|
"messages": [
|
||||||
@@ -1580,6 +1594,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||||||
],
|
],
|
||||||
"max_tokens": 2000,
|
"max_tokens": 2000,
|
||||||
"temperature": 0.1,
|
"temperature": 0.1,
|
||||||
|
"timeout": vision_timeout,
|
||||||
}
|
}
|
||||||
if vision_model:
|
if vision_model:
|
||||||
call_kwargs["model"] = vision_model
|
call_kwargs["model"] = vision_model
|
||||||
|
|||||||
@@ -179,6 +179,58 @@ async def _summarize_session(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _list_recent_sessions(db, limit: int, current_session_id: str = None) -> str:
    """Return metadata for the most recent sessions as a JSON string (no LLM calls).

    Args:
        db: Session store. Must provide ``list_sessions_rich(limit=...)``
            returning an iterable of dict-like rows, and
            ``get_session(session_id)`` returning a dict-like row or ``None``.
        limit: Maximum number of sessions to include in ``results``.
        current_session_id: ID of the active session. That session and every
            ancestor reachable through ``parent_session_id`` are excluded.

    Returns:
        JSON text. On success it carries ``success`` (True), ``mode``
        ("recent"), ``results``, ``count`` and ``message``; on failure it
        carries ``success`` (False) and ``error``.
    """
    try:
        sessions = db.list_sessions_rich(limit=limit + 5)  # fetch extra to skip current

        # Walk parent links from the current session up to its root and
        # remember every ID on the way. Excluding the whole lineage (rather
        # than guessing which visited ID is the root) guarantees the root of
        # the active conversation never shows up as a "recent" session.
        lineage = set()
        if current_session_id:
            try:
                sid = current_session_id
                # The membership test also protects against cyclic parent links.
                while sid and sid not in lineage:
                    lineage.add(sid)
                    session = db.get_session(sid)
                    sid = session.get("parent_session_id") if session else None
            except Exception:
                # Best effort: a failed lookup must not break the listing.
                logging.debug(
                    "Could not fully resolve lineage for session %s",
                    current_session_id,
                    exc_info=True,
                )
            # Always exclude the current session, even if the walk failed early.
            lineage.add(current_session_id)

        results = []
        for s in sessions:
            # Check the cap before appending so limit <= 0 yields no results.
            if len(results) >= limit:
                break
            sid = s.get("id", "")
            if sid in lineage:
                continue
            # Skip child/delegation sessions (they have parent_session_id)
            if s.get("parent_session_id"):
                continue
            results.append({
                "session_id": sid,
                "title": s.get("title") or None,
                "source": s.get("source", ""),
                "started_at": s.get("started_at", ""),
                "last_active": s.get("last_active", ""),
                "message_count": s.get("message_count", 0),
                "preview": s.get("preview", ""),
            })

        return json.dumps({
            "success": True,
            "mode": "recent",
            "results": results,
            "count": len(results),
            "message": f"Showing {len(results)} most recent sessions. Use a keyword query to search specific topics.",
        }, ensure_ascii=False)
    except Exception as e:
        logging.error("Error listing recent sessions: %s", e, exc_info=True)
        return json.dumps({"success": False, "error": f"Failed to list recent sessions: {e}"}, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def session_search(
|
def session_search(
|
||||||
query: str,
|
query: str,
|
||||||
role_filter: str = None,
|
role_filter: str = None,
|
||||||
@@ -195,11 +247,14 @@ def session_search(
|
|||||||
if db is None:
|
if db is None:
|
||||||
return json.dumps({"success": False, "error": "Session database not available."}, ensure_ascii=False)
|
return json.dumps({"success": False, "error": "Session database not available."}, ensure_ascii=False)
|
||||||
|
|
||||||
|
limit = min(limit, 5) # Cap at 5 sessions to avoid excessive LLM calls
|
||||||
|
|
||||||
|
# Recent sessions mode: when query is empty, return metadata for recent sessions.
|
||||||
|
# No LLM calls — just DB queries for titles, previews, timestamps.
|
||||||
if not query or not query.strip():
|
if not query or not query.strip():
|
||||||
return json.dumps({"success": False, "error": "Query cannot be empty."}, ensure_ascii=False)
|
return _list_recent_sessions(db, limit, current_session_id)
|
||||||
|
|
||||||
query = query.strip()
|
query = query.strip()
|
||||||
limit = min(limit, 5) # Cap at 5 sessions to avoid excessive LLM calls
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Parse role filter
|
# Parse role filter
|
||||||
@@ -364,8 +419,14 @@ def check_session_search_requirements() -> bool:
|
|||||||
SESSION_SEARCH_SCHEMA = {
|
SESSION_SEARCH_SCHEMA = {
|
||||||
"name": "session_search",
|
"name": "session_search",
|
||||||
"description": (
|
"description": (
|
||||||
"Search your long-term memory of past conversations. This is your recall -- "
|
"Search your long-term memory of past conversations, or browse recent sessions. This is your recall -- "
|
||||||
"every past session is searchable, and this tool summarizes what happened.\n\n"
|
"every past session is searchable, and this tool summarizes what happened.\n\n"
|
||||||
|
"TWO MODES:\n"
|
||||||
|
"1. Recent sessions (no query): Call with no arguments to see what was worked on recently. "
|
||||||
|
"Returns titles, previews, and timestamps. Zero LLM cost, instant. "
|
||||||
|
"Start here when the user asks what were we working on or what did we do recently.\n"
|
||||||
|
"2. Keyword search (with query): Search for specific topics across all past sessions. "
|
||||||
|
"Returns LLM-generated summaries of matching sessions.\n\n"
|
||||||
"USE THIS PROACTIVELY when:\n"
|
"USE THIS PROACTIVELY when:\n"
|
||||||
"- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
|
"- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
|
||||||
"- The user asks about a topic you worked on before but don't have in current context\n"
|
"- The user asks about a topic you worked on before but don't have in current context\n"
|
||||||
@@ -385,7 +446,7 @@ SESSION_SEARCH_SCHEMA = {
|
|||||||
"properties": {
|
"properties": {
|
||||||
"query": {
|
"query": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Search query — keywords, phrases, or boolean expressions to find in past sessions.",
|
"description": "Search query — keywords, phrases, or boolean expressions to find in past sessions. Omit this parameter entirely to browse recent sessions instead (returns titles, previews, timestamps with no LLM cost).",
|
||||||
},
|
},
|
||||||
"role_filter": {
|
"role_filter": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
@@ -397,7 +458,7 @@ SESSION_SEARCH_SCHEMA = {
|
|||||||
"default": 3,
|
"default": 3,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"required": ["query"],
|
"required": [],
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -410,7 +471,7 @@ registry.register(
|
|||||||
toolset="session_search",
|
toolset="session_search",
|
||||||
schema=SESSION_SEARCH_SCHEMA,
|
schema=SESSION_SEARCH_SCHEMA,
|
||||||
handler=lambda args, **kw: session_search(
|
handler=lambda args, **kw: session_search(
|
||||||
query=args.get("query", ""),
|
query=args.get("query") or "",
|
||||||
role_filter=args.get("role_filter"),
|
role_filter=args.get("role_filter"),
|
||||||
limit=args.get("limit", 3),
|
limit=args.get("limit", 3),
|
||||||
db=kw.get("db"),
|
db=kw.get("db"),
|
||||||
|
|||||||
@@ -1050,6 +1050,9 @@ def _get_configured_model() -> str:
|
|||||||
|
|
||||||
def _resolve_trust_level(source: str) -> str:
|
def _resolve_trust_level(source: str) -> str:
|
||||||
"""Map a source identifier to a trust level."""
|
"""Map a source identifier to a trust level."""
|
||||||
|
# Agent-created skills get their own permissive trust level
|
||||||
|
if source == "agent-created":
|
||||||
|
return "agent-created"
|
||||||
# Official optional skills shipped with the repo
|
# Official optional skills shipped with the repo
|
||||||
if source.startswith("official/") or source == "official":
|
if source.startswith("official/") or source == "official":
|
||||||
return "builtin"
|
return "builtin"
|
||||||
|
|||||||
@@ -325,8 +325,9 @@ async def vision_analyze_tool(
|
|||||||
logger.info("Processing image with vision model...")
|
logger.info("Processing image with vision model...")
|
||||||
|
|
||||||
# Call the vision API via centralized router.
|
# Call the vision API via centralized router.
|
||||||
# Read timeout from config.yaml (auxiliary.vision.timeout), default 30s.
|
# Read timeout from config.yaml (auxiliary.vision.timeout), default 120s.
|
||||||
vision_timeout = 30.0
|
# Local vision models (llama.cpp, ollama) can take well over 30s.
|
||||||
|
vision_timeout = 120.0
|
||||||
try:
|
try:
|
||||||
from hermes_cli.config import load_config
|
from hermes_cli.config import load_config
|
||||||
_cfg = load_config()
|
_cfg = load_config()
|
||||||
|
|||||||
@@ -6,9 +6,20 @@ description: "Run custom code at key lifecycle points — log activity, send ale
|
|||||||
|
|
||||||
# Event Hooks
|
# Event Hooks
|
||||||
|
|
||||||
The hooks system lets you run custom code at key points in the agent lifecycle — session creation, slash commands, each tool-calling step, and more. Hooks fire automatically during gateway operation without blocking the main agent pipeline.
|
Hermes has two hook systems that run custom code at key lifecycle points:
|
||||||
|
|
||||||
## Creating a Hook
|
| System | Registered via | Runs in | Use case |
|
||||||
|
|--------|---------------|---------|----------|
|
||||||
|
| **[Gateway hooks](#gateway-event-hooks)** | `HOOK.yaml` + `handler.py` in `~/.hermes/hooks/` | Gateway only | Logging, alerts, webhooks |
|
||||||
|
| **[Plugin hooks](#plugin-hooks)** | `ctx.register_hook()` in a [plugin](/docs/user-guide/features/plugins) | CLI + Gateway | Tool interception, metrics, guardrails |
|
||||||
|
|
||||||
|
Both systems are non-blocking — errors in any hook are caught and logged, never crashing the agent.
|
||||||
|
|
||||||
|
## Gateway Event Hooks
|
||||||
|
|
||||||
|
Gateway hooks fire automatically during gateway operation (Telegram, Discord, Slack, WhatsApp) without blocking the main agent pipeline.
|
||||||
|
|
||||||
|
### Creating a Hook
|
||||||
|
|
||||||
Each hook is a directory under `~/.hermes/hooks/` containing two files:
|
Each hook is a directory under `~/.hermes/hooks/` containing two files:
|
||||||
|
|
||||||
@@ -19,7 +30,7 @@ Each hook is a directory under `~/.hermes/hooks/` containing two files:
|
|||||||
└── handler.py # Python handler function
|
└── handler.py # Python handler function
|
||||||
```
|
```
|
||||||
|
|
||||||
### HOOK.yaml
|
#### HOOK.yaml
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
name: my-hook
|
name: my-hook
|
||||||
@@ -32,7 +43,7 @@ events:
|
|||||||
|
|
||||||
The `events` list determines which events trigger your handler. You can subscribe to any combination of events, including wildcards like `command:*`.
|
The `events` list determines which events trigger your handler. You can subscribe to any combination of events, including wildcards like `command:*`.
|
||||||
|
|
||||||
### handler.py
|
#### handler.py
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import json
|
import json
|
||||||
@@ -58,25 +69,26 @@ async def handle(event_type: str, context: dict):
|
|||||||
- Can be `async def` or regular `def` — both work
|
- Can be `async def` or regular `def` — both work
|
||||||
- Errors are caught and logged, never crashing the agent
|
- Errors are caught and logged, never crashing the agent
|
||||||
|
|
||||||
## Available Events
|
### Available Events
|
||||||
|
|
||||||
| Event | When it fires | Context keys |
|
| Event | When it fires | Context keys |
|
||||||
|-------|---------------|--------------|
|
|-------|---------------|--------------|
|
||||||
| `gateway:startup` | Gateway process starts | `platforms` (list of active platform names) |
|
| `gateway:startup` | Gateway process starts | `platforms` (list of active platform names) |
|
||||||
| `session:start` | New messaging session created | `platform`, `user_id`, `session_id`, `session_key` |
|
| `session:start` | New messaging session created | `platform`, `user_id`, `session_id`, `session_key` |
|
||||||
|
| `session:end` | Session ended (before reset) | `platform`, `user_id`, `session_key` |
|
||||||
| `session:reset` | User ran `/new` or `/reset` | `platform`, `user_id`, `session_key` |
|
| `session:reset` | User ran `/new` or `/reset` | `platform`, `user_id`, `session_key` |
|
||||||
| `agent:start` | Agent begins processing a message | `platform`, `user_id`, `session_id`, `message` |
|
| `agent:start` | Agent begins processing a message | `platform`, `user_id`, `session_id`, `message` |
|
||||||
| `agent:step` | Each iteration of the tool-calling loop | `platform`, `user_id`, `session_id`, `iteration`, `tool_names` |
|
| `agent:step` | Each iteration of the tool-calling loop | `platform`, `user_id`, `session_id`, `iteration`, `tool_names` |
|
||||||
| `agent:end` | Agent finishes processing | `platform`, `user_id`, `session_id`, `message`, `response` |
|
| `agent:end` | Agent finishes processing | `platform`, `user_id`, `session_id`, `message`, `response` |
|
||||||
| `command:*` | Any slash command executed | `platform`, `user_id`, `command`, `args` |
|
| `command:*` | Any slash command executed | `platform`, `user_id`, `command`, `args` |
|
||||||
|
|
||||||
### Wildcard Matching
|
#### Wildcard Matching
|
||||||
|
|
||||||
Handlers registered for `command:*` fire for any `command:` event (`command:model`, `command:reset`, etc.). Monitor all slash commands with a single subscription.
|
Handlers registered for `command:*` fire for any `command:` event (`command:model`, `command:reset`, etc.). Monitor all slash commands with a single subscription.
|
||||||
|
|
||||||
## Examples
|
### Examples
|
||||||
|
|
||||||
### Telegram Alert on Long Tasks
|
#### Telegram Alert on Long Tasks
|
||||||
|
|
||||||
Send yourself a message when the agent takes more than 10 steps:
|
Send yourself a message when the agent takes more than 10 steps:
|
||||||
|
|
||||||
@@ -109,7 +121,7 @@ async def handle(event_type: str, context: dict):
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Command Usage Logger
|
#### Command Usage Logger
|
||||||
|
|
||||||
Track which slash commands are used:
|
Track which slash commands are used:
|
||||||
|
|
||||||
@@ -142,7 +154,7 @@ def handle(event_type: str, context: dict):
|
|||||||
f.write(json.dumps(entry) + "\n")
|
f.write(json.dumps(entry) + "\n")
|
||||||
```
|
```
|
||||||
|
|
||||||
### Session Start Webhook
|
#### Session Start Webhook
|
||||||
|
|
||||||
POST to an external service on new sessions:
|
POST to an external service on new sessions:
|
||||||
|
|
||||||
@@ -169,7 +181,7 @@ async def handle(event_type: str, context: dict):
|
|||||||
}, timeout=5)
|
}, timeout=5)
|
||||||
```
|
```
|
||||||
|
|
||||||
## How It Works
|
### How It Works
|
||||||
|
|
||||||
1. On gateway startup, `HookRegistry.discover_and_load()` scans `~/.hermes/hooks/`
|
1. On gateway startup, `HookRegistry.discover_and_load()` scans `~/.hermes/hooks/`
|
||||||
2. Each subdirectory with `HOOK.yaml` + `handler.py` is loaded dynamically
|
2. Each subdirectory with `HOOK.yaml` + `handler.py` is loaded dynamically
|
||||||
@@ -178,5 +190,51 @@ async def handle(event_type: str, context: dict):
|
|||||||
5. Errors in any handler are caught and logged — a broken hook never crashes the agent
|
5. Errors in any handler are caught and logged — a broken hook never crashes the agent
|
||||||
|
|
||||||
:::info
|
:::info
|
||||||
Hooks only fire in the **gateway** (Telegram, Discord, Slack, WhatsApp). The CLI does not currently load hooks.
|
Gateway hooks fire only in the **gateway** (Telegram, Discord, Slack, WhatsApp). The CLI does not load gateway hooks. For hooks that work everywhere, use [plugin hooks](#plugin-hooks).
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
## Plugin Hooks
|
||||||
|
|
||||||
|
[Plugins](/docs/user-guide/features/plugins) can register hooks that fire in **both CLI and gateway** sessions. These are registered programmatically via `ctx.register_hook()` in your plugin's `register()` function.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def register(ctx):
|
||||||
|
ctx.register_hook("pre_tool_call", my_callback)
|
||||||
|
ctx.register_hook("post_tool_call", my_callback)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Available Plugin Hooks
|
||||||
|
|
||||||
|
| Hook | Fires when | Callback receives |
|
||||||
|
|------|-----------|-------------------|
|
||||||
|
| `pre_tool_call` | Before any tool executes | `tool_name`, `args`, `task_id` |
|
||||||
|
| `post_tool_call` | After any tool returns | `tool_name`, `args`, `result`, `task_id` |
|
||||||
|
| `pre_llm_call` | Before LLM API request | *(planned — not yet wired)* |
|
||||||
|
| `post_llm_call` | After LLM API response | *(planned — not yet wired)* |
|
||||||
|
| `on_session_start` | Session begins | *(planned — not yet wired)* |
|
||||||
|
| `on_session_end` | Session ends | *(planned — not yet wired)* |
|
||||||
|
|
||||||
|
Callbacks receive the keyword arguments listed in the "Callback receives" column above:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def my_callback(**kwargs):
|
||||||
|
tool = kwargs["tool_name"]
|
||||||
|
args = kwargs["args"]
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Block Dangerous Tools
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ~/.hermes/plugins/tool-guard/__init__.py
|
||||||
|
BLOCKED = {"terminal", "write_file"}
|
||||||
|
|
||||||
|
def guard(**kwargs):
|
||||||
|
if kwargs["tool_name"] in BLOCKED:
|
||||||
|
print(f"⚠ Blocked tool call: {kwargs['tool_name']}")
|
||||||
|
|
||||||
|
def register(ctx):
|
||||||
|
ctx.register_hook("pre_tool_call", guard)
|
||||||
|
```
|
||||||
|
|
||||||
|
See the **[Plugins guide](/docs/user-guide/features/plugins)** for full details on creating plugins.
|
||||||
|
|||||||
@@ -46,14 +46,16 @@ Project-local plugins under `./.hermes/plugins/` are disabled by default. Enable
|
|||||||
|
|
||||||
## Available hooks
|
## Available hooks
|
||||||
|
|
||||||
|
Plugins can register callbacks for these lifecycle events. See the **[Event Hooks page](/docs/user-guide/features/hooks#plugin-hooks)** for full details, callback signatures, and examples.
|
||||||
|
|
||||||
| Hook | Fires when |
|
| Hook | Fires when |
|
||||||
|------|-----------|
|
|------|-----------|
|
||||||
| `pre_tool_call` | Before any tool executes |
|
| `pre_tool_call` | Before any tool executes |
|
||||||
| `post_tool_call` | After any tool returns |
|
| `post_tool_call` | After any tool returns |
|
||||||
| `pre_llm_call` | Before LLM API request |
|
| `pre_llm_call` | Before LLM API request *(planned)* |
|
||||||
| `post_llm_call` | After LLM API response |
|
| `post_llm_call` | After LLM API response *(planned)* |
|
||||||
| `on_session_start` | Session begins |
|
| `on_session_start` | Session begins *(planned)* |
|
||||||
| `on_session_end` | Session ends |
|
| `on_session_end` | Session ends *(planned)* |
|
||||||
|
|
||||||
## Slash commands
|
## Slash commands
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user