mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 15:01:34 +08:00
Compare commits
11 Commits
opencode-p
...
docs/hooks
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a18884a3d0 | ||
|
|
29d3f1216b | ||
|
|
fe37a53b75 | ||
|
|
b6ef1deafd | ||
|
|
0f3c191ef1 | ||
|
|
7cdf4efe05 | ||
|
|
adee8d1b5f | ||
|
|
f5b84dddfd | ||
|
|
4549a2f51a | ||
|
|
466720c2f3 | ||
|
|
fccd7a2ab4 |
@@ -40,7 +40,7 @@ _MIN_SUMMARY_TOKENS = 2000
|
||||
# Proportion of compressed content to allocate for summary
|
||||
_SUMMARY_RATIO = 0.20
|
||||
# Absolute ceiling for summary tokens (even on very large context windows)
|
||||
_SUMMARY_TOKENS_CEILING = 32_000
|
||||
_SUMMARY_TOKENS_CEILING = 12_000
|
||||
|
||||
# Placeholder used when pruning old tool results
|
||||
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||
@@ -63,10 +63,10 @@ class ContextCompressor:
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
threshold_percent: float = 0.80,
|
||||
threshold_percent: float = 0.50,
|
||||
protect_first_n: int = 3,
|
||||
protect_last_n: int = 20,
|
||||
summary_target_ratio: float = 0.40,
|
||||
summary_target_ratio: float = 0.20,
|
||||
quiet_mode: bool = False,
|
||||
summary_model_override: str = None,
|
||||
base_url: str = "",
|
||||
@@ -92,8 +92,8 @@ class ContextCompressor:
|
||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||
self.compression_count = 0
|
||||
|
||||
# Derive token budgets from the target ratio and context length
|
||||
target_tokens = int(self.context_length * self.summary_target_ratio)
|
||||
# Derive token budgets: ratio is relative to the threshold, not total context
|
||||
target_tokens = int(self.threshold_tokens * self.summary_target_ratio)
|
||||
self.tail_token_budget = target_tokens
|
||||
self.max_summary_tokens = min(
|
||||
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
|
||||
|
||||
@@ -236,23 +236,24 @@ browser:
|
||||
# 5. Summarizes middle turns using a fast/cheap model
|
||||
# 6. Inserts summary as a user message, continues conversation seamlessly
|
||||
#
|
||||
# Post-compression size scales with the model's context window via target_ratio:
|
||||
# MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
|
||||
# GPT-5 1M context → ~400K post-compression (at 0.40 ratio)
|
||||
# Post-compression tail budget is target_ratio × threshold × context_length:
|
||||
# 200K context, threshold 0.50, ratio 0.20 → 20K tokens of recent tail preserved
|
||||
# 1M context, threshold 0.50, ratio 0.20 → 100K tokens of recent tail preserved
|
||||
#
|
||||
compression:
|
||||
# Enable automatic context compression (default: true)
|
||||
# Set to false if you prefer to manage context manually or want errors on overflow
|
||||
enabled: true
|
||||
|
||||
# Trigger compression at this % of model's context limit (default: 0.80 = 80%)
|
||||
# Trigger compression at this % of model's context limit (default: 0.50 = 50%)
|
||||
# Lower values = more aggressive compression, higher values = compress later
|
||||
threshold: 0.80
|
||||
threshold: 0.50
|
||||
|
||||
# Target post-compression size as a fraction of context window (default: 0.40 = 40%)
|
||||
# Controls how much context survives compression. Tail token budget and summary
|
||||
# cap scale with this value. Range: 0.10 - 0.80
|
||||
target_ratio: 0.40
|
||||
# Fraction of the threshold to preserve as recent tail (default: 0.20 = 20%)
|
||||
# e.g. 20% of 50% threshold = 10% of total context kept as recent messages.
|
||||
# Summary output is separately capped at 12K tokens (Gemini output limit).
|
||||
# Range: 0.10 - 0.80
|
||||
target_ratio: 0.20
|
||||
|
||||
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
|
||||
# Higher values keep more recent conversation intact at the cost of more aggressive
|
||||
|
||||
6
cli.py
6
cli.py
@@ -1509,10 +1509,14 @@ class HermesCLI:
|
||||
|
||||
self._reasoning_buf = getattr(self, "_reasoning_buf", "") + text
|
||||
|
||||
# Emit complete lines
|
||||
# Emit complete lines, and force-flush long partial lines so
|
||||
# reasoning is visible in real-time even without newlines.
|
||||
while "\n" in self._reasoning_buf:
|
||||
line, self._reasoning_buf = self._reasoning_buf.split("\n", 1)
|
||||
_cprint(f"{_DIM}{line}{_RST}")
|
||||
if len(self._reasoning_buf) > 80:
|
||||
_cprint(f"{_DIM}{self._reasoning_buf}{_RST}")
|
||||
self._reasoning_buf = ""
|
||||
|
||||
def _close_reasoning_box(self) -> None:
|
||||
"""Close the live reasoning box if it's open."""
|
||||
|
||||
@@ -163,8 +163,8 @@ DEFAULT_CONFIG = {
|
||||
|
||||
"compression": {
|
||||
"enabled": True,
|
||||
"threshold": 0.80, # compress when context usage exceeds this ratio
|
||||
"target_ratio": 0.40, # fraction of context to preserve as recent tail
|
||||
"threshold": 0.50, # compress when context usage exceeds this ratio
|
||||
"target_ratio": 0.20, # fraction of threshold to preserve as recent tail
|
||||
"protect_last_n": 20, # minimum recent messages to keep uncompressed
|
||||
"summary_model": "", # empty = use main configured model
|
||||
"summary_provider": "auto",
|
||||
@@ -1686,8 +1686,8 @@ def show_config():
|
||||
enabled = compression.get('enabled', True)
|
||||
print(f" Enabled: {'yes' if enabled else 'no'}")
|
||||
if enabled:
|
||||
print(f" Threshold: {compression.get('threshold', 0.80) * 100:.0f}%")
|
||||
print(f" Target ratio: {compression.get('target_ratio', 0.40) * 100:.0f}% of context preserved")
|
||||
print(f" Threshold: {compression.get('threshold', 0.50) * 100:.0f}%")
|
||||
print(f" Target ratio: {compression.get('target_ratio', 0.20) * 100:.0f}% of threshold preserved")
|
||||
print(f" Protect last: {compression.get('protect_last_n', 20)} messages")
|
||||
_sm = compression.get('summary_model', '') or '(main model)'
|
||||
print(f" Model: {_sm}")
|
||||
|
||||
118
run_agent.py
118
run_agent.py
@@ -1009,10 +1009,10 @@ class AIAgent:
|
||||
_compression_cfg = _agent_cfg.get("compression", {})
|
||||
if not isinstance(_compression_cfg, dict):
|
||||
_compression_cfg = {}
|
||||
compression_threshold = float(_compression_cfg.get("threshold", 0.80))
|
||||
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
|
||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
||||
compression_summary_model = _compression_cfg.get("summary_model") or None
|
||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
|
||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20))
|
||||
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
||||
|
||||
# Read explicit context_length override from model config
|
||||
@@ -3585,7 +3585,20 @@ class AIAgent:
|
||||
|
||||
def _call_chat_completions():
|
||||
"""Stream a chat completions response."""
|
||||
stream_kwargs = {**api_kwargs, "stream": True, "stream_options": {"include_usage": True}}
|
||||
import httpx as _httpx
|
||||
_base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 900.0))
|
||||
_stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 60.0))
|
||||
stream_kwargs = {
|
||||
**api_kwargs,
|
||||
"stream": True,
|
||||
"stream_options": {"include_usage": True},
|
||||
"timeout": _httpx.Timeout(
|
||||
connect=30.0,
|
||||
read=_stream_read_timeout,
|
||||
write=_base_timeout,
|
||||
pool=30.0,
|
||||
),
|
||||
}
|
||||
request_client_holder["client"] = self._create_request_openai_client(
|
||||
reason="chat_completion_stream_request"
|
||||
)
|
||||
@@ -3653,6 +3666,7 @@ class AIAgent:
|
||||
name = entry["function"]["name"]
|
||||
if name and idx not in tool_gen_notified:
|
||||
tool_gen_notified.add(idx)
|
||||
_fire_first_delta()
|
||||
self._fire_tool_gen_started(name)
|
||||
|
||||
if chunk.choices[0].finish_reason:
|
||||
@@ -3721,6 +3735,7 @@ class AIAgent:
|
||||
has_tool_use = True
|
||||
tool_name = getattr(block, "name", None)
|
||||
if tool_name:
|
||||
_fire_first_delta()
|
||||
self._fire_tool_gen_started(tool_name)
|
||||
|
||||
elif event_type == "content_block_delta":
|
||||
@@ -3742,29 +3757,84 @@ class AIAgent:
|
||||
return stream.get_final_message()
|
||||
|
||||
def _call():
|
||||
import httpx as _httpx
|
||||
|
||||
_max_stream_retries = int(os.getenv("HERMES_STREAM_RETRIES", 2))
|
||||
|
||||
try:
|
||||
if self.api_mode == "anthropic_messages":
|
||||
self._try_refresh_anthropic_client_credentials()
|
||||
result["response"] = _call_anthropic()
|
||||
else:
|
||||
result["response"] = _call_chat_completions()
|
||||
except Exception as e:
|
||||
if deltas_were_sent["yes"]:
|
||||
# Streaming failed AFTER some tokens were already delivered
|
||||
# to consumers. Don't fall back — that would cause
|
||||
# double-delivery (partial streamed + full non-streamed).
|
||||
# Let the error propagate; the partial content already
|
||||
# reached the user via the stream.
|
||||
logger.warning("Streaming failed after partial delivery, not falling back: %s", e)
|
||||
result["error"] = e
|
||||
else:
|
||||
# Streaming failed before any tokens reached consumers.
|
||||
# Safe to fall back to the standard non-streaming path.
|
||||
logger.info("Streaming failed before delivery, falling back to non-streaming: %s", e)
|
||||
for _stream_attempt in range(_max_stream_retries + 1):
|
||||
try:
|
||||
result["response"] = self._interruptible_api_call(api_kwargs)
|
||||
except Exception as fallback_err:
|
||||
result["error"] = fallback_err
|
||||
if self.api_mode == "anthropic_messages":
|
||||
self._try_refresh_anthropic_client_credentials()
|
||||
result["response"] = _call_anthropic()
|
||||
else:
|
||||
result["response"] = _call_chat_completions()
|
||||
return # success
|
||||
except Exception as e:
|
||||
if deltas_were_sent["yes"]:
|
||||
# Streaming failed AFTER some tokens were already
|
||||
# delivered. Don't retry or fall back — partial
|
||||
# content already reached the user.
|
||||
logger.warning(
|
||||
"Streaming failed after partial delivery, not retrying: %s", e
|
||||
)
|
||||
result["error"] = e
|
||||
return
|
||||
|
||||
_is_timeout = isinstance(
|
||||
e, (_httpx.ReadTimeout, _httpx.ConnectTimeout, _httpx.PoolTimeout)
|
||||
)
|
||||
_is_conn_err = isinstance(
|
||||
e, (_httpx.ConnectError, _httpx.RemoteProtocolError, ConnectionError)
|
||||
)
|
||||
|
||||
if _is_timeout or _is_conn_err:
|
||||
# Transient network / timeout error. Retry the
|
||||
# streaming request with a fresh connection rather
|
||||
# than falling back to non-streaming (which would
|
||||
# hang for up to 15 min on the same dead server).
|
||||
if _stream_attempt < _max_stream_retries:
|
||||
logger.info(
|
||||
"Streaming attempt %s/%s failed (%s: %s), "
|
||||
"retrying with fresh connection...",
|
||||
_stream_attempt + 1,
|
||||
_max_stream_retries + 1,
|
||||
type(e).__name__,
|
||||
e,
|
||||
)
|
||||
# Close the stale request client before retry
|
||||
stale = request_client_holder.get("client")
|
||||
if stale is not None:
|
||||
self._close_request_openai_client(
|
||||
stale, reason="stream_retry_cleanup"
|
||||
)
|
||||
request_client_holder["client"] = None
|
||||
continue
|
||||
# Exhausted retries — propagate to outer loop
|
||||
logger.warning(
|
||||
"Streaming exhausted %s retries on transient error: %s",
|
||||
_max_stream_retries + 1,
|
||||
e,
|
||||
)
|
||||
result["error"] = e
|
||||
return
|
||||
|
||||
# Non-transient error (e.g. "streaming not supported",
|
||||
# auth error, 4xx). Fall back to non-streaming once.
|
||||
err_msg = str(e).lower()
|
||||
if "stream" in err_msg and "not supported" in err_msg:
|
||||
logger.info(
|
||||
"Streaming not supported, falling back to non-streaming: %s", e
|
||||
)
|
||||
try:
|
||||
result["response"] = self._interruptible_api_call(api_kwargs)
|
||||
except Exception as fallback_err:
|
||||
result["error"] = fallback_err
|
||||
return
|
||||
|
||||
# Unknown error — propagate to outer retry loop
|
||||
result["error"] = e
|
||||
return
|
||||
finally:
|
||||
request_client = request_client_holder.get("client")
|
||||
if request_client is not None:
|
||||
|
||||
@@ -519,24 +519,26 @@ class TestSummaryTargetRatio:
|
||||
"""Verify that summary_target_ratio properly scales budgets with context window."""
|
||||
|
||||
def test_tail_budget_scales_with_context(self):
|
||||
"""Tail token budget should be context_length * summary_target_ratio."""
|
||||
"""Tail token budget should be threshold_tokens * summary_target_ratio."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||
assert c.tail_token_budget == 80_000
|
||||
# 200K * 0.50 threshold * 0.40 ratio = 40K
|
||||
assert c.tail_token_budget == 40_000
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||
assert c.tail_token_budget == 400_000
|
||||
# 1M * 0.50 threshold * 0.40 ratio = 200K
|
||||
assert c.tail_token_budget == 200_000
|
||||
|
||||
def test_summary_cap_scales_with_context(self):
|
||||
"""Max summary tokens should be 5% of context, capped at 32K."""
|
||||
"""Max summary tokens should be 5% of context, capped at 12K."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.max_summary_tokens == 10_000 # 200K * 0.05
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.max_summary_tokens == 32_000 # capped at ceiling
|
||||
assert c.max_summary_tokens == 12_000 # capped at 12K ceiling
|
||||
|
||||
def test_ratio_clamped(self):
|
||||
"""Ratio should be clamped to [0.10, 0.80]."""
|
||||
@@ -548,12 +550,12 @@ class TestSummaryTargetRatio:
|
||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95)
|
||||
assert c.summary_target_ratio == 0.80
|
||||
|
||||
def test_default_threshold_is_80_percent(self):
|
||||
"""Default compression threshold should be 80%."""
|
||||
def test_default_threshold_is_50_percent(self):
|
||||
"""Default compression threshold should be 50%."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.threshold_percent == 0.80
|
||||
assert c.threshold_tokens == 80_000
|
||||
assert c.threshold_percent == 0.50
|
||||
assert c.threshold_tokens == 50_000
|
||||
|
||||
def test_default_protect_last_n_is_20(self):
|
||||
"""Default protect_last_n should be 20."""
|
||||
|
||||
@@ -1567,6 +1567,20 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
||||
vision_model = _get_vision_model()
|
||||
logger.debug("browser_vision: analysing screenshot (%d bytes)",
|
||||
len(image_data))
|
||||
|
||||
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
|
||||
# Local vision models (llama.cpp, ollama) can take well over 30s for
|
||||
# screenshot analysis, so the default must be generous.
|
||||
vision_timeout = 120.0
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
_cfg = load_config()
|
||||
_vt = _cfg.get("auxiliary", {}).get("vision", {}).get("timeout")
|
||||
if _vt is not None:
|
||||
vision_timeout = float(_vt)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
call_kwargs = {
|
||||
"task": "vision",
|
||||
"messages": [
|
||||
@@ -1580,6 +1594,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
||||
],
|
||||
"max_tokens": 2000,
|
||||
"temperature": 0.1,
|
||||
"timeout": vision_timeout,
|
||||
}
|
||||
if vision_model:
|
||||
call_kwargs["model"] = vision_model
|
||||
|
||||
@@ -179,6 +179,58 @@ async def _summarize_session(
|
||||
return None
|
||||
|
||||
|
||||
def _list_recent_sessions(db, limit: int, current_session_id: str = None) -> str:
|
||||
"""Return metadata for the most recent sessions (no LLM calls)."""
|
||||
try:
|
||||
sessions = db.list_sessions_rich(limit=limit + 5) # fetch extra to skip current
|
||||
|
||||
# Resolve current session lineage to exclude it
|
||||
current_root = None
|
||||
if current_session_id:
|
||||
try:
|
||||
sid = current_session_id
|
||||
visited = set()
|
||||
while sid and sid not in visited:
|
||||
visited.add(sid)
|
||||
s = db.get_session(sid)
|
||||
parent = s.get("parent_session_id") if s else None
|
||||
sid = parent if parent else None
|
||||
current_root = max(visited, key=len) if visited else current_session_id
|
||||
except Exception:
|
||||
current_root = current_session_id
|
||||
|
||||
results = []
|
||||
for s in sessions:
|
||||
sid = s.get("id", "")
|
||||
if current_root and (sid == current_root or sid == current_session_id):
|
||||
continue
|
||||
# Skip child/delegation sessions (they have parent_session_id)
|
||||
if s.get("parent_session_id"):
|
||||
continue
|
||||
results.append({
|
||||
"session_id": sid,
|
||||
"title": s.get("title") or None,
|
||||
"source": s.get("source", ""),
|
||||
"started_at": s.get("started_at", ""),
|
||||
"last_active": s.get("last_active", ""),
|
||||
"message_count": s.get("message_count", 0),
|
||||
"preview": s.get("preview", ""),
|
||||
})
|
||||
if len(results) >= limit:
|
||||
break
|
||||
|
||||
return json.dumps({
|
||||
"success": True,
|
||||
"mode": "recent",
|
||||
"results": results,
|
||||
"count": len(results),
|
||||
"message": f"Showing {len(results)} most recent sessions. Use a keyword query to search specific topics.",
|
||||
}, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error("Error listing recent sessions: %s", e, exc_info=True)
|
||||
return json.dumps({"success": False, "error": f"Failed to list recent sessions: {e}"}, ensure_ascii=False)
|
||||
|
||||
|
||||
def session_search(
|
||||
query: str,
|
||||
role_filter: str = None,
|
||||
@@ -195,11 +247,14 @@ def session_search(
|
||||
if db is None:
|
||||
return json.dumps({"success": False, "error": "Session database not available."}, ensure_ascii=False)
|
||||
|
||||
limit = min(limit, 5) # Cap at 5 sessions to avoid excessive LLM calls
|
||||
|
||||
# Recent sessions mode: when query is empty, return metadata for recent sessions.
|
||||
# No LLM calls — just DB queries for titles, previews, timestamps.
|
||||
if not query or not query.strip():
|
||||
return json.dumps({"success": False, "error": "Query cannot be empty."}, ensure_ascii=False)
|
||||
return _list_recent_sessions(db, limit, current_session_id)
|
||||
|
||||
query = query.strip()
|
||||
limit = min(limit, 5) # Cap at 5 sessions to avoid excessive LLM calls
|
||||
|
||||
try:
|
||||
# Parse role filter
|
||||
@@ -364,8 +419,14 @@ def check_session_search_requirements() -> bool:
|
||||
SESSION_SEARCH_SCHEMA = {
|
||||
"name": "session_search",
|
||||
"description": (
|
||||
"Search your long-term memory of past conversations. This is your recall -- "
|
||||
"Search your long-term memory of past conversations, or browse recent sessions. This is your recall -- "
|
||||
"every past session is searchable, and this tool summarizes what happened.\n\n"
|
||||
"TWO MODES:\n"
|
||||
"1. Recent sessions (no query): Call with no arguments to see what was worked on recently. "
|
||||
"Returns titles, previews, and timestamps. Zero LLM cost, instant. "
|
||||
"Start here when the user asks what were we working on or what did we do recently.\n"
|
||||
"2. Keyword search (with query): Search for specific topics across all past sessions. "
|
||||
"Returns LLM-generated summaries of matching sessions.\n\n"
|
||||
"USE THIS PROACTIVELY when:\n"
|
||||
"- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
|
||||
"- The user asks about a topic you worked on before but don't have in current context\n"
|
||||
@@ -385,7 +446,7 @@ SESSION_SEARCH_SCHEMA = {
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Search query — keywords, phrases, or boolean expressions to find in past sessions.",
|
||||
"description": "Search query — keywords, phrases, or boolean expressions to find in past sessions. Omit this parameter entirely to browse recent sessions instead (returns titles, previews, timestamps with no LLM cost).",
|
||||
},
|
||||
"role_filter": {
|
||||
"type": "string",
|
||||
@@ -397,7 +458,7 @@ SESSION_SEARCH_SCHEMA = {
|
||||
"default": 3,
|
||||
},
|
||||
},
|
||||
"required": ["query"],
|
||||
"required": [],
|
||||
},
|
||||
}
|
||||
|
||||
@@ -410,7 +471,7 @@ registry.register(
|
||||
toolset="session_search",
|
||||
schema=SESSION_SEARCH_SCHEMA,
|
||||
handler=lambda args, **kw: session_search(
|
||||
query=args.get("query", ""),
|
||||
query=args.get("query") or "",
|
||||
role_filter=args.get("role_filter"),
|
||||
limit=args.get("limit", 3),
|
||||
db=kw.get("db"),
|
||||
|
||||
@@ -1050,6 +1050,9 @@ def _get_configured_model() -> str:
|
||||
|
||||
def _resolve_trust_level(source: str) -> str:
|
||||
"""Map a source identifier to a trust level."""
|
||||
# Agent-created skills get their own permissive trust level
|
||||
if source == "agent-created":
|
||||
return "agent-created"
|
||||
# Official optional skills shipped with the repo
|
||||
if source.startswith("official/") or source == "official":
|
||||
return "builtin"
|
||||
|
||||
@@ -325,8 +325,9 @@ async def vision_analyze_tool(
|
||||
logger.info("Processing image with vision model...")
|
||||
|
||||
# Call the vision API via centralized router.
|
||||
# Read timeout from config.yaml (auxiliary.vision.timeout), default 30s.
|
||||
vision_timeout = 30.0
|
||||
# Read timeout from config.yaml (auxiliary.vision.timeout), default 120s.
|
||||
# Local vision models (llama.cpp, ollama) can take well over 30s.
|
||||
vision_timeout = 120.0
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
_cfg = load_config()
|
||||
|
||||
@@ -6,9 +6,20 @@ description: "Run custom code at key lifecycle points — log activity, send ale
|
||||
|
||||
# Event Hooks
|
||||
|
||||
The hooks system lets you run custom code at key points in the agent lifecycle — session creation, slash commands, each tool-calling step, and more. Hooks fire automatically during gateway operation without blocking the main agent pipeline.
|
||||
Hermes has two hook systems that run custom code at key lifecycle points:
|
||||
|
||||
## Creating a Hook
|
||||
| System | Registered via | Runs in | Use case |
|
||||
|--------|---------------|---------|----------|
|
||||
| **[Gateway hooks](#gateway-event-hooks)** | `HOOK.yaml` + `handler.py` in `~/.hermes/hooks/` | Gateway only | Logging, alerts, webhooks |
|
||||
| **[Plugin hooks](#plugin-hooks)** | `ctx.register_hook()` in a [plugin](/docs/user-guide/features/plugins) | CLI + Gateway | Tool interception, metrics, guardrails |
|
||||
|
||||
Both systems are non-blocking — errors in any hook are caught and logged, never crashing the agent.
|
||||
|
||||
## Gateway Event Hooks
|
||||
|
||||
Gateway hooks fire automatically during gateway operation (Telegram, Discord, Slack, WhatsApp) without blocking the main agent pipeline.
|
||||
|
||||
### Creating a Hook
|
||||
|
||||
Each hook is a directory under `~/.hermes/hooks/` containing two files:
|
||||
|
||||
@@ -19,7 +30,7 @@ Each hook is a directory under `~/.hermes/hooks/` containing two files:
|
||||
└── handler.py # Python handler function
|
||||
```
|
||||
|
||||
### HOOK.yaml
|
||||
#### HOOK.yaml
|
||||
|
||||
```yaml
|
||||
name: my-hook
|
||||
@@ -32,7 +43,7 @@ events:
|
||||
|
||||
The `events` list determines which events trigger your handler. You can subscribe to any combination of events, including wildcards like `command:*`.
|
||||
|
||||
### handler.py
|
||||
#### handler.py
|
||||
|
||||
```python
|
||||
import json
|
||||
@@ -58,25 +69,26 @@ async def handle(event_type: str, context: dict):
|
||||
- Can be `async def` or regular `def` — both work
|
||||
- Errors are caught and logged, never crashing the agent
|
||||
|
||||
## Available Events
|
||||
### Available Events
|
||||
|
||||
| Event | When it fires | Context keys |
|
||||
|-------|---------------|--------------|
|
||||
| `gateway:startup` | Gateway process starts | `platforms` (list of active platform names) |
|
||||
| `session:start` | New messaging session created | `platform`, `user_id`, `session_id`, `session_key` |
|
||||
| `session:end` | Session ended (before reset) | `platform`, `user_id`, `session_key` |
|
||||
| `session:reset` | User ran `/new` or `/reset` | `platform`, `user_id`, `session_key` |
|
||||
| `agent:start` | Agent begins processing a message | `platform`, `user_id`, `session_id`, `message` |
|
||||
| `agent:step` | Each iteration of the tool-calling loop | `platform`, `user_id`, `session_id`, `iteration`, `tool_names` |
|
||||
| `agent:end` | Agent finishes processing | `platform`, `user_id`, `session_id`, `message`, `response` |
|
||||
| `command:*` | Any slash command executed | `platform`, `user_id`, `command`, `args` |
|
||||
|
||||
### Wildcard Matching
|
||||
#### Wildcard Matching
|
||||
|
||||
Handlers registered for `command:*` fire for any `command:` event (`command:model`, `command:reset`, etc.). Monitor all slash commands with a single subscription.
|
||||
|
||||
## Examples
|
||||
### Examples
|
||||
|
||||
### Telegram Alert on Long Tasks
|
||||
#### Telegram Alert on Long Tasks
|
||||
|
||||
Send yourself a message when the agent takes more than 10 steps:
|
||||
|
||||
@@ -109,7 +121,7 @@ async def handle(event_type: str, context: dict):
|
||||
)
|
||||
```
|
||||
|
||||
### Command Usage Logger
|
||||
#### Command Usage Logger
|
||||
|
||||
Track which slash commands are used:
|
||||
|
||||
@@ -142,7 +154,7 @@ def handle(event_type: str, context: dict):
|
||||
f.write(json.dumps(entry) + "\n")
|
||||
```
|
||||
|
||||
### Session Start Webhook
|
||||
#### Session Start Webhook
|
||||
|
||||
POST to an external service on new sessions:
|
||||
|
||||
@@ -169,7 +181,7 @@ async def handle(event_type: str, context: dict):
|
||||
}, timeout=5)
|
||||
```
|
||||
|
||||
## How It Works
|
||||
### How It Works
|
||||
|
||||
1. On gateway startup, `HookRegistry.discover_and_load()` scans `~/.hermes/hooks/`
|
||||
2. Each subdirectory with `HOOK.yaml` + `handler.py` is loaded dynamically
|
||||
@@ -178,5 +190,51 @@ async def handle(event_type: str, context: dict):
|
||||
5. Errors in any handler are caught and logged — a broken hook never crashes the agent
|
||||
|
||||
:::info
|
||||
Hooks only fire in the **gateway** (Telegram, Discord, Slack, WhatsApp). The CLI does not currently load hooks.
|
||||
Gateway hooks only fire in the **gateway** (Telegram, Discord, Slack, WhatsApp). The CLI does not load gateway hooks. For hooks that work everywhere, use [plugin hooks](#plugin-hooks).
|
||||
:::
|
||||
|
||||
## Plugin Hooks
|
||||
|
||||
[Plugins](/docs/user-guide/features/plugins) can register hooks that fire in **both CLI and gateway** sessions. These are registered programmatically via `ctx.register_hook()` in your plugin's `register()` function.
|
||||
|
||||
```python
|
||||
def register(ctx):
|
||||
ctx.register_hook("pre_tool_call", my_callback)
|
||||
ctx.register_hook("post_tool_call", my_callback)
|
||||
```
|
||||
|
||||
### Available Plugin Hooks
|
||||
|
||||
| Hook | Fires when | Callback receives |
|
||||
|------|-----------|-------------------|
|
||||
| `pre_tool_call` | Before any tool executes | `tool_name`, `args`, `task_id` |
|
||||
| `post_tool_call` | After any tool returns | `tool_name`, `args`, `result`, `task_id` |
|
||||
| `pre_llm_call` | Before LLM API request | *(planned — not yet wired)* |
|
||||
| `post_llm_call` | After LLM API response | *(planned — not yet wired)* |
|
||||
| `on_session_start` | Session begins | *(planned — not yet wired)* |
|
||||
| `on_session_end` | Session ends | *(planned — not yet wired)* |
|
||||
|
||||
Callbacks receive keyword arguments matching the columns above:
|
||||
|
||||
```python
|
||||
def my_callback(**kwargs):
|
||||
tool = kwargs["tool_name"]
|
||||
args = kwargs["args"]
|
||||
# ...
|
||||
```
|
||||
|
||||
### Example: Block Dangerous Tools
|
||||
|
||||
```python
|
||||
# ~/.hermes/plugins/tool-guard/__init__.py
|
||||
BLOCKED = {"terminal", "write_file"}
|
||||
|
||||
def guard(**kwargs):
|
||||
if kwargs["tool_name"] in BLOCKED:
|
||||
print(f"⚠ Blocked tool call: {kwargs['tool_name']}")
|
||||
|
||||
def register(ctx):
|
||||
ctx.register_hook("pre_tool_call", guard)
|
||||
```
|
||||
|
||||
See the **[Plugins guide](/docs/user-guide/features/plugins)** for full details on creating plugins.
|
||||
|
||||
@@ -46,14 +46,16 @@ Project-local plugins under `./.hermes/plugins/` are disabled by default. Enable
|
||||
|
||||
## Available hooks
|
||||
|
||||
Plugins can register callbacks for these lifecycle events. See the **[Event Hooks page](/docs/user-guide/features/hooks#plugin-hooks)** for full details, callback signatures, and examples.
|
||||
|
||||
| Hook | Fires when |
|
||||
|------|-----------|
|
||||
| `pre_tool_call` | Before any tool executes |
|
||||
| `post_tool_call` | After any tool returns |
|
||||
| `pre_llm_call` | Before LLM API request |
|
||||
| `post_llm_call` | After LLM API response |
|
||||
| `on_session_start` | Session begins |
|
||||
| `on_session_end` | Session ends |
|
||||
| `pre_llm_call` | Before LLM API request *(planned)* |
|
||||
| `post_llm_call` | After LLM API response *(planned)* |
|
||||
| `on_session_start` | Session begins *(planned)* |
|
||||
| `on_session_end` | Session ends *(planned)* |
|
||||
|
||||
## Slash commands
|
||||
|
||||
|
||||
Reference in New Issue
Block a user