mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 12:18:44 +08:00
Compare commits
2 Commits
opencode-p
...
hermes/her
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd65d1d287 | ||
|
|
c943fedf9d |
@@ -100,6 +100,26 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
|
||||
if not isinstance(function_args, dict):
|
||||
function_args = {}
|
||||
|
||||
# ── Tool Search unwrap ────────────────────────────────────────
|
||||
# When the model invokes the tool_call bridge, peel it open so
|
||||
# every downstream check (checkpointing, guardrails, plugin
|
||||
# pre-tool-call hooks, the display/activity feed, the post-call
|
||||
# callback) sees the underlying tool — not the bridge. This is
|
||||
# the OpenClaw lesson: hooks must observe the real tool name.
|
||||
#
|
||||
# The original tool_call entry on ``tool_call.function`` is left
|
||||
# untouched so the conversation transcript and the matching
|
||||
# tool_call_id are preserved exactly as the model emitted them.
|
||||
try:
|
||||
from tools import tool_search as _ts
|
||||
if function_name == _ts.TOOL_CALL_NAME:
|
||||
_underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
|
||||
if not _err and _underlying:
|
||||
function_name = _underlying
|
||||
function_args = _underlying_args
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Checkpoint for file-mutating tools
|
||||
if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
|
||||
try:
|
||||
@@ -496,6 +516,17 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
|
||||
if not isinstance(function_args, dict):
|
||||
function_args = {}
|
||||
|
||||
# Tool Search unwrap — see execute_tool_calls_concurrent for the full rationale.
|
||||
try:
|
||||
from tools import tool_search as _ts
|
||||
if function_name == _ts.TOOL_CALL_NAME:
|
||||
_underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
|
||||
if not _err and _underlying:
|
||||
function_name = _underlying
|
||||
function_args = _underlying_args
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Check plugin hooks for a block directive before executing.
|
||||
_block_msg: Optional[str] = None
|
||||
try:
|
||||
|
||||
@@ -1576,6 +1576,38 @@ DEFAULT_CONFIG = {
|
||||
"mode": "project",
|
||||
},
|
||||
|
||||
# Tool Search (progressive disclosure for large tool surfaces).
|
||||
# When the model is connected to many MCP servers or non-core plugin
|
||||
# tools, their JSON schemas can consume a substantial fraction of the
|
||||
# context window on every turn. When enabled, those tools are replaced
|
||||
# in the model-facing tools array with three bridge tools —
|
||||
# tool_search / tool_describe / tool_call — and surfaced on demand.
|
||||
#
|
||||
# Core Hermes tools (terminal, read_file, write_file, patch,
|
||||
# search_files, todo, memory, browser_*, etc.) are NEVER deferred.
|
||||
# See tools/tool_search.py for full design notes and the
|
||||
# openclaw-tool-search-report PDF in this PR for the rationale.
|
||||
"tools": {
|
||||
"tool_search": {
|
||||
# "auto" (default) — activate only when deferrable tool schemas
|
||||
# exceed ``threshold_pct`` of the active model's context length,
|
||||
# so small toolsets pay no overhead.
|
||||
# "on" — always activate when there is at least one deferrable
|
||||
# tool. Use when you have many MCP servers and want maximum
|
||||
# token reduction unconditionally.
|
||||
# "off" — disable entirely. Tools-array assembly is a pass-through.
|
||||
"enabled": "auto",
|
||||
# Percentage of context length at which "auto" mode kicks in.
|
||||
# 10 matches the Claude Code default. Range 0..100.
|
||||
"threshold_pct": 10,
|
||||
# When the model calls tool_search without a ``limit`` argument,
|
||||
# how many hits to return. Range 1..max_search_limit.
|
||||
"search_default_limit": 5,
|
||||
# Hard upper bound the model can request via ``limit``. Range 1..50.
|
||||
"max_search_limit": 20,
|
||||
},
|
||||
},
|
||||
|
||||
# Logging — controls file logging to ~/.hermes/logs/.
|
||||
# agent.log captures INFO+ (all agent activity); errors.log captures WARNING+.
|
||||
"logging": {
|
||||
|
||||
108
model_tools.py
108
model_tools.py
@@ -265,6 +265,7 @@ def get_tool_definitions(
|
||||
enabled_toolsets: List[str] = None,
|
||||
disabled_toolsets: List[str] = None,
|
||||
quiet_mode: bool = False,
|
||||
skip_tool_search_assembly: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get tool definitions for model API calls with toolset-based filtering.
|
||||
@@ -275,6 +276,11 @@ def get_tool_definitions(
|
||||
enabled_toolsets: Only include tools from these toolsets.
|
||||
disabled_toolsets: Exclude tools from these toolsets (if enabled_toolsets is None).
|
||||
quiet_mode: Suppress status prints.
|
||||
skip_tool_search_assembly: When True, return the pre-assembly tool list
|
||||
(raw schemas for every enabled tool). Used internally by the
|
||||
tool_search / tool_describe bridge handlers so they can read the
|
||||
real catalog, not the already-collapsed one. Public callers should
|
||||
leave this False.
|
||||
|
||||
Returns:
|
||||
Filtered list of OpenAI-format tool definitions.
|
||||
@@ -301,6 +307,7 @@ def get_tool_definitions(
|
||||
registry._generation,
|
||||
cfg_fp,
|
||||
bool(os.environ.get("HERMES_KANBAN_TASK")),
|
||||
bool(skip_tool_search_assembly),
|
||||
)
|
||||
cached = _tool_defs_cache.get(cache_key)
|
||||
if cached is not None:
|
||||
@@ -312,7 +319,8 @@ def get_tool_definitions(
|
||||
# schemas are treated as read-only by all known callers.
|
||||
return list(cached)
|
||||
|
||||
result = _compute_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode)
|
||||
result = _compute_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode,
|
||||
skip_tool_search_assembly=skip_tool_search_assembly)
|
||||
if quiet_mode:
|
||||
# Cache the freshly-computed list, but hand callers a shallow copy so
|
||||
# downstream mutations (e.g. run_agent appending memory/LCM tool
|
||||
@@ -330,6 +338,7 @@ def _compute_tool_definitions(
|
||||
enabled_toolsets: List[str] = None,
|
||||
disabled_toolsets: List[str] = None,
|
||||
quiet_mode: bool = False,
|
||||
skip_tool_search_assembly: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Uncached implementation of :func:`get_tool_definitions`."""
|
||||
# Determine which tool names the caller wants
|
||||
@@ -481,9 +490,61 @@ def _compute_tool_definitions(
|
||||
except Exception as e: # pragma: no cover — defensive
|
||||
logger.warning("Schema sanitization skipped: %s", e)
|
||||
|
||||
# ── Tool Search (progressive disclosure) ────────────────────────────
|
||||
# Conditionally replace MCP + plugin (non-core) tools with three bridge
|
||||
# tools (tool_search / tool_describe / tool_call) when the deferrable
|
||||
# surface exceeds the configured threshold (default 10% of context
|
||||
# window). Core Hermes tools (toolsets._HERMES_CORE_TOOLS) are NEVER
|
||||
# deferred. See tools/tool_search.py for full design notes.
|
||||
#
|
||||
# This is deliberately the last step before returning — sanitization
|
||||
# has already normalized schemas, and the assembly is idempotent in
|
||||
# case some caller invokes get_tool_definitions twice.
|
||||
try:
|
||||
from tools.tool_search import assemble_tool_defs, load_config as _load_ts_config
|
||||
ts_cfg = _load_ts_config()
|
||||
if not skip_tool_search_assembly and ts_cfg.enabled != "off":
|
||||
context_length = _resolve_active_context_length()
|
||||
assembly = assemble_tool_defs(
|
||||
filtered_tools,
|
||||
context_length=context_length,
|
||||
config=ts_cfg,
|
||||
)
|
||||
if assembly.activated and not quiet_mode:
|
||||
print(
|
||||
f"🔎 Tool Search: {assembly.deferred_count} MCP/plugin tools deferred "
|
||||
f"(~{assembly.deferred_tokens} tokens) behind tool_search/describe/call. "
|
||||
f"Threshold ~{assembly.threshold_tokens} tokens."
|
||||
)
|
||||
filtered_tools = assembly.tool_defs
|
||||
except Exception as e: # pragma: no cover — never break tool loading
|
||||
logger.warning("Tool search assembly skipped: %s", e)
|
||||
|
||||
return filtered_tools
|
||||
|
||||
|
||||
def _resolve_active_context_length() -> int:
    """Return the configured model's context length, or 0 when unknown.

    Reads the ``model`` section of the Hermes CLI config, picks its ``model``
    entry (falling back to ``default``), and asks
    ``agent.model_metadata.get_model_context_length`` for the window size.

    Returns:
        The context length as an ``int``. 0 means "could not be resolved";
        ``should_activate`` falls back to a fixed token cutoff in that case.
    """
    try:
        # Imported lazily so a missing/broken config module degrades to 0
        # instead of breaking tool loading.
        from hermes_cli.config import load_config as _load

        settings = _load() or {}
        section = settings.get("model")
        if not isinstance(section, dict):
            # A non-mapping ``model`` entry carries no usable model id here.
            section = {}

        chosen = section.get("model") or section.get("default") or ""
        model_id = chosen.strip()
        if model_id:
            from agent.model_metadata import get_model_context_length

            return int(get_model_context_length(model_id) or 0)
        return 0
    except Exception as exc:
        # Best effort: any failure is reported as "unknown" rather than raised.
        logger.debug("Could not resolve active context length: %s", exc)
        return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# handle_function_call (the main dispatcher)
|
||||
# =============================================================================
|
||||
@@ -767,6 +828,51 @@ def handle_function_call(
|
||||
# Coerce string arguments to their schema-declared types (e.g. "42"→42)
|
||||
function_args = coerce_tool_args(function_name, function_args)
|
||||
|
||||
# ── Tool Search bridge dispatch ──────────────────────────────────
|
||||
# tool_search and tool_describe are pure catalog reads — handle them
|
||||
# inline. tool_call is unwrapped to the underlying tool so that every
|
||||
# downstream hook (pre/post, edit approval, guardrails) sees the real
|
||||
# tool name, not the bridge.
|
||||
_ts_mod = None
|
||||
try:
|
||||
from tools import tool_search as _ts_mod # noqa: F401
|
||||
except Exception:
|
||||
_ts_mod = None
|
||||
|
||||
if _ts_mod is not None and _ts_mod.is_bridge_tool(function_name):
|
||||
try:
|
||||
# Use skip_tool_search_assembly=True so we see the real catalog,
|
||||
# not the already-collapsed bridge-only list (the bridge would
|
||||
# otherwise be searching only itself).
|
||||
current_defs = get_tool_definitions(
|
||||
quiet_mode=True, skip_tool_search_assembly=True,
|
||||
) or []
|
||||
except Exception:
|
||||
current_defs = []
|
||||
if function_name == _ts_mod.TOOL_SEARCH_NAME:
|
||||
return _ts_mod.dispatch_tool_search(function_args or {},
|
||||
current_tool_defs=current_defs)
|
||||
if function_name == _ts_mod.TOOL_DESCRIBE_NAME:
|
||||
return _ts_mod.dispatch_tool_describe(function_args or {},
|
||||
current_tool_defs=current_defs)
|
||||
if function_name == _ts_mod.TOOL_CALL_NAME:
|
||||
underlying_name, underlying_args, err = _ts_mod.resolve_underlying_call(function_args or {})
|
||||
if err or not underlying_name:
|
||||
return json.dumps({"error": err or "tool_call could not be resolved"},
|
||||
ensure_ascii=False)
|
||||
# Recurse with the underlying tool. All hooks fire against the
|
||||
# real tool name. The bridge is invisible to hooks by design.
|
||||
return handle_function_call(
|
||||
function_name=underlying_name,
|
||||
function_args=underlying_args,
|
||||
task_id=task_id,
|
||||
tool_call_id=tool_call_id,
|
||||
session_id=session_id,
|
||||
user_task=user_task,
|
||||
enabled_tools=enabled_tools,
|
||||
skip_pre_tool_call_hook=skip_pre_tool_call_hook,
|
||||
)
|
||||
|
||||
try:
|
||||
if function_name in _AGENT_LOOP_TOOLS:
|
||||
return json.dumps({"error": f"{function_name} must be handled by the agent loop"})
|
||||
|
||||
45
scripts/LIVETEST_README.md
Normal file
45
scripts/LIVETEST_README.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Tool Search live test harness
|
||||
|
||||
Runs five scenarios against a real model (Claude Haiku 4.5 via OpenRouter) to
|
||||
verify that the bridge tools work end-to-end. Records transcripts in
|
||||
`scripts/out/`.
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
cd <repo root>
|
||||
python3 scripts/tool_search_livetest.py # runs all 5 scenarios x 2 modes
|
||||
python3 scripts/analyze_livetest.py # side-by-side report
|
||||
```
|
||||
|
||||
Requires `OPENROUTER_API_KEY` set or present in `~/.hermes/.env`.
|
||||
|
||||
## What it verifies
|
||||
|
||||
| Scenario | Tests |
|
||||
|----------|-------|
|
||||
| A obvious_single | BM25 retrieval on an obvious tool name (github_create_issue) |
|
||||
| B vague_paraphrased | Retrieval when the model has to paraphrase ("schedule meeting" → evt_create) |
|
||||
| C multi_tool_chain | Multi-step task chaining two deferred tools (GitHub + Slack) |
|
||||
| D core_plus_deferred | Mixed: core tool (read_file) called directly, deferred tool (Slack) via bridge |
|
||||
| E no_tool_needed | Pure-knowledge prompt; verify no spurious tool_search invocations |
|
||||
|
||||
Each scenario runs with `tool_search.enabled = on` and again with `off` for an
|
||||
A/B baseline. The harness records:
|
||||
|
||||
- bridge_calls (the tool_search / tool_describe / tool_call sequence the model emitted)
|
||||
- underlying_tool_calls (what actually ran through the registry dispatcher)
|
||||
- final_response, iteration count, elapsed time, any errors
|
||||
|
||||
## Output structure
|
||||
|
||||
```
|
||||
scripts/out/
|
||||
<scenario>__enabled.json # tool_search ON
|
||||
<scenario>__disabled.json # tool_search OFF
|
||||
_summary.json # one summary entry per run (scenario × mode)
|
||||
```
|
||||
|
||||
The 2026-05 baseline run is checked in for reference. Re-running may produce
|
||||
slightly different transcripts (the model is non-deterministic) but the
|
||||
expected_underlying_tools assertions should remain satisfied.
|
||||
114
scripts/analyze_livetest.py
Normal file
114
scripts/analyze_livetest.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare enabled vs disabled runs and produce a readable report.
|
||||
|
||||
Reads scripts/out/_summary.json and the per-scenario JSONs, prints a side-by-
|
||||
side comparison of what happened, and flags anomalies.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory holding this script; the per-scenario transcripts and
# _summary.json are read from the "out" folder next to it.
HERE = Path(__file__).resolve().parent
|
||||
OUT = HERE / "out"
|
||||
|
||||
|
||||
def load_record(scenario_id: str, mode: str, out_dir=None):
    """Load one recorded run, or return ``None`` if it was never written.

    Args:
        scenario_id: Scenario identifier, e.g. ``"A_obvious_single"``.
        mode: Run-mode suffix of the file name (``"enabled"`` / ``"disabled"``).
        out_dir: Directory containing the transcripts. Defaults to the
            module-level ``OUT`` directory.

    Returns:
        The parsed JSON content of ``<scenario_id>__<mode>.json``, or ``None``
        when that file does not exist.
    """
    base = OUT if out_dir is None else Path(out_dir)
    path = base / f"{scenario_id}__{mode}.json"
    if not path.exists():
        return None
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    return json.loads(path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def fmt_tool_seq(calls):
    """Join the ``name`` of each recorded tool call with arrows.

    Returns ``"(none)"`` when ``calls`` is empty or ``None``.
    """
    if calls:
        names = [entry["name"] for entry in calls]
        return " → ".join(names)
    return "(none)"
|
||||
|
||||
|
||||
def fmt_bridge_seq(calls):
    """Render a run's bridge-tool calls as a compact, arrow-joined string.

    ``tool_call`` entries show the wrapped tool name, ``tool_search`` entries
    show the first 30 characters of the query, and ``tool_describe`` entries
    show the described tool name. Entries with any other name are omitted.

    Transcripts record whatever the model emitted, so the arguments may be
    malformed. A missing or non-dict ``args`` falls back to the ``?``
    placeholder, and a non-string query is stringified instead of raising.

    Returns ``"(none)"`` when ``calls`` is empty or ``None``.
    """
    if not calls:
        return "(none)"
    parts = []
    for call in calls:
        kind = call["name"]
        args = call.get("args")
        if not isinstance(args, dict):
            # e.g. null, or a raw string when the arguments failed to parse
            args = {}
        if kind == "tool_call":
            parts.append(f"tool_call→{args.get('name', '?')}")
        elif kind == "tool_search":
            query = args.get("query", "?")
            if query is None:
                query = "?"
            # str() guards the slice against numeric or otherwise odd queries.
            parts.append(f"search('{str(query)[:30]}')")
        elif kind == "tool_describe":
            parts.append(f"describe({args.get('name', '?')})")
    return " → ".join(parts)
|
||||
|
||||
|
||||
def main():
    """Print a side-by-side ENABLED vs DISABLED report for every scenario.

    Scenario ids are discovered from ``_summary.json`` in ``OUT``. For each
    scenario both recorded runs are loaded with ``load_record`` and compared
    against the ``expected_underlying_tools`` of the enabled run. A run is
    counted as a failure when an expected tool was not called or the record
    carries an error.

    Exits with status 1 when the output directory or the summary is missing.
    Scenarios lacking either record are reported as skipped and excluded from
    the final ``Fails: x/y`` total.
    """
    if not OUT.exists():
        print("No output directory at", OUT)
        sys.exit(1)
    summary_path = OUT / "_summary.json"
    if not summary_path.exists():
        print("No _summary.json yet")
        sys.exit(1)

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    scenarios = sorted({row["scenario"] for row in summary})

    print("=" * 78)
    print(" Live test results: tool_search ENABLED vs DISABLED")
    print("=" * 78 + "\n")

    # Tools that are never reported as "extra" even when not expected.
    ignored_extras = {"read_file", "search_files", "terminal", "todo", "memory"}

    fails = 0
    runs_checked = 0
    for sid in scenarios:
        en = load_record(sid, "enabled")
        di = load_record(sid, "disabled")
        if not en or not di:
            # Say so explicitly instead of silently dropping the scenario.
            print(f"(skipped {sid}: missing enabled or disabled record)\n")
            continue
        expected = set(en["expected_underlying_tools"])

        print(f"┌─ {sid} ({en['scenario_description']})")
        print(f"│ Prompt: {en['prompt'][:120]}")
        print(f"│ Expected underlying tools: {sorted(expected) or '(none)'}")
        print("│")

        for label, rec in [("ENABLED ", en), ("DISABLED", di)]:
            runs_checked += 1
            called_under = [c["name"] for c in rec["underlying_tool_calls"]]
            called_set = set(called_under)
            missing = expected - called_set
            extra = called_set - expected - ignored_extras

            mark = "✓" if (expected.issubset(called_set) and not rec["error"]) else "✗"
            if mark == "✗":
                fails += 1

            print(f"│ {label} {mark} bridges={len(rec['bridge_calls']):2} underlying={len(rec['underlying_tool_calls']):2} "
                  f"iters={rec['n_iterations']:2} elapsed={rec['elapsed_seconds']:5.1f}s err={bool(rec['error'])}")
            print(f"│ underlying: {fmt_tool_seq(rec['underlying_tool_calls'])}")
            if rec["bridge_calls"]:
                print(f"│ bridges: {fmt_bridge_seq(rec['bridge_calls'])}")
            if missing:
                print(f"│ ⚠ MISSING expected tools: {sorted(missing)}")
            if extra:
                print(f"│ ⓘ extra tools called: {sorted(extra)}")
            if rec["error"]:
                # str() so a non-string error payload cannot break the slice.
                print(f"│ 💥 error: {str(rec['error'])[:200]}")

        # Bridge-trip count vs direct (interesting comparator)
        en_bridges = len(en["bridge_calls"])
        di_underlying = len(di["underlying_tool_calls"])
        en_underlying = len(en["underlying_tool_calls"])
        overhead = en_bridges + en_underlying - di_underlying
        # ``:+d`` prints an explicit sign, so a negative delta reads "-1", not "+-1".
        print(f"│ Δ round-trip cost: enabled used {en_bridges + en_underlying} calls vs disabled {di_underlying} → {overhead:+d}")
        print(f"│ Final (enabled): {(en.get('final_response') or '')[:140]}")
        print(f"│ Final (disabled): {(di.get('final_response') or '')[:140]}")
        print("└──")
        print()

    # Denominator counts only the runs that were actually compared.
    print(f"\nFails: {fails}/{runs_checked}")


if __name__ == "__main__":
    main()
|
||||
26
scripts/out/A_obvious_single__disabled.json
Normal file
26
scripts/out/A_obvious_single__disabled.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"scenario_id": "A_obvious_single",
|
||||
"scenario_description": "Single tool, obvious name in the user request",
|
||||
"tool_search_enabled": false,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Open a GitHub issue in repo 'acme/widget' titled 'Crash on startup' with body 'App crashes immediately after launch when offline.' Then tell me you're done. Don't do anything else.",
|
||||
"expected_underlying_tools": [
|
||||
"github_create_issue"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 9.67,
|
||||
"bridge_calls": [],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "github_create_issue",
|
||||
"args": {
|
||||
"repo": "acme/widget",
|
||||
"title": "Crash on startup",
|
||||
"body": "App crashes immediately after launch when offline."
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done.",
|
||||
"n_iterations": 2,
|
||||
"error": null
|
||||
}
|
||||
50
scripts/out/A_obvious_single__enabled.json
Normal file
50
scripts/out/A_obvious_single__enabled.json
Normal file
@@ -0,0 +1,50 @@
|
||||
{
|
||||
"scenario_id": "A_obvious_single",
|
||||
"scenario_description": "Single tool, obvious name in the user request",
|
||||
"tool_search_enabled": true,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Open a GitHub issue in repo 'acme/widget' titled 'Crash on startup' with body 'App crashes immediately after launch when offline.' Then tell me you're done. Don't do anything else.",
|
||||
"expected_underlying_tools": [
|
||||
"github_create_issue"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 18.51,
|
||||
"bridge_calls": [
|
||||
{
|
||||
"name": "tool_search",
|
||||
"args": {
|
||||
"query": "create github issue"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_describe",
|
||||
"args": {
|
||||
"name": "github_create_issue"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_call",
|
||||
"args": {
|
||||
"name": "github_create_issue",
|
||||
"arguments": {
|
||||
"repo": "acme/widget",
|
||||
"title": "Crash on startup",
|
||||
"body": "App crashes immediately after launch when offline."
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "github_create_issue",
|
||||
"args": {
|
||||
"repo": "acme/widget",
|
||||
"title": "Crash on startup",
|
||||
"body": "App crashes immediately after launch when offline."
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done.",
|
||||
"n_iterations": 4,
|
||||
"error": null
|
||||
}
|
||||
26
scripts/out/B_vague_paraphrased__disabled.json
Normal file
26
scripts/out/B_vague_paraphrased__disabled.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"scenario_id": "B_vague_paraphrased",
|
||||
"scenario_description": "Single tool, paraphrased intent (tests retrieval quality)",
|
||||
"tool_search_enabled": false,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Add a meeting to my schedule for tomorrow morning at 10am called 'Design review', 30 minutes long. Then tell me you're done. Don't do anything else.",
|
||||
"expected_underlying_tools": [
|
||||
"evt_create"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 11.26,
|
||||
"bridge_calls": [],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "evt_create",
|
||||
"args": {
|
||||
"title": "Design review",
|
||||
"start": "2026-05-29T10:00:00",
|
||||
"duration_min": 30
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done.",
|
||||
"n_iterations": 2,
|
||||
"error": null
|
||||
}
|
||||
50
scripts/out/B_vague_paraphrased__enabled.json
Normal file
50
scripts/out/B_vague_paraphrased__enabled.json
Normal file
@@ -0,0 +1,50 @@
|
||||
{
|
||||
"scenario_id": "B_vague_paraphrased",
|
||||
"scenario_description": "Single tool, paraphrased intent (tests retrieval quality)",
|
||||
"tool_search_enabled": true,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Add a meeting to my schedule for tomorrow morning at 10am called 'Design review', 30 minutes long. Then tell me you're done. Don't do anything else.",
|
||||
"expected_underlying_tools": [
|
||||
"evt_create"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 15.57,
|
||||
"bridge_calls": [
|
||||
{
|
||||
"name": "tool_search",
|
||||
"args": {
|
||||
"query": "calendar schedule meeting"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_describe",
|
||||
"args": {
|
||||
"name": "evt_create"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_call",
|
||||
"args": {
|
||||
"name": "evt_create",
|
||||
"arguments": {
|
||||
"title": "Design review",
|
||||
"start": "2026-05-29T10:00:00",
|
||||
"duration_min": 30
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "evt_create",
|
||||
"args": {
|
||||
"title": "Design review",
|
||||
"start": "2026-05-29T10:00:00",
|
||||
"duration_min": 30
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done.",
|
||||
"n_iterations": 4,
|
||||
"error": null
|
||||
}
|
||||
32
scripts/out/C_multi_tool_chain__disabled.json
Normal file
32
scripts/out/C_multi_tool_chain__disabled.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"scenario_id": "C_multi_tool_chain",
|
||||
"scenario_description": "Multi-step task requiring 2-3 deferred tools",
|
||||
"tool_search_enabled": false,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Find the open pull requests on repo 'acme/widget', then post a summary of how many there are to the #engineering Slack channel. Then tell me you're done.",
|
||||
"expected_underlying_tools": [
|
||||
"github_list_pulls",
|
||||
"slack_send_message"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 14.11,
|
||||
"bridge_calls": [],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "github_list_pulls",
|
||||
"args": {
|
||||
"repo": "acme/widget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "slack_send_message",
|
||||
"args": {
|
||||
"channel": "#engineering",
|
||||
"text": "Open pull requests summary for acme/widget: 1 open PR\n\n- #31163: feat(tools): tool search"
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done.",
|
||||
"n_iterations": 3,
|
||||
"error": null
|
||||
}
|
||||
64
scripts/out/C_multi_tool_chain__enabled.json
Normal file
64
scripts/out/C_multi_tool_chain__enabled.json
Normal file
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"scenario_id": "C_multi_tool_chain",
|
||||
"scenario_description": "Multi-step task requiring 2-3 deferred tools",
|
||||
"tool_search_enabled": true,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Find the open pull requests on repo 'acme/widget', then post a summary of how many there are to the #engineering Slack channel. Then tell me you're done.",
|
||||
"expected_underlying_tools": [
|
||||
"github_list_pulls",
|
||||
"slack_send_message"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 20.3,
|
||||
"bridge_calls": [
|
||||
{
|
||||
"name": "tool_search",
|
||||
"args": {
|
||||
"query": "GitHub pull requests"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_search",
|
||||
"args": {
|
||||
"query": "Slack post message channel"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_call",
|
||||
"args": {
|
||||
"name": "github_list_pulls",
|
||||
"arguments": {
|
||||
"repo": "acme/widget"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_call",
|
||||
"args": {
|
||||
"arguments": {
|
||||
"channel": "#engineering",
|
||||
"message": "Open PR Summary for acme/widget: 1 open pull request\n\n\u2022 PR #31163: feat(tools): tool search"
|
||||
},
|
||||
"name": "slack_send_message"
|
||||
}
|
||||
}
|
||||
],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "github_list_pulls",
|
||||
"args": {
|
||||
"repo": "acme/widget"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "slack_send_message",
|
||||
"args": {
|
||||
"channel": "#engineering",
|
||||
"message": "Open PR Summary for acme/widget: 1 open pull request\n\n\u2022 PR #31163: feat(tools): tool search"
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done. Found 1 open pull request in acme/widget (PR #31163: feat(tools): tool search) and posted a summary to #engineering.",
|
||||
"n_iterations": 4,
|
||||
"error": null
|
||||
}
|
||||
32
scripts/out/D_core_plus_deferred__disabled.json
Normal file
32
scripts/out/D_core_plus_deferred__disabled.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"scenario_id": "D_core_plus_deferred",
|
||||
"scenario_description": "Task uses BOTH a core tool (read_file) and a deferred tool",
|
||||
"tool_search_enabled": false,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Read the file at /tmp/livetest/notes.txt (it exists, just read it) and then post its contents to the #random Slack channel. Tell me you're done.",
|
||||
"expected_underlying_tools": [
|
||||
"read_file",
|
||||
"slack_send_message"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 9.78,
|
||||
"bridge_calls": [],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "read_file",
|
||||
"args": {
|
||||
"path": "/tmp/livetest/notes.txt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "slack_send_message",
|
||||
"args": {
|
||||
"channel": "#random",
|
||||
"text": "Hello from the test fixture."
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done.",
|
||||
"n_iterations": 3,
|
||||
"error": null
|
||||
}
|
||||
55
scripts/out/D_core_plus_deferred__enabled.json
Normal file
55
scripts/out/D_core_plus_deferred__enabled.json
Normal file
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"scenario_id": "D_core_plus_deferred",
|
||||
"scenario_description": "Task uses BOTH a core tool (read_file) and a deferred tool",
|
||||
"tool_search_enabled": true,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "Read the file at /tmp/livetest/notes.txt (it exists, just read it) and then post its contents to the #random Slack channel. Tell me you're done.",
|
||||
"expected_underlying_tools": [
|
||||
"read_file",
|
||||
"slack_send_message"
|
||||
],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 33.13,
|
||||
"bridge_calls": [
|
||||
{
|
||||
"name": "tool_search",
|
||||
"args": {
|
||||
"query": "post message Slack channel"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_describe",
|
||||
"args": {
|
||||
"name": "slack_send_message"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tool_call",
|
||||
"args": {
|
||||
"name": "slack_send_message",
|
||||
"arguments": {
|
||||
"channel": "#random",
|
||||
"text": "Hello from the test fixture."
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"underlying_tool_calls": [
|
||||
{
|
||||
"name": "read_file",
|
||||
"args": {
|
||||
"path": "/tmp/livetest/notes.txt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "slack_send_message",
|
||||
"args": {
|
||||
"channel": "#random",
|
||||
"text": "Hello from the test fixture."
|
||||
}
|
||||
}
|
||||
],
|
||||
"final_response": "Done. I read the file from /tmp/livetest/notes.txt and posted its contents to the #random Slack channel.",
|
||||
"n_iterations": 5,
|
||||
"error": null
|
||||
}
|
||||
15
scripts/out/E_no_tool_needed__disabled.json
Normal file
15
scripts/out/E_no_tool_needed__disabled.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"scenario_id": "E_no_tool_needed",
|
||||
"scenario_description": "Question doesn't need any tool \u2014 model should just answer",
|
||||
"tool_search_enabled": false,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "What's 7 times 8? Answer with just the number.",
|
||||
"expected_underlying_tools": [],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 2.76,
|
||||
"bridge_calls": [],
|
||||
"underlying_tool_calls": [],
|
||||
"final_response": "56",
|
||||
"n_iterations": 1,
|
||||
"error": null
|
||||
}
|
||||
15
scripts/out/E_no_tool_needed__enabled.json
Normal file
15
scripts/out/E_no_tool_needed__enabled.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"scenario_id": "E_no_tool_needed",
|
||||
"scenario_description": "Question doesn't need any tool \u2014 model should just answer",
|
||||
"tool_search_enabled": true,
|
||||
"model": "anthropic/claude-haiku-4.5 (via openrouter)",
|
||||
"prompt": "What's 7 times 8? Answer with just the number.",
|
||||
"expected_underlying_tools": [],
|
||||
"n_fake_tools_registered": 20,
|
||||
"elapsed_seconds": 8.25,
|
||||
"bridge_calls": [],
|
||||
"underlying_tool_calls": [],
|
||||
"final_response": "56",
|
||||
"n_iterations": 1,
|
||||
"error": null
|
||||
}
|
||||
142
scripts/out/_summary.json
Normal file
142
scripts/out/_summary.json
Normal file
@@ -0,0 +1,142 @@
|
||||
[
|
||||
{
|
||||
"scenario": "A_obvious_single",
|
||||
"enabled": true,
|
||||
"n_bridge": 3,
|
||||
"n_underlying": 1,
|
||||
"elapsed": 18.51,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"github_create_issue"
|
||||
],
|
||||
"expected": [
|
||||
"github_create_issue"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "A_obvious_single",
|
||||
"enabled": false,
|
||||
"n_bridge": 0,
|
||||
"n_underlying": 1,
|
||||
"elapsed": 9.67,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"github_create_issue"
|
||||
],
|
||||
"expected": [
|
||||
"github_create_issue"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "B_vague_paraphrased",
|
||||
"enabled": true,
|
||||
"n_bridge": 3,
|
||||
"n_underlying": 1,
|
||||
"elapsed": 15.57,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"evt_create"
|
||||
],
|
||||
"expected": [
|
||||
"evt_create"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "B_vague_paraphrased",
|
||||
"enabled": false,
|
||||
"n_bridge": 0,
|
||||
"n_underlying": 1,
|
||||
"elapsed": 11.26,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"evt_create"
|
||||
],
|
||||
"expected": [
|
||||
"evt_create"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "C_multi_tool_chain",
|
||||
"enabled": true,
|
||||
"n_bridge": 4,
|
||||
"n_underlying": 2,
|
||||
"elapsed": 20.3,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"github_list_pulls",
|
||||
"slack_send_message"
|
||||
],
|
||||
"expected": [
|
||||
"github_list_pulls",
|
||||
"slack_send_message"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "C_multi_tool_chain",
|
||||
"enabled": false,
|
||||
"n_bridge": 0,
|
||||
"n_underlying": 2,
|
||||
"elapsed": 14.11,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"github_list_pulls",
|
||||
"slack_send_message"
|
||||
],
|
||||
"expected": [
|
||||
"github_list_pulls",
|
||||
"slack_send_message"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "D_core_plus_deferred",
|
||||
"enabled": true,
|
||||
"n_bridge": 3,
|
||||
"n_underlying": 2,
|
||||
"elapsed": 33.13,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"read_file",
|
||||
"slack_send_message"
|
||||
],
|
||||
"expected": [
|
||||
"read_file",
|
||||
"slack_send_message"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "D_core_plus_deferred",
|
||||
"enabled": false,
|
||||
"n_bridge": 0,
|
||||
"n_underlying": 2,
|
||||
"elapsed": 9.78,
|
||||
"error": false,
|
||||
"underlying_tools_called": [
|
||||
"read_file",
|
||||
"slack_send_message"
|
||||
],
|
||||
"expected": [
|
||||
"read_file",
|
||||
"slack_send_message"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scenario": "E_no_tool_needed",
|
||||
"enabled": true,
|
||||
"n_bridge": 0,
|
||||
"n_underlying": 0,
|
||||
"elapsed": 8.25,
|
||||
"error": false,
|
||||
"underlying_tools_called": [],
|
||||
"expected": []
|
||||
},
|
||||
{
|
||||
"scenario": "E_no_tool_needed",
|
||||
"enabled": false,
|
||||
"n_bridge": 0,
|
||||
"n_underlying": 0,
|
||||
"elapsed": 2.76,
|
||||
"error": false,
|
||||
"underlying_tools_called": [],
|
||||
"expected": []
|
||||
}
|
||||
]
|
||||
526
scripts/tool_search_livetest.py
Normal file
526
scripts/tool_search_livetest.py
Normal file
@@ -0,0 +1,526 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Live test harness for Hermes Agent's Tool Search feature.
|
||||
|
||||
Spins up a real AIAgent against a real model, registers ~20 fake "MCP" tools
|
||||
with realistic shapes (github-like, slack-like, calendar-like, search-like),
|
||||
runs a small set of scenarios, and records exactly what the model did.
|
||||
|
||||
For each scenario we record:
|
||||
- the full message transcript
|
||||
- the sequence of tool calls (name + args) the model emitted
|
||||
- which underlying tools actually got invoked (after bridge unwrap)
|
||||
- the final assistant response
|
||||
- timing and round-trip count
|
||||
|
||||
Each scenario runs twice:
|
||||
- tool_search ENABLED (deferred behind bridges)
|
||||
- tool_search DISABLED (all tools loaded directly)
|
||||
|
||||
Output: <this script's directory>/out/<scenario_id>__<enabled|disabled>.json, plus out/_summary.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
# Snapshot the caller's environment BEFORE any hermes imports: the original
# HERMES_HOME value (restored at the end of main) and the location of the
# user's real auth file (copied into each isolated home).
ORIGINAL_HOME = os.environ.get("HERMES_HOME")
ORIGINAL_AUTH = Path.home().joinpath(".hermes", "auth.json")

# Put the directory above this script's directory at the front of sys.path so
# the in-tree packages are the ones imported.
_THIS_DIR = Path(__file__).resolve().parent
_WORKTREE_ROOT = _THIS_DIR.parent
sys.path.insert(0, str(_WORKTREE_ROOT))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fake MCP tools — realistic shape, varied difficulty for retrieval
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Catalogue of stand-in "MCP" tools used by the harness.  Every entry has:
#   name        - the tool name registered with the registry
#   description - the tool description placed in the registered schema
#   params      - {param_name: (json_schema_type, param_description)}
#   returns     - callable taking the received-arguments dict and producing
#                 the canned (JSON-serialisable) payload for that tool
FAKE_MCP_TOOLS: List[Dict[str, Any]] = [
    # ---- GitHub cluster ----
    {
        "name": "github_create_issue",
        "description": "Open a new issue in a GitHub repository. Use when the user wants to report a bug or request a feature in a repo.",
        "params": {
            "repo": ("string", "Repository in owner/name form"),
            "title": ("string", "Issue title"),
            "body": ("string", "Issue body in Markdown"),
        },
        "returns": lambda received: {
            "ok": True,
            "issue_url": f"https://github.com/{received.get('repo', 'x/y')}/issues/42",
        },
    },
    {
        "name": "github_search_repos",
        "description": "Search GitHub repositories by free-text query. Returns a ranked list of repo names with star counts.",
        "params": {
            "query": ("string", "Search terms"),
            "limit": ("integer", "Max results"),
        },
        "returns": lambda _received: {
            "results": [
                {"name": "fake/repo-1", "stars": 1200},
                {"name": "fake/repo-2", "stars": 540},
            ],
        },
    },
    {
        "name": "github_close_pr",
        "description": "Close a pull request without merging it. Use when the PR should be abandoned.",
        "params": {
            "repo": ("string", ""),
            "pr_number": ("integer", ""),
        },
        "returns": lambda _received: {"ok": True, "state": "closed"},
    },
    {
        "name": "github_list_pulls",
        "description": "List open pull requests for a repository.",
        "params": {
            "repo": ("string", ""),
        },
        "returns": lambda _received: {
            "pulls": [{"number": 31163, "title": "feat(tools): tool search"}],
        },
    },

    # ---- Slack cluster ----
    {
        "name": "slack_send_message",
        "description": "Post a message into a Slack channel as the connected workspace's app.",
        "params": {
            "channel": ("string", "Channel name with leading #"),
            "text": ("string", "Message body"),
        },
        "returns": lambda _received: {"ok": True, "ts": "1716528000.000100"},
    },
    {
        "name": "slack_list_channels",
        "description": "Return all channels visible to the connected Slack workspace bot.",
        "params": {},
        "returns": lambda _received: {
            "channels": ["#general", "#engineering", "#random"],
        },
    },
    {
        "name": "slack_set_status",
        "description": "Set the current user's Slack status (emoji + text).",
        "params": {
            "emoji": ("string", ""),
            "text": ("string", ""),
        },
        "returns": lambda _received: {"ok": True},
    },

    # ---- Calendar cluster (intentionally vague names to stress retrieval) ----
    {
        "name": "evt_create",
        "description": "Add an event to the connected calendar. Used for scheduling meetings.",
        "params": {
            "title": ("string", ""),
            "start": ("string", "ISO 8601 datetime"),
            "duration_min": ("integer", ""),
        },
        "returns": lambda _received: {"ok": True, "event_id": "evt_abc"},
    },
    {
        "name": "evt_list",
        "description": "List upcoming calendar events.",
        "params": {
            "max_results": ("integer", ""),
        },
        "returns": lambda _received: {
            "events": [
                {"id": "evt_1", "title": "Standup", "start": "2026-05-25T09:00:00Z"},
            ],
        },
    },

    # ---- Knowledge / docs (paraphrased name to stress retrieval) ----
    {
        "name": "docsearch_query",
        "description": "Search the user's internal documentation index for matching pages.",
        "params": {
            "q": ("string", "Search query"),
            "limit": ("integer", ""),
        },
        "returns": lambda _received: {
            "hits": [{"title": "Onboarding", "url": "https://docs/x"}],
        },
    },
    {
        "name": "docsearch_fetch",
        "description": "Fetch the full markdown content of one document by ID.",
        "params": {
            "id": ("string", ""),
        },
        "returns": lambda _received: {"content": "# Onboarding\n..."},
    },

    # ---- Database ----
    {
        "name": "db_query",
        "description": "Run a read-only SQL query against the analytics database.",
        "params": {
            "sql": ("string", "SELECT ... statement"),
        },
        "returns": lambda _received: {"rows": [{"id": 1, "name": "alice"}]},
    },
    {
        "name": "db_describe_table",
        "description": "Show the schema of a database table.",
        "params": {
            "table": ("string", ""),
        },
        "returns": lambda _received: {
            "columns": [
                {"name": "id", "type": "int"},
                {"name": "name", "type": "text"},
            ],
        },
    },

    # ---- Linear ----
    {
        "name": "linear_create_ticket",
        "description": "Create a new Linear issue (ticket) in the connected workspace.",
        "params": {
            "title": ("string", ""),
            "body": ("string", ""),
            "priority": ("integer", "1-4"),
        },
        "returns": lambda _received: {"ok": True, "id": "ENG-101"},
    },
    {
        "name": "linear_assign",
        "description": "Reassign a Linear ticket to a different user.",
        "params": {
            "ticket_id": ("string", ""),
            "user": ("string", ""),
        },
        "returns": lambda _received: {"ok": True},
    },

    # ---- Notion ----
    {
        "name": "notion_create_page",
        "description": "Create a new page in the connected Notion workspace.",
        "params": {
            "title": ("string", ""),
            "body": ("string", ""),
            "parent": ("string", ""),
        },
        "returns": lambda _received: {"ok": True, "page_id": "abc123"},
    },

    # ---- Random others (filler / distractors) ----
    {
        "name": "weather_get",
        "description": "Look up the current weather for a city.",
        "params": {
            "city": ("string", ""),
        },
        "returns": lambda received: {
            "city": received.get("city", ""),
            "temp_c": 19,
            "summary": "Cloudy",
        },
    },
    {
        "name": "translate_text",
        "description": "Translate a short text from one language to another.",
        "params": {
            "text": ("string", ""),
            "to": ("string", "Target language code"),
        },
        # Plain concatenation is deliberate: a non-string argument raises here
        # rather than being silently formatted.
        "returns": lambda received: {
            "translated": received.get("text", "") + " [translated to " + received.get("to", "??") + "]",
        },
    },
    {
        "name": "pdf_extract",
        "description": "Extract text from a PDF file given its path.",
        "params": {
            "path": ("string", ""),
        },
        "returns": lambda _received: {"text": "[fake PDF text]"},
    },
    {
        "name": "yt_transcript",
        "description": "Fetch the transcript for a YouTube video by URL.",
        "params": {
            "url": ("string", ""),
        },
        "returns": lambda _received: {"transcript": "[fake transcript]"},
    },
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario definitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Scenarios exercised by the harness.  Each entry has an ``id`` (used in the
# output file name), a human-readable ``description``, the user ``prompt``
# sent to the agent, and ``expected_underlying_tools`` copied into the output
# record and the summary for comparison against the tools actually called.
SCENARIOS: List[Dict[str, Any]] = [
    {
        "id": "A_obvious_single",
        "description": "Single tool, obvious name in the user request",
        "prompt": (
            "Open a GitHub issue in repo 'acme/widget' "
            "titled 'Crash on startup' with body "
            "'App crashes immediately after launch when offline.' "
            "Then tell me you're done. Don't do anything else."
        ),
        "expected_underlying_tools": ["github_create_issue"],
    },
    {
        "id": "B_vague_paraphrased",
        "description": "Single tool, paraphrased intent (tests retrieval quality)",
        "prompt": (
            "Add a meeting to my schedule for tomorrow morning at 10am "
            "called 'Design review', 30 minutes long. "
            "Then tell me you're done. Don't do anything else."
        ),
        "expected_underlying_tools": ["evt_create"],
    },
    {
        "id": "C_multi_tool_chain",
        "description": "Multi-step task requiring 2-3 deferred tools",
        "prompt": (
            "Find the open pull requests on repo 'acme/widget', "
            "then post a summary of how many there are "
            "to the #engineering Slack channel. Then tell me you're done."
        ),
        "expected_underlying_tools": ["github_list_pulls", "slack_send_message"],
    },
    {
        "id": "D_core_plus_deferred",
        "description": "Task uses BOTH a core tool (read_file) and a deferred tool",
        "prompt": (
            "Read the file at /tmp/livetest/notes.txt "
            "(it exists, just read it) and then post its contents "
            "to the #random Slack channel. Tell me you're done."
        ),
        "expected_underlying_tools": ["read_file", "slack_send_message"],
        # read_file must be called directly, NOT through the tool_call bridge.
        "expected_core_tool_direct": True,
    },
    {
        "id": "E_no_tool_needed",
        "description": "Question doesn't need any tool — model should just answer",
        "prompt": "What's 7 times 8? Answer with just the number.",
        "expected_underlying_tools": [],
    },
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Harness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def setup_isolated_home(enabled: bool) -> Path:
    """Build a throwaway ``.hermes`` directory for a single test run.

    A fresh temporary directory is created and a ``.hermes`` folder is made
    inside it.  The user's real ``auth.json`` and ``~/.hermes/.env`` are
    copied in when they exist.  If the ``.env`` file defines
    ``OPENROUTER_API_KEY`` and the current process does not already have it,
    the value (surrounding whitespace and quotes stripped) is exported into
    ``os.environ``.  Finally a ``config.yaml`` is written that pins the
    provider/model and switches tool search ``on`` or ``off``.

    Args:
        enabled: Whether the generated config turns tool search on.

    Returns:
        Path of the new ``.hermes`` directory (its parent is the temp dir).
    """
    sandbox_root = Path(tempfile.mkdtemp(prefix="hermes_ts_live_"))
    hermes_home = sandbox_root / ".hermes"
    hermes_home.mkdir(parents=True)

    if ORIGINAL_AUTH.exists():
        shutil.copy(ORIGINAL_AUTH, hermes_home / "auth.json")

    # Copy .env so OPENROUTER_API_KEY (or others) are visible.
    env_source = Path.home() / ".hermes" / ".env"
    if env_source.exists():
        shutil.copy(env_source, hermes_home / ".env")
        # Mirror the key into this process too, so it is visible to the
        # provider resolver in this run.  Only the first matching line counts.
        key_name = "OPENROUTER_API_KEY"
        for raw_line in env_source.read_text().splitlines():
            entry = raw_line.strip()
            if not entry.startswith("OPENROUTER_API_KEY=") or key_name in os.environ:
                continue
            raw_value = entry.split("=", 1)[1]
            os.environ[key_name] = raw_value.strip().strip('"').strip("'")
            break

    tool_search_cfg = {
        "enabled": "on" if enabled else "off",
        "threshold_pct": 10,
        "search_default_limit": 5,
        "max_search_limit": 20,
    }
    cfg = {
        "model": {
            "provider": "openrouter",
            "model": "anthropic/claude-haiku-4.5",
        },
        "tools": {"tool_search": tool_search_cfg},
        "logging": {"level": "WARNING"},
    }
    config_path = hermes_home / "config.yaml"
    config_path.write_text(_yaml_dump(cfg))
    return hermes_home
|
||||
|
||||
|
||||
def _yaml_dump(obj: Any) -> str:
    """Serialise *obj* as YAML, or as indented JSON when PyYAML is missing.

    Key order is preserved in the YAML output (``sort_keys=False``).  The
    fallback is triggered only by ``ImportError``.
    """
    try:
        import yaml as _yaml
        rendered = _yaml.safe_dump(obj, sort_keys=False)
    except ImportError:
        rendered = json.dumps(obj, indent=2)
    return rendered
|
||||
|
||||
|
||||
def register_fake_tools() -> int:
    """Register every entry of ``FAKE_MCP_TOOLS`` in the live tool registry.

    Each tool is registered under the ``mcp-fake`` toolset with a JSON-schema
    built from its ``params`` mapping; every declared parameter is marked as
    required.  The handler returns the tool's canned payload as a JSON string,
    or a JSON ``{"error": ...}`` object if producing the payload raises.

    Returns:
        The number of tools registered.
    """
    from tools.registry import registry

    def make_handler(tool_def):
        # Factory so each handler closes over its own tool definition.
        def _handler(*args, **kwargs):
            # The harness's own dispatch wrapper forwards tool arguments as a
            # single positional dict, so the registry presumably calls the
            # handler the same way (NOTE(review): confirm against
            # tools.registry).  Use that dict when present instead of
            # discarding it; keyword-only invocation keeps working as before.
            if args and isinstance(args[0], dict):
                call_args = {**kwargs, **args[0]}
            else:
                call_args = kwargs
            try:
                return json.dumps(tool_def["returns"](call_args), ensure_ascii=False)
            except Exception as e:
                # Deliberate best-effort: surface the failure to the model as
                # a tool result rather than crashing the run.
                return json.dumps({"error": f"fake tool handler error: {e}"})
        return _handler

    count = 0
    for tdef in FAKE_MCP_TOOLS:
        properties = {
            p_name: {"type": p_type, "description": p_desc}
            for p_name, (p_type, p_desc) in tdef["params"].items()
        }
        required = list(tdef["params"])

        registry.register(
            name=tdef["name"],
            toolset="mcp-fake",
            schema={
                "name": tdef["name"],
                "description": tdef["description"],
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
            handler=make_handler(tdef),
        )
        count += 1
    return count
|
||||
|
||||
|
||||
def reset_module_state():
    """Evict cached hermes modules so the next import re-reads HERMES_HOME.

    Removes from ``sys.modules`` every entry whose name starts with one of
    the prefixes below.  Note that ``"tools."`` and ``"agent."`` match only
    submodules, not the bare ``tools`` / ``agent`` packages themselves.
    """
    stale_prefixes = ("tools.", "model_tools", "toolsets",
                      "hermes_cli", "agent.", "run_agent")
    # Snapshot the matching names first; sys.modules must not change size
    # while it is being iterated.
    doomed = [mod_name for mod_name in sys.modules if mod_name.startswith(stale_prefixes)]
    for mod_name in doomed:
        del sys.modules[mod_name]
|
||||
|
||||
|
||||
def run_one_scenario(scenario: Dict[str, Any], enabled: bool, out_dir: Path) -> Dict[str, Any]:
    """Run one (scenario, enabled) combination and return the recorded trace.

    The run happens inside a fresh isolated home (see ``setup_isolated_home``)
    with the fake tools registered.  Underlying tool invocations are captured
    by temporarily wrapping ``registry.dispatch``; bridge calls are extracted
    from the returned message transcript afterwards.  The trace is also
    written to ``out_dir`` as ``<scenario id>__<enabled|disabled>.json``.

    Exceptions raised while building or running the agent are caught and
    stored in the record's ``error`` field.  The temporary home, which may
    hold copied credentials, is removed on every exit path.

    Args:
        scenario: One entry of ``SCENARIOS`` (``id``, ``description``,
            ``prompt`` and optionally ``expected_underlying_tools``).
        enabled: Whether tool search is switched on for this run.
        out_dir: Existing directory that receives the JSON trace.

    Returns:
        The trace record that was written to disk.
    """
    reset_module_state()
    home = setup_isolated_home(enabled=enabled)
    os.environ["HERMES_HOME"] = str(home)

    try:
        # Pre-create the test file used by scenario D.
        Path("/tmp/livetest").mkdir(parents=True, exist_ok=True)
        Path("/tmp/livetest/notes.txt").write_text("Hello from the test fixture.\n")

        n_registered = register_fake_tools()

        # Capture tool calls via a hook on the registry dispatch path. We use the
        # registry hook (rather than the run_agent.handle_function_call binding,
        # which is already cached by tool_executor) because the dispatch call is
        # the one place every underlying tool call lands. Bridge calls are
        # extracted from the message transcript after the run.
        tool_call_log: List[Dict[str, Any]] = []

        from tools.registry import registry
        original_dispatch = registry.dispatch

        def logging_dispatch(name, args, **kw):
            tool_call_log.append({"name": name, "args": _trim_args(args)})
            return original_dispatch(name, args, **kw)

        registry.dispatch = logging_dispatch

        # Build agent and run.  A monotonic clock keeps the elapsed time
        # immune to wall-clock adjustments.
        started = time.perf_counter()
        error = None
        final_response = ""
        messages_out = []
        try:
            # Imported here, after reset_module_state(), so the agent is
            # loaded fresh for this run.
            from run_agent import AIAgent
            agent = AIAgent(
                provider="openrouter",
                model="anthropic/claude-haiku-4.5",
                enabled_toolsets=None,  # Default = all available toolsets, including the registered mcp-fake tools
                quiet_mode=True,
                save_trajectories=False,
                skip_context_files=True,
                skip_memory=True,
                platform="cli",
                max_iterations=15,
            )
            result = agent.run_conversation(
                user_message=scenario["prompt"],
                system_message=(
                    "You are a test agent. Complete the user's task using available "
                    "tools. Be concise; don't add commentary beyond what's needed."
                ),
            )
            if isinstance(result, dict):
                final_response = result.get("final_response") or ""
                messages_out = result.get("messages") or []
            else:
                final_response = str(result)
        except Exception as e:
            # Top-level boundary of a test run: record the failure in the
            # trace instead of aborting the remaining scenarios.
            error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
        finally:
            registry.dispatch = original_dispatch

        elapsed = time.perf_counter() - started

        # Extract bridge calls from the message transcript. Easier and more
        # accurate than monkey-patching: this is the actual wire shape the
        # model emitted.
        bridge_call_log = _extract_bridge_calls(messages_out)

        # Compose the trace.
        record = {
            "scenario_id": scenario["id"],
            "scenario_description": scenario["description"],
            "tool_search_enabled": enabled,
            "model": "anthropic/claude-haiku-4.5 (via openrouter)",
            "prompt": scenario["prompt"],
            "expected_underlying_tools": scenario.get("expected_underlying_tools", []),
            "n_fake_tools_registered": n_registered,
            "elapsed_seconds": round(elapsed, 2),
            "bridge_calls": bridge_call_log,
            "underlying_tool_calls": tool_call_log,
            "final_response": final_response,
            "n_iterations": _count_assistant_turns(messages_out),
            "error": error,
        }

        suffix = "enabled" if enabled else "disabled"
        out_path = out_dir / f"{scenario['id']}__{suffix}.json"
        out_path.write_text(json.dumps(record, indent=2, default=str))
        return record
    finally:
        # Cleanup runs on every exit path so copied credentials never linger
        # in the temp directory after a failure.
        shutil.rmtree(home.parent, ignore_errors=True)
|
||||
|
||||
|
||||
def _trim_args(args: Any, max_chars: int = 300) -> Any:
    """Return a copy of *args* with over-long string values shortened.

    Non-dict input is returned unchanged.  For a dict, each string value
    longer than ``max_chars`` is cut to its first ``max_chars`` characters
    and suffixed with a note giving the number of characters removed; all
    other values are carried over as-is.  Only top-level values are examined.
    """
    if not isinstance(args, dict):
        return args

    def _shorten(value: Any) -> Any:
        if isinstance(value, str) and len(value) > max_chars:
            return value[:max_chars] + f"...[{len(value)-max_chars} chars trimmed]"
        return value

    return {key: _shorten(value) for key, value in args.items()}
|
||||
|
||||
|
||||
def _count_assistant_turns(messages: List[Dict[str, Any]]) -> int:
    """Count the dict entries of *messages* whose ``role`` is ``"assistant"``."""
    turns = 0
    for message in messages:
        if isinstance(message, dict) and message.get("role") == "assistant":
            turns += 1
    return turns
|
||||
|
||||
|
||||
def _extract_bridge_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collect every tool_search / tool_describe / tool_call from a transcript.

    Only assistant messages are inspected.  For each of their ``tool_calls``
    whose function name is one of the three bridge names, the arguments are
    JSON-decoded when given as a string (undecodable text is kept under the
    ``_raw`` key), trimmed with ``_trim_args`` and appended as a
    ``{"name": ..., "args": ...}`` entry, in transcript order.
    """
    bridge_names = ("tool_search", "tool_describe", "tool_call")
    found: List[Dict[str, Any]] = []
    for message in messages or []:
        if not (isinstance(message, dict) and message.get("role") == "assistant"):
            continue
        for call in message.get("tool_calls") or []:
            if not isinstance(call, dict):
                continue
            function = call.get("function") or {}
            call_name = function.get("name")
            if call_name not in bridge_names:
                continue
            raw = function.get("arguments") or "{}"
            if isinstance(raw, str):
                try:
                    parsed = json.loads(raw)
                except json.JSONDecodeError:
                    parsed = {"_raw": raw}
            else:
                # Already-decoded arguments are used as they are.
                parsed = raw
            found.append({"name": call_name, "args": _trim_args(parsed)})
    return found
|
||||
|
||||
|
||||
def main():
    """Run every scenario with tool search enabled and disabled.

    Per-run traces are written by ``run_one_scenario`` into the ``out``
    directory next to this script; a one-line result is printed for each run
    and a combined ``_summary.json`` is written at the end.  Afterwards the
    HERMES_HOME environment variable is put back to its original state.
    """
    out_dir = _THIS_DIR / "out"
    out_dir.mkdir(exist_ok=True)
    print(f"Writing transcripts to: {out_dir}")

    rule = "=" * 72
    summary = []
    for scenario in SCENARIOS:
        for enabled in (True, False):
            mode = "enabled" if enabled else "disabled"
            print(f"\n{rule}\nScenario {scenario['id']} (tool_search={mode})\n{rule}")
            record = run_one_scenario(scenario, enabled, out_dir)

            bridge_count = len(record["bridge_calls"])
            underlying_count = len(record["underlying_tool_calls"])
            failure = record["error"]
            print(f" bridge calls: {bridge_count}, underlying tool calls: {underlying_count}, "
                  f"elapsed: {record['elapsed_seconds']}s, error: {bool(failure)}")
            if failure:
                # Only the head of the error text is shown on the console;
                # the full text is stored in the per-run JSON file.
                print(f" ERROR: {failure[:300]}")

            called_names = [call["name"] for call in record["underlying_tool_calls"]]
            summary.append({
                "scenario": scenario["id"],
                "enabled": enabled,
                "n_bridge": bridge_count,
                "n_underlying": underlying_count,
                "elapsed": record["elapsed_seconds"],
                "error": bool(failure),
                "underlying_tools_called": called_names,
                "expected": scenario.get("expected_underlying_tools", []),
            })

    summary_path = out_dir / "_summary.json"
    summary_path.write_text(json.dumps(summary, indent=2))
    print(f"\nSummary saved to: {summary_path}")

    # Restore original HERMES_HOME
    if ORIGINAL_HOME is None:
        os.environ.pop("HERMES_HOME", None)
    else:
        os.environ["HERMES_HOME"] = ORIGINAL_HOME
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
417
tests/tools/test_tool_search.py
Normal file
417
tests/tools/test_tool_search.py
Normal file
@@ -0,0 +1,417 @@
|
||||
"""Tests for tools/tool_search.py — progressive tool disclosure.
|
||||
|
||||
Coverage targets — these mirror the issues called out in the OpenClaw tool
|
||||
search report. Every test that names an OpenClaw issue is the regression
|
||||
guard that would have caught that specific failure mode.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import List, Dict, Any
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
if _REPO_ROOT not in sys.path:
|
||||
sys.path.insert(0, _REPO_ROOT)
|
||||
|
||||
|
||||
def _td(name: str, description: str = "", properties: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Build a minimal function-style tool definition for use in tests.

    The result has ``type`` ``"function"`` and a ``function`` entry holding
    the name, description and an object-typed ``parameters`` schema whose
    ``properties`` default to an empty dict.
    """
    parameters = {"type": "object", "properties": properties or {}}
    function = {"name": name, "description": description, "parameters": parameters}
    return {"type": "function", "function": function}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestConfigParsing:
    """Parsing, defaulting and clamping done by ``ToolSearchConfig.from_raw``."""

    def test_default_when_missing(self):
        """A missing (``None``) config yields auto mode with a 10% threshold."""
        from tools.tool_search import ToolSearchConfig

        parsed = ToolSearchConfig.from_raw(None)
        assert parsed.enabled == "auto"
        assert parsed.threshold_pct == 10.0

    def test_bool_true_maps_to_auto(self):
        """A bare ``True`` is treated as auto mode."""
        from tools.tool_search import ToolSearchConfig

        assert ToolSearchConfig.from_raw(True).enabled == "auto"

    def test_bool_false_maps_to_off(self):
        """A bare ``False`` switches the feature off."""
        from tools.tool_search import ToolSearchConfig

        assert ToolSearchConfig.from_raw(False).enabled == "off"

    def test_explicit_on(self):
        """An explicit ``"on"`` is kept as-is."""
        from tools.tool_search import ToolSearchConfig

        assert ToolSearchConfig.from_raw({"enabled": "on"}).enabled == "on"

    def test_invalid_enabled_falls_back_to_auto(self):
        """An unrecognised ``enabled`` value falls back to auto mode."""
        from tools.tool_search import ToolSearchConfig

        assert ToolSearchConfig.from_raw({"enabled": "maybe"}).enabled == "auto"

    def test_threshold_clamped(self):
        """Out-of-range thresholds are clamped to 100.0 and 0.0."""
        from tools.tool_search import ToolSearchConfig

        too_high = ToolSearchConfig.from_raw({"threshold_pct": 150})
        assert too_high.threshold_pct == 100.0

        too_low = ToolSearchConfig.from_raw({"threshold_pct": -5})
        assert too_low.threshold_pct == 0.0

    def test_search_limits_clamped(self):
        """Oversized limits clamp to 50 and the default never exceeds the max."""
        from tools.tool_search import ToolSearchConfig

        oversized = {"search_default_limit": 999, "max_search_limit": 999}
        parsed = ToolSearchConfig.from_raw(oversized)
        assert parsed.max_search_limit == 50
        assert parsed.search_default_limit <= parsed.max_search_limit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Classification — the hard invariant: core tools NEVER defer.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestClassification:
    """Which tools may be deferred — core, bridge and unknown tools never are."""

    def test_core_tools_never_defer(self):
        """The critical invariant from the OpenClaw report."""
        from tools.tool_search import is_deferrable_tool_name

        # Sample of core tools from _HERMES_CORE_TOOLS.
        core_sample = (
            "terminal", "read_file", "write_file", "patch",
            "search_files", "todo", "memory", "browser_navigate",
            "web_search", "session_search", "clarify",
            "execute_code", "delegate_task", "send_message",
        )
        for tool_name in core_sample:
            assert not is_deferrable_tool_name(tool_name), (
                f"Core tool '{tool_name}' must NEVER be deferrable"
            )

    def test_bridge_tools_never_defer(self):
        """None of the bridge tool names may be classified as deferrable."""
        from tools.tool_search import BRIDGE_TOOL_NAMES, is_deferrable_tool_name

        for bridge_name in BRIDGE_TOOL_NAMES:
            assert not is_deferrable_tool_name(bridge_name)

    def test_unknown_tool_not_deferrable(self):
        """Defensive: a tool name we cannot resolve to a registry entry must
        not be claimed as deferrable. This protects against the OpenClaw
        cron regression where unresolved tools were silently dropped."""
        from tools.tool_search import is_deferrable_tool_name

        unknown_name = "xx_definitely_not_a_tool_xx"
        assert not is_deferrable_tool_name(unknown_name)

    def test_classify_keeps_unknown_in_visible(self):
        """A tool we can't classify stays visible — never silently dropped.

        This is the OpenClaw #84141 regression guard (cron lost ``exec``
        because it wasn't in the catalog).
        """
        from tools.tool_search import classify_tools

        # Build a tool def for something we don't have a registry entry for.
        unknown_defs = [_td("xx_unknown_tool", "Unknown tool")]
        visible, deferrable = classify_tools(unknown_defs)

        visible_names = set()
        for tool_def in visible:
            visible_names.add((tool_def.get("function") or {}).get("name"))

        assert "xx_unknown_tool" in visible_names
        assert deferrable == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Token estimation + threshold gate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestThresholdGate:
    """Activation gate (``should_activate``) and schema token estimation."""

    def test_off_never_activates(self):
        """Mode ``off`` stays inactive even with a huge deferrable budget."""
        from tools.tool_search import ToolSearchConfig, should_activate

        off_cfg = ToolSearchConfig.from_raw({"enabled": "off"})
        active = should_activate(off_cfg, deferrable_tokens=1_000_000, context_length=200_000)
        assert not active

    def test_zero_deferrable_never_activates(self):
        """With nothing to defer, even mode ``on`` stays inactive."""
        from tools.tool_search import ToolSearchConfig, should_activate

        on_cfg = ToolSearchConfig.from_raw({"enabled": "on"})
        active = should_activate(on_cfg, deferrable_tokens=0, context_length=200_000)
        assert not active

    def test_on_activates_with_any_deferrable(self):
        """Mode ``on`` activates as soon as there is something to defer."""
        from tools.tool_search import ToolSearchConfig, should_activate

        on_cfg = ToolSearchConfig.from_raw({"enabled": "on"})
        active = should_activate(on_cfg, deferrable_tokens=100, context_length=200_000)
        assert active

    def test_auto_below_threshold_does_not_activate(self):
        """Auto mode stays inactive while the share is under the threshold."""
        from tools.tool_search import ToolSearchConfig, should_activate

        auto_cfg = ToolSearchConfig.from_raw({"enabled": "auto", "threshold_pct": 10})
        # 10K of 200K is 5%, which is below the 10% threshold.
        active = should_activate(auto_cfg, deferrable_tokens=10_000, context_length=200_000)
        assert not active

    def test_auto_at_or_above_threshold_activates(self):
        """Auto mode activates at exactly the threshold (10%) and above it (25%)."""
        from tools.tool_search import ToolSearchConfig, should_activate

        auto_cfg = ToolSearchConfig.from_raw({"enabled": "auto", "threshold_pct": 10})
        for token_count in (20_000, 50_000):
            assert should_activate(auto_cfg, deferrable_tokens=token_count, context_length=200_000)

    def test_auto_without_context_length_uses_20k_cutoff(self):
        """Fallback cutoff used when the active model is unknown."""
        from tools.tool_search import ToolSearchConfig, should_activate

        auto_cfg = ToolSearchConfig.from_raw({"enabled": "auto"})
        under_cutoff = should_activate(auto_cfg, deferrable_tokens=10_000, context_length=0)
        over_cutoff = should_activate(auto_cfg, deferrable_tokens=25_000, context_length=0)
        assert not under_cutoff
        assert over_cutoff

    def test_token_estimate_proportional_to_schema_size(self):
        """Ten verbose schemas must estimate to more than 10x one tiny schema."""
        from tools.tool_search import estimate_tokens_from_schemas

        tiny = [_td("a", "x")]
        verbose_props = {"q": {"type": "string", "description": "search query " * 10}}
        verbose = []
        for i in range(10):
            verbose.append(_td(f"name_{i}", f"description for tool {i} " * 20, verbose_props))

        tiny_estimate = estimate_tokens_from_schemas(tiny)
        verbose_estimate = estimate_tokens_from_schemas(verbose)
        assert verbose_estimate > tiny_estimate * 10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Retrieval (BM25 + substring fallback)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRetrieval:
    """BM25 retrieval over a hand-built catalog, plus the substring fallback."""

    def _fake_catalog(self):
        """Build a catalog directly without touching the registry."""
        from tools.tool_search import CatalogEntry, _tokenize, _entry_search_text
        defs = [
            _td("github_create_issue", "Open a new issue in a GitHub repository",
                {"title": {"type": "string"}, "body": {"type": "string"}}),
            _td("github_search_repos", "Search GitHub for matching repositories",
                {"query": {"type": "string"}}),
            _td("slack_send_message", "Post a message into a Slack channel",
                {"channel": {"type": "string"}, "text": {"type": "string"}}),
            _td("calendar_create_event", "Add an event to the user's calendar",
                {"title": {"type": "string"}, "start": {"type": "string"}}),
        ]
        catalog = []
        for d in defs:
            fn = d["function"]
            e = CatalogEntry(
                name=fn["name"], description=fn["description"],
                schema=d, source="mcp", source_name="mcp-test",
            )
            # Index the entry the same way build_catalog does.
            e._tokens = _tokenize(_entry_search_text(d))
            catalog.append(e)
        return catalog

    def test_search_finds_relevant_tool(self):
        """The best-matching tool is ranked first."""
        from tools.tool_search import search_catalog
        hits = search_catalog(self._fake_catalog(), "create a github issue", limit=3)
        names = [h.name for h in hits]
        assert names[0] == "github_create_issue"

    def test_search_returns_empty_for_irrelevant_query(self):
        """No token match and no name substring match yields no hits."""
        from tools.tool_search import search_catalog
        hits = search_catalog(self._fake_catalog(), "asdf qwerty foobar", limit=3)
        assert hits == []

    def test_search_substring_fallback(self):
        """Even when no BM25 hit, a literal substring of the tool name returns."""
        from tools.tool_search import search_catalog
        catalog = self._fake_catalog()

        # A whole-token query is answered by BM25 itself.
        whole_token_hits = search_catalog(catalog, "calendar", limit=3)
        assert any("calendar" in h.name for h in whole_token_hits)

        # "calend" is not a complete token of any entry, so BM25 scores
        # nothing and only the name-substring fallback can produce this hit.
        partial_hits = search_catalog(catalog, "calend", limit=3)
        assert [h.name for h in partial_hits] == ["calendar_create_event"]

    def test_search_respects_limit(self):
        """Never return more entries than ``limit``."""
        from tools.tool_search import search_catalog
        hits = search_catalog(self._fake_catalog(), "github", limit=1)
        assert len(hits) <= 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Assembly — the full passthrough/activate decision.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAssembly:
    """The full passthrough/activate decision made by ``assemble_tool_defs``."""

    def test_no_deferrable_returns_unchanged(self):
        """Pure-core toolset: pass-through, no bridge tools added."""
        from tools.tool_search import assemble_tool_defs, ToolSearchConfig
        defs = [_td("terminal", "Run shell"), _td("read_file", "Read a file")]
        result = assemble_tool_defs(
            defs,
            context_length=200_000,
            config=ToolSearchConfig.from_raw({"enabled": "on"}),
        )
        assert not result.activated
        assert {t["function"]["name"] for t in result.tool_defs} == {"terminal", "read_file"}

    def test_below_threshold_returns_unchanged(self):
        """Tiny deferrable surface: don't bother."""
        from tools.tool_search import assemble_tool_defs, ToolSearchConfig
        # _td renders to ~80 chars / 20 tokens. 3 of them = ~60 tokens.
        # 10% of 200K = 20K. Way below.
        # NOTE(review): these names are presumably not in the tool registry, in
        # which case they classify as visible and the result is produced before
        # the threshold gate is consulted — confirm and register fakes if the
        # gate itself is meant to be exercised here.
        defs = [_td("unknown_tool_a"), _td("unknown_tool_b"), _td("unknown_tool_c")]
        result = assemble_tool_defs(
            defs,
            context_length=200_000,
            config=ToolSearchConfig.from_raw({"enabled": "auto", "threshold_pct": 10}),
        )
        assert not result.activated
        names = {(t.get("function") or {}).get("name") for t in result.tool_defs}
        assert "tool_search" not in names

    def test_idempotent_when_bridge_already_present(self):
        """A bridge tool already in the input is stripped, real tools are kept."""
        from tools.tool_search import assemble_tool_defs, ToolSearchConfig, BRIDGE_TOOL_NAMES
        defs = [_td("terminal", "Run shell"), _td("tool_search", "old")]
        result = assemble_tool_defs(
            defs,
            context_length=200_000,
            config=ToolSearchConfig.from_raw({"enabled": "off"}),
        )
        names = [(t["function"]["name"]) for t in result.tool_defs]
        # The pre-existing tool_search was stripped (it would be re-injected if
        # activation happened; here it didn't).
        assert "tool_search" not in names
        # With search "off" no bridge tool of any kind may remain...
        assert not BRIDGE_TOOL_NAMES.intersection(names)
        # ...and stripping must not take the real tool with it.
        assert "terminal" in names
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bridge dispatch
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestBridgeDispatch:
    """Argument validation of the bridge tools and of ``resolve_underlying_call``."""

    def test_tool_search_requires_query(self):
        """An empty argument dict yields a JSON error payload."""
        from tools.tool_search import dispatch_tool_search
        result = dispatch_tool_search({}, current_tool_defs=[])
        assert "error" in json.loads(result)

    def test_tool_describe_requires_name(self):
        """An empty argument dict yields a JSON error payload."""
        from tools.tool_search import dispatch_tool_describe
        result = dispatch_tool_describe({}, current_tool_defs=[])
        assert "error" in json.loads(result)

    def test_tool_describe_rejects_non_deferrable(self):
        """If the model asks to describe a core tool, refuse — it's already
        in the visible list."""
        from tools.tool_search import dispatch_tool_describe
        result = dispatch_tool_describe(
            {"name": "terminal"}, current_tool_defs=[_td("terminal", "Run shell")],
        )
        assert "error" in json.loads(result)

    def test_resolve_underlying_call_parses_object_args(self):
        """Object-shaped ``arguments`` are accepted; the name check then fails."""
        from tools.tool_search import resolve_underlying_call
        # Only the error slot is inspected, matching the sibling tests below.
        _, _, err = resolve_underlying_call({
            "name": "unknown_xxx",
            "arguments": {"foo": "bar"},
        })
        # Will fail classification because unknown_xxx isn't deferrable.
        assert err is not None

    def test_resolve_underlying_call_parses_json_string_args(self):
        """Some models emit ``arguments`` as a JSON string instead of object."""
        from tools.tool_search import resolve_underlying_call
        # Use a name that won't classify (so we don't depend on registry),
        # but exercise the JSON parse path.
        _, _, err = resolve_underlying_call({
            "name": "fake",
            "arguments": '{"a": 1}',
        })
        # err is about classification, but the parse worked (it would have
        # failed earlier with "not valid JSON" otherwise).
        assert "not valid JSON" not in (err or "")

    def test_resolve_underlying_call_rejects_bad_json(self):
        """A malformed JSON string in ``arguments`` is reported as a JSON error."""
        from tools.tool_search import resolve_underlying_call
        _, _, err = resolve_underlying_call({
            "name": "fake",
            "arguments": "{this is not json",
        })
        assert err is not None
        assert "JSON" in err

    def test_resolve_underlying_call_rejects_recursion(self):
        """tool_call cannot invoke tool_call itself."""
        from tools.tool_search import resolve_underlying_call, TOOL_CALL_NAME
        _, _, err = resolve_underlying_call({
            "name": TOOL_CALL_NAME,
            "arguments": {},
        })
        assert err is not None
        assert "bridge tool" in err.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end via the real handle_function_call (smoke test).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHandleFunctionCallIntegration:
    """Smoke test through the real ``model_tools.handle_function_call``."""

    def test_tool_search_dispatch_through_handle_function_call(self):
        """The dispatcher recognizes the bridge tool by name."""
        import model_tools

        raw_result = model_tools.handle_function_call(
            function_name="tool_search",
            function_args={"query": "nothing matches this"},
        )
        payload = json.loads(raw_result)
        # Without a real registry the match list will be empty; what matters
        # is that the dispatch path ran to completion and produced one of the
        # two known payload shapes.
        assert any(key in payload for key in ("matches", "error"))
|
||||
|
||||
|
||||
class TestRegression_OpenClawCron84141:
    """Regression guard for the OpenClaw cron-tool-loss class of bug.

    OpenClaw #84141: ``toolsAllow: ["exec"]`` on an isolated cron turn
    resulted in the agent receiving only ``sessions_send`` — the catalog
    builder silently dropped the requested core tool.

    Our defense: core tools are NEVER deferred. This test exercises the
    assembly pipeline and asserts that the core tool survives.
    """

    def test_core_tool_survives_alongside_many_mcp_tools(self):
        """``terminal`` stays visible and no bridge is injected for it."""
        from tools.tool_search import (
            assemble_tool_defs, ToolSearchConfig, BRIDGE_TOOL_NAMES,
            classify_tools,
        )
        # Only the core tool is listed. Fake "MCP" padding would not help:
        # without registry registration classify_tools puts unknown names in
        # 'visible', so this test verifies the core-tool side only.
        defs = [_td("terminal", "Run shell commands")]
        visible, deferrable = classify_tools(defs)
        assert any(
            (td.get("function") or {}).get("name") == "terminal"
            for td in visible
        ), "Core tool 'terminal' was wrongly classified as deferrable"
        # The single input landed in 'visible', so nothing is left to defer.
        assert deferrable == []

        # Now force activation and check the resulting tool-defs list.
        result = assemble_tool_defs(
            defs,
            context_length=200_000,
            config=ToolSearchConfig.from_raw({"enabled": "on"}),
        )
        names = {(t.get("function") or {}).get("name") for t in result.tool_defs}
        # terminal must be present; bridges are only added if there are
        # deferrable tools to put behind them, and here there are none.
        assert "terminal" in names
        assert not result.activated
        assert not (names & BRIDGE_TOOL_NAMES)

    def test_unwrap_rejects_core_tool_attempt(self):
        """Even if the model tries to invoke a core tool through tool_call,
        we reject the call and tell the model to use it directly."""
        from tools.tool_search import resolve_underlying_call
        _, _, err = resolve_underlying_call({
            "name": "terminal",
            "arguments": {"command": "echo hi"},
        })
        assert err is not None
        assert "not a deferrable" in err
|
||||
|
||||
714
tools/tool_search.py
Normal file
714
tools/tool_search.py
Normal file
@@ -0,0 +1,714 @@
|
||||
"""Progressive tool disclosure ("tool search") for Hermes Agent.
|
||||
|
||||
When enabled, MCP and non-core plugin tools are replaced in the model-visible
|
||||
tools array by three bridge tools — ``tool_search``, ``tool_describe``,
|
||||
``tool_call`` — and surfaced on demand. Core Hermes tools never defer.
|
||||
|
||||
Design constraints this module is built around (see ``openclaw-tool-search-report``
|
||||
for the full rationale):
|
||||
|
||||
* Core tools defined in ``toolsets._HERMES_CORE_TOOLS`` are *never* deferred.
|
||||
Always-load means always-load. No exceptions.
|
||||
* The threshold gate runs every assembly: when deferrable tools would consume
|
||||
less than ``threshold_pct`` of the model's context window (default 10%),
|
||||
tool search is a no-op and the tools array passes through unchanged.
|
||||
* The catalog is stateless across turns and tools-array assemblies. It is
|
||||
rebuilt from the current tool-defs list every time. This is the lesson
|
||||
from OpenClaw's cron regression (openclaw/openclaw#84141): a session-keyed
|
||||
catalog that drifts out of sync with the live tool registry produces
|
||||
silent tool dropouts.
|
||||
* Bridge tools route through ``model_tools.handle_function_call`` exactly
|
||||
like a direct call, so guardrails, plugin pre/post hooks, approval flows,
|
||||
and tool-result truncation all fire identically.
|
||||
* Display and trajectory unwrap is implemented here so the user (CLI activity
|
||||
feed, gateway, saved trajectories) always sees the underlying tool, not
|
||||
the bridge.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger("tools.tool_search")


# Reserved names of the three bridge tools. No user, plugin or MCP tool may
# reuse them; per the original author, the registry's override protection
# rejects such registrations (that logic lives outside this module).
TOOL_SEARCH_NAME = "tool_search"
TOOL_DESCRIBE_NAME = "tool_describe"
TOOL_CALL_NAME = "tool_call"

BRIDGE_TOOL_NAMES = frozenset((TOOL_SEARCH_NAME, TOOL_DESCRIBE_NAME, TOOL_CALL_NAME))

# Characters-per-token ratio used to estimate token counts without a real
# tokenizer: about 4 characters per token for English + JSON. Too low an
# estimate means tool search fails to activate when it should; too high an
# estimate means it activates when it is not needed. 4.0 leans slightly
# toward underestimating, which is the safer default.
CHARS_PER_TOKEN = 4.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration plumbing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ToolSearchConfig:
    """Immutable, already-validated tool-search settings for one assembly."""

    enabled: str  # one of "auto", "on", "off"
    threshold_pct: float  # percentage in [0, 100]; consulted only in "auto" mode
    search_default_limit: int
    max_search_limit: int

    @classmethod
    def from_raw(cls, raw: Any) -> "ToolSearchConfig":
        """Normalise a raw config value into a ``ToolSearchConfig``.

        Two shapes are understood: the legacy boolean
        (``tools.tool_search: true`` / ``false``) and the dict form
        (``tools.tool_search: {enabled: auto, ...}``). Anything else,
        including ``None``, yields the defaults. Numeric fields are clamped
        to their valid ranges and unrecognised values fall back to defaults
        instead of raising, so a typo in user config does not break the agent.
        """
        if not isinstance(raw, dict):
            # Legacy shape: only a literal ``False`` switches the feature off.
            # ``True`` and every other non-dict value mean "auto".
            mode = "off" if raw is False else "auto"
            return cls(enabled=mode, threshold_pct=10.0,
                       search_default_limit=5, max_search_limit=20)

        # Map every accepted spelling onto one of the three canonical modes.
        mode_by_token = {
            "true": "on", "1": "on", "yes": "on",
            "false": "off", "0": "off", "no": "off",
            "auto": "auto", "on": "on", "off": "off",
        }
        token = str(raw.get("enabled", "auto")).strip().lower()
        mode = mode_by_token.get(token, "auto")

        pct = _safe_float(raw.get("threshold_pct"), 10.0)
        pct = max(0.0, min(100.0, pct))

        # The hard cap is resolved first because the default limit is clamped
        # against it.
        hard_cap = max(1, min(50, _safe_int(raw.get("max_search_limit"), 20)))
        requested_default = _safe_int(raw.get("search_default_limit"), 5)
        default_limit = max(1, min(hard_cap, requested_default))

        return cls(
            enabled=mode,
            threshold_pct=pct,
            search_default_limit=default_limit,
            max_search_limit=hard_cap,
        )
|
||||
|
||||
|
||||
def _safe_int(value: Any, fallback: int) -> int:
|
||||
try:
|
||||
return int(value)
|
||||
except (TypeError, ValueError):
|
||||
return fallback
|
||||
|
||||
|
||||
def _safe_float(value: Any, fallback: float) -> float:
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError):
|
||||
return fallback
|
||||
|
||||
|
||||
def load_config() -> ToolSearchConfig:
    """Read the ``tools.tool_search`` section of the user config.

    Any failure (config module missing, unreadable file, unexpected shape)
    is logged at debug level and answered with the default configuration.
    """
    try:
        from hermes_cli.config import load_config as _read_user_config

        user_cfg = _read_user_config() or {}
        tools_section = user_cfg.get("tools")
        # A missing or malformed ``tools`` section behaves like an empty one.
        raw_tool_search = (
            tools_section.get("tool_search")
            if isinstance(tools_section, dict)
            else None
        )
        return ToolSearchConfig.from_raw(raw_tool_search)
    except Exception as e:
        logger.debug("Failed to load tool-search config: %s", e)
        return ToolSearchConfig.from_raw(None)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _core_tool_names() -> frozenset[str]:
    """Return the set of tool names that must NEVER be deferred.

    Imported lazily because ``toolsets`` imports from ``tools.registry``
    and we don't want a hard cycle.

    If the core list cannot be loaded, an empty set is returned so the agent
    keeps running. That also means no name is recognised as core, so the
    failure is logged: once as a warning, then at debug level, because this
    function is called for every tool on every assembly.
    """
    try:
        from toolsets import _HERMES_CORE_TOOLS
        return frozenset(_HERMES_CORE_TOOLS)
    except Exception as exc:
        if getattr(_core_tool_names, "_warned", False):
            logger.debug("Core tool list still unavailable: %s", exc)
        else:
            _core_tool_names._warned = True
            logger.warning(
                "Could not load core tool list from toolsets (%s); "
                "core tools cannot be told apart from deferrable tools.",
                exc,
            )
        return frozenset()
|
||||
|
||||
|
||||
def is_deferrable_tool_name(name: str) -> bool:
    """Return True if a tool with this name is *eligible* for deferral.

    A tool is deferrable when all of the following hold:

    * it is not one of the reserved bridge tools,
    * it is not in ``_HERMES_CORE_TOOLS`` (core tools are never deferred,
      even when their toolset is technically plugin-provided — this protects
      against accidental shadowing),
    * it is registered in the tool registry with a string toolset name.

    MCP tools (toolset prefixed ``mcp-``) and other non-core plugin tools are
    treated the same here. A name the registry does not know, or any failure
    while consulting the registry, yields False so the tool stays visible.
    """
    if name in BRIDGE_TOOL_NAMES:
        return False
    if name in _core_tool_names():
        return False
    try:
        from tools.registry import registry
        entry = registry.get_entry(name)
    except Exception:
        # Registry unavailable: fail closed and keep the tool visible.
        return False
    if entry is None:
        return False
    # Registered, non-core and non-bridge means eligible, whether it comes
    # from MCP or a plugin. An entry without a usable toolset is not deferred.
    return isinstance(getattr(entry, "toolset", None), str)
|
||||
|
||||
|
||||
def classify_tools(tool_defs: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Partition tool definitions into ``(visible, deferrable)``.

    ``visible`` holds everything that must remain in the model-facing array:
    core tools and any tool that cannot be classified. ``deferrable`` holds
    the candidates for the search catalog. Input order is preserved in both
    lists. Bridge tools are dropped from both.
    """
    kept: List[Dict[str, Any]] = []
    candidates: List[Dict[str, Any]] = []
    for tool_def in tool_defs:
        tool_name = (tool_def.get("function") or {}).get("name", "")
        if tool_name in BRIDGE_TOOL_NAMES:
            # Bridge tools are injected after classification, so this is
            # purely defensive.
            continue
        bucket = candidates if is_deferrable_tool_name(tool_name) else kept
        bucket.append(tool_def)
    return kept, candidates
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Token estimation and threshold gate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def estimate_tokens_from_schemas(tool_defs: Iterable[Dict[str, Any]]) -> int:
    """Approximate the token cost of a list of tool definitions.

    Each definition is rendered as compact JSON and the total character
    count is divided by ``CHARS_PER_TOKEN``, rounding up. The figure only
    gates the activate/skip decision (around 20K tokens for a 200K context
    with a 10% threshold), so order-of-magnitude precision is enough.
    """

    def _rendered_length(tool_def: Dict[str, Any]) -> int:
        # Definitions that cannot be JSON-encoded are measured via ``str``.
        try:
            return len(json.dumps(tool_def, ensure_ascii=False, separators=(",", ":")))
        except (TypeError, ValueError):
            return len(str(tool_def))

    char_count = sum(_rendered_length(tool_def) for tool_def in tool_defs)
    return int(math.ceil(char_count / CHARS_PER_TOKEN))
|
||||
|
||||
|
||||
def should_activate(
    config: ToolSearchConfig,
    deferrable_tokens: int,
    context_length: Optional[int],
) -> bool:
    """Tell whether tool search should be switched on for this assembly.

    * ``"off"``: never.
    * ``"on"``: always, provided there is something to defer
      (``deferrable_tokens`` greater than zero).
    * ``"auto"``: when the deferrable schemas cost at least
      ``threshold_pct`` percent of ``context_length``; if the context length
      is unknown (``None``, zero or negative) a fixed 20,000-token cutoff
      applies instead.
    """
    mode = config.enabled
    if mode == "off" or deferrable_tokens <= 0:
        return False
    if mode == "on":
        return True

    # "auto": compare against a cutoff derived from the context window.
    if not context_length or context_length <= 0:
        # Without a known context size, fall back to a fixed 20K-token cutoff
        # — the cliff above which Anthropic and OpenAI both saw quality drops.
        cutoff = 20_000
    else:
        cutoff = int(context_length * (config.threshold_pct / 100.0))
    return deferrable_tokens >= cutoff
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Catalog + BM25 retrieval
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class CatalogEntry:
    """A single deferrable tool as stored in the searchable catalog."""

    name: str
    description: str
    # Complete tool definition: {"type": "function", "function": {...}}.
    schema: Dict[str, Any]
    # Origin kind: "mcp", "plugin" or "other".
    source: str
    # Toolset the tool belongs to, e.g. "mcp-github" or "kanban".
    source_name: str

    # Lower-cased search tokens, computed once and reused by BM25 scoring.
    _tokens: List[str] = field(default_factory=list)
|
||||
|
||||
|
||||
_TOKEN_RE = re.compile(r"[A-Za-z0-9]+")
|
||||
|
||||
|
||||
def _tokenize(text: str) -> List[str]:
|
||||
if not text:
|
||||
return []
|
||||
return [t.lower() for t in _TOKEN_RE.findall(text)]
|
||||
|
||||
|
||||
def _entry_search_text(td: Dict[str, Any]) -> str:
|
||||
"""Build the search-text blob for a deferrable tool.
|
||||
|
||||
Includes the tool name (with underscores broken into words so BM25 can
|
||||
match against query terms), the description, and the names of the
|
||||
top-level parameters. Schema bodies are deliberately excluded —
|
||||
indexing them adds noise without improving recall in our measurement.
|
||||
"""
|
||||
fn = td.get("function") or {}
|
||||
name = fn.get("name", "")
|
||||
desc = fn.get("description", "") or ""
|
||||
params = ((fn.get("parameters") or {}).get("properties") or {})
|
||||
param_names = " ".join(params.keys())
|
||||
# Break snake_case and dotted names into words for BM25.
|
||||
name_words = name.replace("_", " ").replace(".", " ").replace("-", " ").replace(":", " ")
|
||||
return f"{name_words} {desc} {param_names}"
|
||||
|
||||
|
||||
def _classify_source(name: str) -> Tuple[str, str]:
|
||||
"""Return (source_kind, source_name) for a registered tool name."""
|
||||
try:
|
||||
from tools.registry import registry
|
||||
entry = registry.get_entry(name)
|
||||
if entry is None:
|
||||
return ("other", "")
|
||||
if entry.toolset.startswith("mcp-"):
|
||||
return ("mcp", entry.toolset)
|
||||
return ("plugin", entry.toolset)
|
||||
except Exception:
|
||||
return ("other", "")
|
||||
|
||||
|
||||
def build_catalog(tool_defs: List[Dict[str, Any]]) -> List[CatalogEntry]:
    """Turn tool definitions into searchable ``CatalogEntry`` objects.

    The caller should pass only the deferrable subset (the second element
    returned by ``classify_tools``). Definitions without a name are skipped;
    input order is preserved.
    """
    entries: List[CatalogEntry] = []
    for tool_def in tool_defs:
        spec = tool_def.get("function") or {}
        tool_name = spec.get("name", "")
        if not tool_name:
            continue
        kind, origin = _classify_source(tool_name)
        entries.append(
            CatalogEntry(
                name=tool_name,
                description=spec.get("description", "") or "",
                schema=tool_def,
                source=kind,
                source_name=origin,
                # Tokenised once here so each search only has to score.
                _tokens=_tokenize(_entry_search_text(tool_def)),
            )
        )
    return entries
|
||||
|
||||
|
||||
def _bm25_score(query_tokens: List[str], doc_tokens: List[str],
|
||||
doc_lengths: List[int], avg_dl: float,
|
||||
doc_freq: Dict[str, int], n_docs: int,
|
||||
k1: float = 1.5, b: float = 0.75) -> float:
|
||||
"""Standard BM25 score for one query against one document.
|
||||
|
||||
Inlined small implementation rather than adding a dependency. Performance
|
||||
is fine — the catalog is bounded by N (tools) typically < 500, and we
|
||||
score against the in-memory tokens list.
|
||||
"""
|
||||
if not doc_tokens:
|
||||
return 0.0
|
||||
score = 0.0
|
||||
dl = len(doc_tokens)
|
||||
# Pre-count tokens in the doc.
|
||||
doc_tf: Dict[str, int] = {}
|
||||
for t in doc_tokens:
|
||||
doc_tf[t] = doc_tf.get(t, 0) + 1
|
||||
for q in query_tokens:
|
||||
df = doc_freq.get(q, 0)
|
||||
if df == 0:
|
||||
continue
|
||||
idf = math.log(1 + (n_docs - df + 0.5) / (df + 0.5))
|
||||
tf = doc_tf.get(q, 0)
|
||||
if tf == 0:
|
||||
continue
|
||||
norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1.0)))
|
||||
score += idf * norm
|
||||
return score
|
||||
|
||||
|
||||
def search_catalog(catalog: List[CatalogEntry], query: str, limit: int = 5) -> List[CatalogEntry]:
    """Rank catalog entries against ``query`` and return the best ``limit``.

    Entries are scored with BM25 over their pre-computed tokens and only
    positive scores are kept. If no entry scores at all — for example when
    the query is a fragment such as ``"calend"`` that is not a whole token
    of any entry — the lower-cased query is matched as a literal substring
    of each tool name instead. Ties keep catalog order. An empty catalog, a
    non-positive ``limit`` or a query without tokens yields an empty list.
    """
    if not catalog or limit <= 0:
        return []
    terms = _tokenize(query)
    if not terms:
        return []

    # Corpus statistics needed by BM25.
    lengths = [len(item._tokens) for item in catalog]
    mean_length = sum(lengths) / max(len(lengths), 1)
    containing: Dict[str, int] = {}
    for item in catalog:
        for token in set(item._tokens):
            containing[token] = containing.get(token, 0) + 1
    total_docs = len(catalog)

    ranked: List[Tuple[float, CatalogEntry]] = []
    for item in catalog:
        score = _bm25_score(terms, item._tokens, lengths, mean_length,
                            containing, total_docs)
        if score > 0:
            ranked.append((score, item))

    if not ranked:
        # Substring fallback against the original tool name; every fallback
        # hit gets the same nominal score.
        needle = query.lower()
        ranked = [(0.1, item) for item in catalog if needle in item.name.lower()]

    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [item for _, item in ranked[:limit]]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bridge tool schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def bridge_tool_schemas(deferred_count: int) -> List[Dict[str, Any]]:
    """Return the three bridge tool definitions that stand in for deferred tools.

    The result lists ``tool_search``, ``tool_describe`` and ``tool_call`` in
    that order. ``deferred_count`` is quoted in the search tool's description.
    The schemas are kept short on purpose, since they are sent on every turn,
    and the descriptions spell out the intended search → describe → call
    sequence.
    """

    def _function_tool(name: str, description: str,
                       properties: Dict[str, Any], required: List[str]) -> Dict[str, Any]:
        # Wrap one tool in the standard function-tool envelope.
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
        }

    search_description = (
        f"Search {deferred_count} additional tools that are loaded on demand. "
        "Returns up to ``limit`` matches with name and description. Follow "
        f"with `{TOOL_DESCRIBE_NAME}` to load a tool's full parameter schema, "
        f"then `{TOOL_CALL_NAME}` to invoke it. Tools listed at the top of this "
        "system prompt are already available and do not need to be searched."
    )
    describe_description = (
        f"Load the full JSON schema for one tool returned by `{TOOL_SEARCH_NAME}`. "
        f"Required before `{TOOL_CALL_NAME}` if the tool's parameters are unknown."
    )
    call_description = (
        "Invoke a deferred tool by name with the given arguments. Argument shape "
        f"matches the tool's schema (see `{TOOL_DESCRIBE_NAME}`). Policy, hooks, "
        "and approvals run exactly as for any directly-listed tool."
    )

    search_tool = _function_tool(
        TOOL_SEARCH_NAME,
        search_description,
        {
            "query": {
                "type": "string",
                "description": "Keywords describing the capability you need (e.g. 'create github issue').",
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of results to return. Default 5.",
            },
        },
        ["query"],
    )
    describe_tool = _function_tool(
        TOOL_DESCRIBE_NAME,
        describe_description,
        {
            "name": {
                "type": "string",
                "description": "Exact tool name (as returned by tool_search).",
            },
        },
        ["name"],
    )
    call_tool = _function_tool(
        TOOL_CALL_NAME,
        call_description,
        {
            "name": {
                "type": "string",
                "description": "Exact tool name to invoke.",
            },
            "arguments": {
                "type": "object",
                "description": "Arguments for the tool, matching its schema.",
            },
        },
        ["name", "arguments"],
    )
    return [search_tool, describe_tool, call_tool]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public entry point: assemble tool-defs with optional tool search
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class AssemblyResult:
    """What one call to ``assemble_tool_defs`` produced (for tests and observability)."""

    # The tool definitions the model should be given.
    tool_defs: List[Dict[str, Any]]
    # True when deferrable tools were replaced by the bridge tools.
    activated: bool
    # Number of tools classified as deferrable.
    deferred_count: int = 0
    # Estimated token cost of those deferrable tools.
    deferred_tokens: int = 0
    # Token threshold reported alongside the decision.
    threshold_tokens: int = 0
|
||||
|
||||
|
||||
def assemble_tool_defs(
    tool_defs: List[Dict[str, Any]],
    *,
    context_length: Optional[int] = None,
    config: Optional[ToolSearchConfig] = None,
) -> AssemblyResult:
    """Return the tool-defs list the model should actually see.

    When tool search is inactive (off, no deferrable tools, or below
    threshold), the input is passed through with any bridge tools removed.
    When active, deferrable (MCP and plugin) tools are stripped from the
    visible list and replaced with the three bridge tools. Core tools are
    *never* deferred regardless of config.

    Bridge tools already present in the input are removed first, so calling
    this on its own output does not duplicate them.

    Args:
        tool_defs: Full list of tool definitions for the current turn.
        context_length: Context window of the active model in tokens, or
            ``None``/non-positive when unknown.
        config: Resolved configuration; loaded via ``load_config`` if omitted.

    Returns:
        An ``AssemblyResult`` holding the final list, whether tool search
        activated, and the figures behind the decision.
    """
    if config is None:
        config = load_config()

    # Defensive: strip any bridge tools that may already be in the list
    # (e.g. someone called assemble twice).
    incoming = [td for td in tool_defs
                if (td.get("function") or {}).get("name") not in BRIDGE_TOOL_NAMES]

    visible, deferrable = classify_tools(incoming)
    if not deferrable:
        return AssemblyResult(tool_defs=incoming, activated=False)

    deferrable_tokens = estimate_tokens_from_schemas(deferrable)

    # Threshold reported in the result and the log. It mirrors the "auto"
    # gate in should_activate, including the fixed 20,000-token cutoff used
    # when the context size is unknown, so the reported figure is never a
    # misleading 0 or a negative number.
    if context_length and context_length > 0:
        threshold_tokens = int(context_length * (config.threshold_pct / 100.0))
    else:
        threshold_tokens = 20_000

    if not should_activate(config, deferrable_tokens, context_length):
        return AssemblyResult(
            tool_defs=incoming,
            activated=False,
            deferred_count=len(deferrable),
            deferred_tokens=deferrable_tokens,
            threshold_tokens=threshold_tokens,
        )

    result = visible + bridge_tool_schemas(len(deferrable))

    logger.info(
        "tool_search activated: %d core/visible tools kept, %d deferred (~%d tokens, threshold ~%d)",
        len(visible), len(deferrable), deferrable_tokens, threshold_tokens,
    )

    return AssemblyResult(
        tool_defs=result,
        activated=True,
        deferred_count=len(deferrable),
        deferred_tokens=deferrable_tokens,
        threshold_tokens=threshold_tokens,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bridge tool dispatch
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def is_bridge_tool(name: str) -> bool:
    """Report whether ``name`` is a member of ``BRIDGE_TOOL_NAMES``."""
    bridge_names = BRIDGE_TOOL_NAMES
    return name in bridge_names
|
||||
|
||||
|
||||
def _format_search_hit(entry: CatalogEntry) -> Dict[str, Any]:
    """Convert one catalog entry into the dict emitted for a search match.

    The result carries the entry's ``name``, ``source`` and ``source_name``
    unchanged, plus a ``description`` truncated to 400 characters (a missing
    or empty description becomes ``""``).
    """
    hit: Dict[str, Any] = {
        field: getattr(entry, field)
        for field in ("name", "source", "source_name")
    }
    # Cap description so a chatty MCP server doesn't blow up the result.
    full_text = entry.description or ""
    hit["description"] = full_text[:400]
    return hit
|
||||
|
||||
|
||||
def dispatch_tool_search(args: Dict[str, Any],
                         *,
                         current_tool_defs: List[Dict[str, Any]],
                         config: Optional[ToolSearchConfig] = None) -> str:
    """Run the ``tool_search`` bridge tool and return its result as JSON text.

    Args:
        args: Arguments supplied by the model. ``query`` is required;
            ``limit`` is optional.
        current_tool_defs: Tool definitions to search. Only the part that
            ``classify_tools`` reports as deferrable goes into the catalog.
        config: Tool-search settings; ``load_config()`` is used when omitted.

    Returns:
        A JSON string. On success it holds ``query``, ``total_available``
        (catalog size) and ``matches``; when the query is blank it holds a
        single ``error`` key instead.
    """
    settings = load_config() if config is None else config

    query = str(args.get("query") or "").strip()
    if query == "":
        return json.dumps({"error": "query is required"}, ensure_ascii=False)

    requested = args.get("limit")
    if requested is not None:
        # Clamp an explicit limit into [1, max_search_limit]; _safe_int is
        # handed the default limit as its fallback value.
        ceiling = settings.max_search_limit
        parsed = _safe_int(requested, settings.search_default_limit)
        limit = max(1, min(ceiling, parsed))
    else:
        limit = settings.search_default_limit

    _visible, deferred_defs = classify_tools(current_tool_defs)
    catalog = build_catalog(deferred_defs)
    hits = search_catalog(catalog, query, limit=limit)

    payload = {
        "query": query,
        "total_available": len(catalog),
        "matches": list(map(_format_search_hit, hits)),
    }
    return json.dumps(payload, ensure_ascii=False)
|
||||
|
||||
|
||||
def dispatch_tool_describe(args: Dict[str, Any],
                           *,
                           current_tool_defs: List[Dict[str, Any]]) -> str:
    """Run the ``tool_describe`` bridge tool and return its result as JSON text.

    Args:
        args: Arguments supplied by the model; ``name`` is required.
        current_tool_defs: Tool definitions to look the name up in. Only the
            part that ``classify_tools`` reports as deferrable is searched.

    Returns:
        A JSON string with ``name``, ``description`` and ``parameters`` for
        the first deferrable definition whose function name matches, or a
        JSON object with a single ``error`` key when the name is blank, is
        rejected by ``is_deferrable_tool_name``, or is not found.
    """
    def _as_json(payload: Dict[str, Any]) -> str:
        return json.dumps(payload, ensure_ascii=False)

    name = str(args.get("name") or "").strip()
    if name == "":
        return _as_json({"error": "name is required"})

    if not is_deferrable_tool_name(name):
        message = (
            f"'{name}' is not a deferrable tool. If you see it in the tools list "
            "already, call it directly; otherwise check the spelling against tool_search."
        )
        return _as_json({"error": message})

    _visible, deferred_defs = classify_tools(current_tool_defs)
    # Lazily walk the function payloads and stop at the first name match.
    function_specs = ((td.get("function") or {}) for td in deferred_defs)
    found = next((spec for spec in function_specs if spec.get("name") == name), None)

    if found is None:
        return _as_json({
            "error": f"'{name}' is not currently available. Re-run tool_search to refresh.",
        })

    return _as_json({
        "name": name,
        "description": found.get("description", ""),
        "parameters": found.get("parameters", {}),
    })
|
||||
|
||||
|
||||
def resolve_underlying_call(args: Dict[str, Any]) -> Tuple[Optional[str], Dict[str, Any], Optional[str]]:
    """Unpack a ``tool_call`` bridge invocation into ``(name, args, error)``.

    Used by:
      * the dispatcher in ``model_tools.handle_function_call``,
      * the display layer (so the activity feed shows the underlying tool),
      * the trajectory recorder.

    Args:
        args: The bridge call's own arguments. ``name`` is the underlying
            tool; ``arguments`` may be absent, a dict, or a JSON-encoded
            string that decodes to a dict.

    Returns:
        ``(underlying_name, underlying_args, None)`` on success, or
        ``(None, {}, error_message)`` when the name is missing, names a
        bridge tool, is rejected by ``is_deferrable_tool_name``, or the
        arguments are not a JSON object.
    """
    def _failure(message: str) -> Tuple[None, Dict[str, Any], str]:
        return None, {}, message

    name = str(args.get("name") or "").strip()
    if name == "":
        return _failure("tool_call requires a 'name' argument")
    if name in BRIDGE_TOOL_NAMES:
        return _failure(f"tool_call cannot invoke '{name}' (it is itself a bridge tool)")

    call_args = args.get("arguments")
    if call_args is None:
        call_args = {}
    elif isinstance(call_args, str):
        # The model may hand over the arguments as a JSON-encoded string.
        try:
            call_args = json.loads(call_args)
        except json.JSONDecodeError as e:
            return _failure(f"tool_call 'arguments' is not valid JSON: {e}")

    if not isinstance(call_args, dict):
        return _failure("tool_call 'arguments' must be an object")

    # The deferrable check runs last, after the arguments have been validated.
    if is_deferrable_tool_name(name):
        return name, call_args, None
    return _failure(
        f"'{name}' is not a deferrable tool. If it appears in the model-facing tools "
        "list already, call it directly instead of via tool_call."
    )
|
||||
|
||||
|
||||
# Public names exported by this module (order kept as originally declared).
__all__ = [
    # Bridge tool name constants
    "TOOL_SEARCH_NAME", "TOOL_DESCRIBE_NAME", "TOOL_CALL_NAME", "BRIDGE_TOOL_NAMES",
    # Data types
    "ToolSearchConfig", "CatalogEntry", "AssemblyResult",
    # Configuration, classification and activation
    "load_config", "is_deferrable_tool_name", "classify_tools",
    "estimate_tokens_from_schemas", "should_activate",
    # Catalog, schemas and assembly
    "build_catalog", "search_catalog", "bridge_tool_schemas", "assemble_tool_defs",
    # Bridge tool dispatch
    "is_bridge_tool", "dispatch_tool_search", "dispatch_tool_describe",
    "resolve_underlying_call",
]
|
||||
152
website/docs/user-guide/features/tool-search.md
Normal file
152
website/docs/user-guide/features/tool-search.md
Normal file
@@ -0,0 +1,152 @@
|
||||
---
|
||||
title: Tool Search
|
||||
sidebar_position: 95
|
||||
---
|
||||
|
||||
# Tool Search
|
||||
|
||||
When you have many MCP servers or non-core plugin tools attached to a
|
||||
session, their JSON schemas can consume a substantial fraction of the
|
||||
context window on every turn — even when only a few of them are relevant
|
||||
to what the user actually asked for.
|
||||
|
||||
**Tool Search** is Hermes' opt-in progressive-disclosure layer for that
|
||||
problem. When activated, MCP and plugin tools are replaced in the
|
||||
model-visible tools array by three bridge tools, and the model loads each
|
||||
specific tool's schema on demand.
|
||||
|
||||
:::info Built-in Hermes tools never defer
|
||||
The tools that make up Hermes' core capability set (`terminal`,
|
||||
`read_file`, `write_file`, `patch`, `search_files`, `todo`, `memory`,
|
||||
`browser_*`, `web_search`, `web_extract`, `clarify`, `execute_code`,
|
||||
`delegate_task`, `session_search`, `send_message`, and the rest of
|
||||
`_HERMES_CORE_TOOLS`) are *always* loaded directly. Only MCP tools and
|
||||
non-core plugin tools are eligible for deferral.
|
||||
:::
|
||||
|
||||
## How it works
|
||||
|
||||
When Tool Search activates for a turn, the model sees three new tools in
|
||||
place of the deferred ones:
|
||||
|
||||
```
|
||||
tool_search(query, limit?) — search the deferred-tool catalog
|
||||
tool_describe(name) — load the full schema for one tool
|
||||
tool_call(name, arguments) — invoke a deferred tool
|
||||
```
|
||||
|
||||
A typical interaction looks like:
|
||||
|
||||
```
|
||||
Model: tool_search("create a github issue")
|
||||
→ { matches: [{ name: "mcp_github_create_issue", ... }, ...] }
|
||||
Model: tool_describe("mcp_github_create_issue")
|
||||
→ { parameters: { type: "object", properties: { ... } } }
|
||||
Model: tool_call("mcp_github_create_issue", { title: "...", body: "..." })
|
||||
→ { ok: true, issue_number: 42 }
|
||||
```
|
||||
|
||||
When the model invokes `tool_call`, Hermes **unwraps the bridge** and
|
||||
dispatches the underlying tool exactly as if the model had called it
|
||||
directly. Pre-tool-call hooks, guardrails, approval prompts, and
|
||||
post-tool-call hooks all run against the real tool name — not against
|
||||
`tool_call`. The activity feed in the CLI and gateway also unwraps so you
|
||||
see the underlying tool, not the bridge.
|
||||
|
||||
## When does it activate?
|
||||
|
||||
By default Tool Search runs in `auto` mode: it activates only when the
|
||||
deferrable tool schemas would consume at least 10% of the active model's
|
||||
context window. Below that, the tools-array assembly is a pure
|
||||
pass-through and you pay no overhead.
|
||||
|
||||
This decision is re-evaluated every time the tools array is built, so:
|
||||
|
||||
- A session with just a few MCP tools and a long context model never
|
||||
activates Tool Search.
|
||||
- A session with many MCP servers attached (15+ tools typically) starts
|
||||
activating it.
|
||||
- Removing MCP servers mid-session correctly returns to direct exposure
|
||||
on the next assembly.
|
||||
|
||||
## Configuration
|
||||
|
||||
```yaml
|
||||
tools:
|
||||
tool_search:
|
||||
enabled: auto # auto (default), on, or off
|
||||
threshold_pct: 10 # percentage of context — only used in auto mode
|
||||
search_default_limit: 5
|
||||
max_search_limit: 20
|
||||
```
|
||||
|
||||
| Key | Default | Meaning |
|
||||
| --- | --- | --- |
|
||||
| `enabled` | `auto` | `auto` activates above threshold; `on` always activates if there's at least one deferrable tool; `off` disables entirely. |
|
||||
| `threshold_pct` | `10` | Percentage of context length at which `auto` mode kicks in. Range 0–100. |
|
||||
| `search_default_limit` | `5` | Hits returned when the model calls `tool_search` without a `limit`. |
|
||||
| `max_search_limit` | `20` | Hard upper bound the model can request via `limit`. Range 1–50. |
|
||||
|
||||
You can also flip the legacy boolean shape:
|
||||
|
||||
```yaml
|
||||
tools:
|
||||
tool_search: true # equivalent to {enabled: auto}
|
||||
```
|
||||
|
||||
## When NOT to use it
|
||||
|
||||
Tool Search trades a fixed per-turn token cost (the three bridge tool
|
||||
schemas, ~300 tokens) and at least one extra round trip (search →
|
||||
describe → call) for the savings on the deferred schemas. It's a clear
|
||||
win when you have many tools and use few per turn; it's overhead when
|
||||
you have few tools total.
|
||||
|
||||
The `auto` default handles this for you. If you set `enabled: on`
|
||||
unconditionally, expect a slight per-turn cost on small toolsets.
|
||||
|
||||
## Trade-offs that don't go away
|
||||
|
||||
These come from the prompt-cache integrity invariant — they are inherent
|
||||
to any progressive-disclosure design, not specific to this implementation:
|
||||
|
||||
- **One extra round trip on cold tools.** The first time the model needs
|
||||
a deferred tool, it spends one or two extra model calls to find and
|
||||
load the schema. The token savings on the static side are real, but a
|
||||
portion is paid back at runtime.
|
||||
- **No cache benefit on deferred schemas.** A loaded `tool_describe`
|
||||
result enters the conversation history (so it does get cached on
|
||||
subsequent turns) but it never benefits from the system-prompt cache
|
||||
prefix.
|
||||
- **Model-quality dependence.** Tool Search assumes the model can write a
|
||||
reasonable search query for the tool it wants. Smaller models do this
|
||||
less well; the published Anthropic numbers (49% → 74% on Opus 4
|
||||
without vs. with tool search) show the upside but also that ~26 points of
|
||||
accuracy is still retrieval failure.
|
||||
- **Toolset edits invalidate cache.** Adding or removing a tool mid-
|
||||
session changes the bridge tools' descriptions (which include the
|
||||
count of deferred tools) and the catalog, so the prompt cache is
|
||||
invalidated. This is the same trade-off as any toolset edit.
|
||||
|
||||
## Implementation details
|
||||
|
||||
- **Retrieval:** BM25 over tokenized tool name + description + parameter
|
||||
names. Falls back to a literal substring match on the tool name when
|
||||
BM25 returns no positive-score hits, which protects against
|
||||
zero-IDF degenerate cases (e.g. searching `"github"` against a
|
||||
catalog where every tool name contains "github").
|
||||
- **Catalog is stateless across turns.** It rebuilds from the current
|
||||
tool-defs list every assembly — no session-keyed `Map`. This avoids
|
||||
the class of bug where a stored catalog drifts out of sync with the
|
||||
live tool registry.
|
||||
- **No JS sandbox.** Hermes uses the simpler "structured tools" mode
|
||||
(search / describe / call as plain functions). The JS-sandbox "code
|
||||
mode" some other implementations offer is a large surface area; we
|
||||
skip it.
|
||||
|
||||
## See also
|
||||
|
||||
- `tools/tool_search.py` — the implementation
|
||||
- `tests/tools/test_tool_search.py` — the regression suite
|
||||
- The `openclaw-tool-search-report` PDF in the original implementation
|
||||
PR for the research that shaped the design
|
||||
Reference in New Issue
Block a user