mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 20:29:00 +08:00
Adds a real-model live test for the tool_search feature. Spins up a real
AIAgent against Claude Haiku 4.5 via OpenRouter, registers 20 fake MCP
tools with realistic shapes, runs 5 scenarios twice each (tool_search ON
and OFF), and records the full transcript per run.
Captures both the bridge call sequence the model emitted (tool_search /
tool_describe / tool_call) and the underlying tool calls that actually
executed through the registry. Records iteration count, elapsed time,
and final response for an A/B comparison.
Scenarios cover:
A. Obvious single tool — direct keyword match
B. Vague paraphrased intent — stress retrieval quality
C. Multi-step chain — two deferred tools in sequence
D. Mixed core + deferred — verify core tools (read_file) get called
directly, not through tool_call
E. No tool needed — verify no spurious tool_search invocations
Baseline run included in scripts/out/ for reference. All 10 runs
(5 scenarios x 2 modes) pass — every expected underlying tool was
invoked, no core tool was incorrectly routed through tool_call, no
tool name was hallucinated.
Round-trip cost observed: enabling tool_search added 3 to 4 model round
trips per task compared with running it disabled. Single-tool tasks
completed in ~16-20s vs ~10-11s direct; multi-tool tasks took ~20s vs
~14s. The bridge overhead is real and measurable, but the task
completion rate is identical.
64 lines
1.7 KiB
JSON
64 lines
1.7 KiB
JSON
{
  "scenario_id": "C_multi_tool_chain",
  "scenario_description": "Multi-step task requiring 2-3 deferred tools",
  "tool_search_enabled": true,
  "model": "anthropic/claude-haiku-4.5 (via openrouter)",
  "prompt": "Find the open pull requests on repo 'acme/widget', then post a summary of how many there are to the #engineering Slack channel. Then tell me you're done.",
  "expected_underlying_tools": [
    "github_list_pulls",
    "slack_send_message"
  ],
  "n_fake_tools_registered": 20,
  "elapsed_seconds": 20.3,
  "bridge_calls": [
    {
      "name": "tool_search",
      "args": {
        "query": "GitHub pull requests"
      }
    },
    {
      "name": "tool_search",
      "args": {
        "query": "Slack post message channel"
      }
    },
    {
      "name": "tool_call",
      "args": {
        "name": "github_list_pulls",
        "arguments": {
          "repo": "acme/widget"
        }
      }
    },
    {
      "name": "tool_call",
      "args": {
        "arguments": {
          "channel": "#engineering",
          "message": "Open PR Summary for acme/widget: 1 open pull request\n\n\u2022 PR #31163: feat(tools): tool search"
        },
        "name": "slack_send_message"
      }
    }
  ],
  "underlying_tool_calls": [
    {
      "name": "github_list_pulls",
      "args": {
        "repo": "acme/widget"
      }
    },
    {
      "name": "slack_send_message",
      "args": {
        "channel": "#engineering",
        "message": "Open PR Summary for acme/widget: 1 open pull request\n\n\u2022 PR #31163: feat(tools): tool search"
      }
    }
  ],
  "final_response": "Done. Found 1 open pull request in acme/widget (PR #31163: feat(tools): tool search) and posted a summary to #engineering.",
  "n_iterations": 4,
  "error": null
}