Files
hermes-agent/tests/test_transform_tool_result_hook.py

194 lines
6.0 KiB
Python
Raw Normal View History

"""Tests for the ``transform_tool_result`` plugin hook wired into
``model_tools.handle_function_call``.
Mirrors the ``transform_terminal_output`` hook tests from Phase 1 but
targets the generic tool-result seam that runs for every tool dispatch.
"""
import json
import os
from pathlib import Path
from unittest.mock import MagicMock
import hermes_cli.plugins as plugins_mod
import model_tools
# Sentinel meaning "no ``invoke_hook`` override was passed", so the helper can
# tell an omitted argument apart from any explicitly supplied value.
_UNSET = object()
def _run_handle_function_call(
    monkeypatch,
    *,
    tool_name="dummy_tool",
    tool_args=None,
    dispatch_result='{"output": "original"}',
    invoke_hook=_UNSET,
):
    """Call ``model_tools.handle_function_call`` against a stubbed registry.

    ``registry.dispatch`` is replaced so that every dispatch returns
    ``dispatch_result``.  When ``invoke_hook`` is supplied it replaces
    ``hermes_cli.plugins.invoke_hook``; otherwise that attribute is left
    alone.  Returns whatever ``handle_function_call`` returns.
    """
    from tools.registry import registry

    def _stub_dispatch(name, args, **kw):
        return dispatch_result

    monkeypatch.setattr(registry, "dispatch", _stub_dispatch)
    # Skip unrelated side effects (read-loop tracker).
    monkeypatch.setattr(model_tools, "_READ_SEARCH_TOOLS", frozenset())

    hook_overridden = invoke_hook is not _UNSET
    if hook_overridden:
        # Patch the symbol actually imported inside handle_function_call.
        monkeypatch.setattr("hermes_cli.plugins.invoke_hook", invoke_hook)

    call_args = tool_args if tool_args else {}
    return model_tools.handle_function_call(
        tool_name,
        call_args,
        task_id="t1",
        session_id="s1",
        tool_call_id="tc1",
        skip_pre_tool_call_hook=True,
    )
def test_result_unchanged_when_no_hook_registered(monkeypatch):
    """With a fresh plugin manager, the dispatched result comes back verbatim."""
    # Real invoke_hook with no plugins loaded returns [].
    monkeypatch.setenv("HERMES_HOME", "/tmp/hermes_no_plugins")
    # Force a fresh plugin manager so no stale plugins pollute state.  Going
    # through monkeypatch (instead of assigning the module global directly)
    # restores the previous manager at teardown, so this test's manager does
    # not leak into tests that run afterwards.
    monkeypatch.setattr(
        plugins_mod, "_plugin_manager", plugins_mod.PluginManager()
    )
    out = _run_handle_function_call(monkeypatch)
    assert out == '{"output": "original"}'
def test_result_unchanged_for_none_hook_return(monkeypatch):
    """A hook list containing only ``None`` leaves the result as dispatched."""

    def _hook_returning_none(hook_name, **kw):
        return [None]

    result = _run_handle_function_call(
        monkeypatch, invoke_hook=_hook_returning_none
    )
    assert result == '{"output": "original"}'
def test_result_ignores_non_string_hook_returns(monkeypatch):
    """Dict, int and list hook returns do not replace the dispatched result."""

    def _hook_returning_non_strings(hook_name, **kw):
        return [{"bad": True}, 123, ["nope"]]

    result = _run_handle_function_call(
        monkeypatch, invoke_hook=_hook_returning_non_strings
    )
    assert result == '{"output": "original"}'
def test_first_valid_string_return_replaces_result(monkeypatch):
    """The first string among the hook returns becomes the result."""

    def _hook_with_mixed_returns(hook_name, **kw):
        # Non-string entries come first; "first" precedes "second".
        return [None, {"x": 1}, "first", "second"]

    result = _run_handle_function_call(
        monkeypatch, invoke_hook=_hook_with_mixed_returns
    )
    assert result == "first"
def test_hook_receives_expected_kwargs(monkeypatch):
    """``transform_tool_result`` is invoked with the call's identifying kwargs."""
    seen_kwargs = {}

    def _recording_hook(hook_name, **kwargs):
        # Only record the hook under test; other hook names are ignored.
        if hook_name == "transform_tool_result":
            seen_kwargs.update(kwargs)
        return []

    result = _run_handle_function_call(
        monkeypatch,
        tool_name="my_tool",
        tool_args={"a": 1, "b": "x"},
        dispatch_result='{"ok": true}',
        invoke_hook=_recording_hook,
    )
    assert result == '{"ok": true}'

    expected_kwargs = {
        "tool_name": "my_tool",
        "args": {"a": 1, "b": "x"},
        "result": '{"ok": true}',
        "task_id": "t1",
        "session_id": "s1",
        "tool_call_id": "tc1",
    }
    for key, expected_value in expected_kwargs.items():
        assert seen_kwargs[key] == expected_value
def test_hook_exception_falls_back_to_original(monkeypatch):
    """An ``invoke_hook`` that raises still yields the dispatched result."""

    def _exploding_hook(*_args, **_kwargs):
        raise RuntimeError("boom")

    result = _run_handle_function_call(monkeypatch, invoke_hook=_exploding_hook)
    assert result == '{"output": "original"}'
def test_post_tool_call_remains_observational(monkeypatch):
    """post_tool_call return values must NOT replace the result."""

    def _observer_hook(hook_name, **kw):
        # Observers returning a string must be ignored.
        if hook_name != "post_tool_call":
            return []
        return ["observer return should be ignored"]

    result = _run_handle_function_call(monkeypatch, invoke_hook=_observer_hook)
    assert result == '{"output": "original"}'
def test_transform_tool_result_runs_after_post_tool_call(monkeypatch):
    """post_tool_call sees ORIGINAL result; transform_tool_result sees same and may replace."""
    call_log = []

    def _ordering_hook(hook_name, **kw):
        # Record, in call order, the result each of the two hooks was handed.
        if hook_name in ("post_tool_call", "transform_tool_result"):
            call_log.append((hook_name, kw["result"]))
        # Only the transform hook offers a replacement value.
        return ["rewritten"] if hook_name == "transform_tool_result" else []

    raw_result = '{"raw": "value"}'
    result = _run_handle_function_call(
        monkeypatch,
        dispatch_result=raw_result,
        invoke_hook=_ordering_hook,
    )

    assert result == "rewritten"
    # Both hooks saw the ORIGINAL (untransformed) result.
    assert call_log == [
        ("post_tool_call", '{"raw": "value"}'),
        ("transform_tool_result", '{"raw": "value"}'),
    ]
def test_transform_tool_result_integration_with_real_plugin(monkeypatch, tmp_path):
    """End-to-end: load a real plugin from HERMES_HOME and verify it rewrites results."""
    import yaml

    # NOTE(review): HERMES_HOME is read from the environment rather than built
    # from ``tmp_path`` — presumably a shared fixture points it at an isolated
    # directory; confirm against conftest.
    hermes_home = Path(os.environ["HERMES_HOME"])
    plugin_dir = hermes_home / "plugins" / "transform_result_canon"
    plugin_dir.mkdir(parents=True)

    # Minimal on-disk plugin: a manifest plus a ``register`` function whose hook
    # prefixes the result with ``CANON[<tool_name>]``.
    (plugin_dir / "plugin.yaml").write_text(
        "name: transform_result_canon\n", encoding="utf-8"
    )
    (plugin_dir / "__init__.py").write_text(
        "def register(ctx):\n"
        ' ctx.register_hook("transform_tool_result", '
        'lambda **kw: f\'CANON[{kw["tool_name"]}]\' + kw["result"])\n',
        encoding="utf-8",
    )

    # Plugins are opt-in — must be listed in plugins.enabled to load.
    cfg_path = hermes_home / "config.yaml"
    cfg_path.write_text(
        yaml.safe_dump({"plugins": {"enabled": ["transform_result_canon"]}}),
        encoding="utf-8",
    )

    # Force a fresh plugin manager so the new config is picked up.  Installing
    # it through monkeypatch restores the previous manager at teardown, so the
    # manager that loaded the CANON plugin does not stay active for later tests.
    monkeypatch.setattr(
        plugins_mod, "_plugin_manager", plugins_mod.PluginManager()
    )
    plugins_mod.discover_plugins()

    out = _run_handle_function_call(
        monkeypatch,
        tool_name="some_tool",
        dispatch_result='{"payload": 42}',
    )
    assert out == 'CANON[some_tool]{"payload": 42}'