Files
hermes-agent/tests/tools/test_tool_result_storage.py
alt-glitch 3dfce74099 feat(tools): add tool result persistence module + registry support
Add tools/tool_result_storage.py implementing Layer 2 (per-result) and
Layer 3 (per-turn budget) persistence for large tool outputs. Results
exceeding thresholds are written to disk with a <persisted-output>
preview block replacing the inline content. Extend ToolEntry and
ToolRegistry with max_result_size_chars for per-tool threshold control.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 01:31:48 +00:00

398 lines
16 KiB
Python

"""Tests for tools.tool_result_storage — Layer 2 + Layer 3 persistence logic."""
import pytest
from tools.tool_result_storage import (
DEFAULT_MAX_RESULT_SIZE_CHARS,
MAX_TURN_BUDGET_CHARS,
PERSISTED_OUTPUT_CLOSING_TAG,
PERSISTED_OUTPUT_TAG,
PREVIEW_SIZE_BYTES,
PersistedResult,
build_persisted_output_message,
enforce_turn_budget,
generate_preview,
maybe_persist_tool_result,
persist_large_result,
)
# ------------------------------------------------------------------ #
# generate_preview
# ------------------------------------------------------------------ #
class TestGeneratePreview:
    """Exercise ``generate_preview``: pass-through and truncation rules."""

    def test_short_content_unchanged(self):
        """Input below the limit is returned verbatim with has_more=False."""
        original = "hello world"
        snippet, truncated = generate_preview(original)
        assert snippet == original
        assert truncated is False

    def test_truncates_at_newline_boundary(self):
        """Multi-line input is cut at the last newline inside the budget."""
        # 50 lines of ~88 chars each; the guard below confirms the payload
        # really is larger than the preview budget.
        payload = "\n".join(f"line {idx}: " + "x" * 80 for idx in range(50))
        assert len(payload) > PREVIEW_SIZE_BYTES

        snippet, truncated = generate_preview(payload)

        assert truncated is True
        assert len(snippet) <= PREVIEW_SIZE_BYTES
        # The cut must land on a line boundary.
        assert snippet.endswith("\n")

    def test_single_line_truncates_at_max(self):
        """A long line without newlines is cut at exactly max_bytes."""
        payload = "x" * 5000  # no newline anywhere
        snippet, truncated = generate_preview(payload, max_bytes=100)
        assert truncated is True
        assert len(snippet) == 100

    def test_empty_content(self):
        """The empty string yields ('', False)."""
        snippet, truncated = generate_preview("")
        assert snippet == ""
        assert truncated is False

    def test_exact_boundary(self):
        """Input whose length equals the budget is not truncated."""
        payload = "x" * PREVIEW_SIZE_BYTES
        snippet, truncated = generate_preview(payload)
        assert snippet == payload
        assert truncated is False

    def test_newline_only_used_if_past_halfway(self):
        """A newline before the halfway mark is ignored when truncating."""
        # The only newline sits at index 10 of a 100-char budget, i.e. well
        # before the halfway point (50), so the cut falls at max_bytes.
        payload = "a" * 10 + "\n" + "b" * 200
        snippet, truncated = generate_preview(payload, max_bytes=100)
        assert truncated is True
        assert len(snippet) == 100
# ------------------------------------------------------------------ #
# persist_large_result
# ------------------------------------------------------------------ #
class TestPersistLargeResult:
    """Exercise ``persist_large_result``: threshold gating and file output."""

    def test_small_returns_none(self, tmp_path):
        """Content below DEFAULT_MAX_RESULT_SIZE_CHARS is not persisted."""
        outcome = persist_large_result("small output", "tool_1", tmp_path)
        assert outcome is None

    def test_large_writes_file(self, tmp_path):
        """Content above the threshold is written and described by a PersistedResult."""
        payload = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        expected_file = tmp_path / "tool_2.txt"

        outcome = persist_large_result(payload, "tool_2", tmp_path)

        assert outcome is not None
        assert isinstance(outcome, PersistedResult)
        assert outcome.tool_use_id == "tool_2"
        assert outcome.original_size == len(payload)
        assert outcome.file_path == str(expected_file)
        assert expected_file.exists()

    def test_dedup_via_exclusive_create(self, tmp_path):
        """Persisting the same tool_use_id twice succeeds both times."""
        payload = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        first = persist_large_result(payload, "tool_dup", tmp_path)
        second = persist_large_result(payload, "tool_dup", tmp_path)
        assert first is not None
        assert second is not None
        # Both calls must describe the same tool invocation.
        assert first.tool_use_id == second.tool_use_id

    def test_file_contains_full_content(self, tmp_path):
        """The file on disk holds the complete, unmodified original content."""
        payload = "line1\nline2\nline3\n" * 10000  # far above the threshold
        outcome = persist_large_result(payload, "tool_full", tmp_path)
        assert outcome is not None
        stored = (tmp_path / "tool_full.txt").read_text(encoding="utf-8")
        assert stored == payload

    def test_exactly_at_threshold_returns_none(self, tmp_path):
        """Content of exactly DEFAULT_MAX_RESULT_SIZE_CHARS is not persisted."""
        payload = "x" * DEFAULT_MAX_RESULT_SIZE_CHARS
        outcome = persist_large_result(payload, "tool_exact", tmp_path)
        assert outcome is None
# ------------------------------------------------------------------ #
# build_persisted_output_message
# ------------------------------------------------------------------ #
class TestBuildPersistedOutputMessage:
    """Exercise the text block produced by ``build_persisted_output_message``."""

    @pytest.fixture
    def sample_result(self):
        """A truncated 100,000-character result with a two-line preview."""
        return PersistedResult(
            tool_use_id="test_id",
            original_size=100_000,
            file_path="/tmp/test_id.txt",
            preview="first line\nsecond line\n",
            has_more=True,
        )

    def test_contains_file_path(self, sample_result):
        """The message mentions where the full output was stored."""
        rendered = build_persisted_output_message(sample_result)
        assert sample_result.file_path in rendered

    def test_contains_preview(self, sample_result):
        """The preview text is embedded verbatim."""
        rendered = build_persisted_output_message(sample_result)
        assert "first line\nsecond line\n" in rendered

    def test_contains_size_info(self, sample_result):
        """The size is reported both as a character count and in KB."""
        rendered = build_persisted_output_message(sample_result)
        assert "100,000 characters" in rendered
        assert "97.7 KB" in rendered

    def test_contains_tags(self, sample_result):
        """The message is wrapped in the opening and closing tags."""
        rendered = build_persisted_output_message(sample_result)
        assert rendered.startswith(PERSISTED_OUTPUT_TAG)
        assert rendered.endswith(PERSISTED_OUTPUT_CLOSING_TAG)

    def test_has_more_false_no_ellipsis(self):
        """No trailing ellipsis line when the preview is complete."""
        complete = PersistedResult(
            tool_use_id="t",
            original_size=60_000,
            file_path="/tmp/t.txt",
            preview="all content",
            has_more=False,
        )
        rendered = build_persisted_output_message(complete)
        assert "\n..." not in rendered

    def test_has_more_true_shows_ellipsis(self, sample_result):
        """An ellipsis line is present when the preview was truncated."""
        rendered = build_persisted_output_message(sample_result)
        assert "\n..." in rendered

    def test_large_mb_size(self):
        """A 2,000,000-character result is reported in MB."""
        huge = PersistedResult(
            tool_use_id="big",
            original_size=2_000_000,
            file_path="/tmp/big.txt",
            preview="preview",
            has_more=True,
        )
        rendered = build_persisted_output_message(huge)
        assert "MB" in rendered
# ------------------------------------------------------------------ #
# maybe_persist_tool_result
# ------------------------------------------------------------------ #
class TestMaybePersistToolResult:
    """Exercise ``maybe_persist_tool_result`` with a stubbed registry limit."""

    @staticmethod
    def _stub_registry_limit(monkeypatch, limit):
        """Make the global registry report *limit* for every tool name."""
        monkeypatch.setattr(
            "tools.registry.registry.get_max_result_size",
            lambda name: limit,
        )

    def test_small_passes_through(self, tmp_path, monkeypatch):
        """Content under the threshold is returned unchanged."""
        self._stub_registry_limit(monkeypatch, DEFAULT_MAX_RESULT_SIZE_CHARS)
        payload = "small output"
        outcome = maybe_persist_tool_result(payload, "test_tool", "id_1", tmp_path)
        assert outcome == payload

    def test_large_returns_persisted_block(self, tmp_path, monkeypatch):
        """Content over the threshold is replaced by a <persisted-output> block."""
        self._stub_registry_limit(monkeypatch, DEFAULT_MAX_RESULT_SIZE_CHARS)
        payload = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        outcome = maybe_persist_tool_result(payload, "test_tool", "id_2", tmp_path)
        assert PERSISTED_OUTPUT_TAG in outcome
        assert PERSISTED_OUTPUT_CLOSING_TAG in outcome
        # The full output must have been written next to the preview.
        assert (tmp_path / "id_2.txt").exists()

    def test_read_file_never_persisted(self, tmp_path, monkeypatch):
        """An infinite threshold (as used for read_file) always passes through."""
        self._stub_registry_limit(monkeypatch, float('inf'))
        payload = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS * 3)
        outcome = maybe_persist_tool_result(payload, "read_file", "id_3", tmp_path)
        assert outcome == payload  # untouched despite its size

    def test_unknown_tool_uses_default(self, tmp_path, monkeypatch):
        """An unregistered tool name is governed by the 50K default."""
        self._stub_registry_limit(monkeypatch, DEFAULT_MAX_RESULT_SIZE_CHARS)

        # One char below the default: returned inline.
        below = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS - 1)
        outcome = maybe_persist_tool_result(below, "no_such_tool", "id_4", tmp_path)
        assert outcome == below

        # One char above the default: persisted.
        above = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        outcome = maybe_persist_tool_result(above, "no_such_tool", "id_5", tmp_path)
        assert PERSISTED_OUTPUT_TAG in outcome

    def test_custom_threshold_via_registry(self, tmp_path, monkeypatch):
        """A custom limit below the default does not by itself force persistence.

        The registry limit (Layer 2) only gates the call into
        persist_large_result, which applies its own
        DEFAULT_MAX_RESULT_SIZE_CHARS floor. Content that exceeds the custom
        limit but stays under that floor therefore comes back unchanged.
        """
        custom_limit = 1000
        self._stub_registry_limit(monkeypatch, custom_limit)
        payload = "x" * (custom_limit + 1)
        outcome = maybe_persist_tool_result(payload, "small_tool", "id_6", tmp_path)
        # 1001 chars is far below 50K, so persist_large_result returns None
        # and the original content is handed back.
        assert outcome == payload
# ------------------------------------------------------------------ #
# enforce_turn_budget
# ------------------------------------------------------------------ #
class TestEnforceTurnBudget:
    """Tests for ``enforce_turn_budget`` (Layer 3, per-turn budget).

    Each test builds a list of tool-result message dicts with ``content`` and
    ``tool_call_id`` keys, runs the budget enforcement against ``tmp_path``,
    and inspects which entries were replaced by a persisted-output block.
    """

    def test_under_budget_no_changes(self, tmp_path):
        """All messages fit, nothing changed."""
        messages = [
            {"content": "short result", "tool_call_id": "t1"},
            {"content": "another short", "tool_call_id": "t2"},
        ]
        result = enforce_turn_budget(messages, tmp_path)
        assert result[0]["content"] == "short result"
        assert result[1]["content"] == "another short"

    def test_over_budget_persists_largest(self, tmp_path):
        """Total over the default budget: the largest result is persisted first."""
        small = "s" * 50_001
        large = "L" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 100_001)  # 150K+
        messages = [
            {"content": small, "tool_call_id": "small_1"},
            {"content": large, "tool_call_id": "large_1"},
        ]
        # Guard: the scenario only makes sense if we really exceed the budget.
        total_before = len(small) + len(large)
        assert total_before > MAX_TURN_BUDGET_CHARS
        result = enforce_turn_budget(messages, tmp_path)
        # The large one should be persisted
        assert PERSISTED_OUTPUT_TAG in result[1]["content"]
        # The small one should be unchanged
        assert result[0]["content"] == small
        # File written
        assert (tmp_path / "large_1.txt").exists()

    def test_already_persisted_skipped(self, tmp_path):
        """Messages already holding a <persisted-output> block are not re-persisted."""
        already = f"{PERSISTED_OUTPUT_TAG}\nalready persisted\n{PERSISTED_OUTPUT_CLOSING_TAG}"
        large = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        messages = [
            {"content": already, "tool_call_id": "p1"},
            {"content": large, "tool_call_id": "new_1"},
        ]
        # A tiny budget guarantees enforcement kicks in.
        result = enforce_turn_budget(messages, tmp_path, budget=100)
        # The already-persisted entry keeps its content (compared by value).
        assert result[0]["content"] == already
        # new large one gets persisted
        assert PERSISTED_OUTPUT_TAG in result[1]["content"]
        assert result[1]["content"] != already  # Different from the first

    def test_parallel_80k_results(self, tmp_path):
        """5 messages of 80K each (400K total) must be reduced by persistence."""
        messages = [
            {"content": "x" * 80_000, "tool_call_id": f"par_{i}"}
            for i in range(5)
        ]
        total_before = sum(len(m["content"]) for m in messages)
        assert total_before == 400_000
        assert total_before > MAX_TURN_BUDGET_CHARS
        result = enforce_turn_budget(messages, tmp_path)
        persisted_count = sum(
            1 for m in result if PERSISTED_OUTPUT_TAG in m["content"]
        )
        # At least some must be persisted to get under budget.
        # Each 80K result is > 50K threshold so persist_large_result will work.
        assert persisted_count >= 1
        # Only a reduction is asserted (not "under budget"), because the
        # replacement text itself adds some characters.
        total_after = sum(len(m["content"]) for m in result)
        assert total_after < total_before

    def test_empty_messages(self, tmp_path):
        """Empty list returns empty list."""
        result = enforce_turn_budget([], tmp_path)
        assert result == []

    def test_budget_parameter_respected(self, tmp_path):
        """Custom budget parameter is used instead of default."""
        # Two messages, each DEFAULT_MAX_RESULT_SIZE_CHARS + 100 chars, against
        # an explicit budget of 50_000: the combined size exceeds the custom
        # budget, so persistence must be triggered.
        messages = [
            {"content": "a" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 100), "tool_call_id": "b1"},
            {"content": "b" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 100), "tool_call_id": "b2"},
        ]
        result = enforce_turn_budget(messages, tmp_path, budget=50_000)
        # At least one should be persisted
        persisted_count = sum(
            1 for m in result if PERSISTED_OUTPUT_TAG in m["content"]
        )
        assert persisted_count >= 1
# ------------------------------------------------------------------ #
# Registry integration: get_max_result_size
# ------------------------------------------------------------------ #
class TestRegistryGetMaxResultSize:
    """Exercise ``ToolRegistry.get_max_result_size`` on isolated registries."""

    @staticmethod
    def _fresh_registry():
        """Return a new, empty ToolRegistry (imported lazily, per test)."""
        from tools.registry import ToolRegistry

        return ToolRegistry()

    def test_default_for_unknown_tool(self):
        """A name that was never registered gets DEFAULT_MAX_RESULT_SIZE_CHARS."""
        registry = self._fresh_registry()
        limit = registry.get_max_result_size("nonexistent")
        assert limit == DEFAULT_MAX_RESULT_SIZE_CHARS

    def test_custom_threshold(self):
        """An explicit max_result_size_chars is reported back unchanged."""
        registry = self._fresh_registry()
        registry.register(
            name="custom_tool",
            toolset="test",
            schema={"description": "test"},
            handler=lambda args: "ok",
            max_result_size_chars=10_000,
        )
        limit = registry.get_max_result_size("custom_tool")
        assert limit == 10_000

    def test_inf_threshold(self):
        """An infinite threshold is preserved as inf."""
        registry = self._fresh_registry()
        registry.register(
            name="read_file",
            toolset="test",
            schema={"description": "test"},
            handler=lambda args: "ok",
            max_result_size_chars=float('inf'),
        )
        limit = registry.get_max_result_size("read_file")
        assert limit == float('inf')

    def test_none_falls_back_to_default(self):
        """Registering without max_result_size_chars falls back to the default."""
        registry = self._fresh_registry()
        registry.register(
            name="plain_tool",
            toolset="test",
            schema={"description": "test"},
            handler=lambda args: "ok",
        )
        limit = registry.get_max_result_size("plain_tool")
        assert limit == DEFAULT_MAX_RESULT_SIZE_CHARS