mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 09:17:09 +08:00
Add tools/tool_result_storage.py implementing Layer 2 (per-result) and Layer 3 (per-turn budget) persistence for large tool outputs. Results exceeding thresholds are written to disk with a <persisted-output> preview block replacing the inline content. Extend ToolEntry and ToolRegistry with max_result_size_chars for per-tool threshold control. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
398 lines
16 KiB
Python
398 lines
16 KiB
Python
"""Tests for tools.tool_result_storage — Layer 2 + Layer 3 persistence logic."""
|
|
|
|
import pytest
|
|
|
|
from tools.tool_result_storage import (
|
|
DEFAULT_MAX_RESULT_SIZE_CHARS,
|
|
MAX_TURN_BUDGET_CHARS,
|
|
PERSISTED_OUTPUT_CLOSING_TAG,
|
|
PERSISTED_OUTPUT_TAG,
|
|
PREVIEW_SIZE_BYTES,
|
|
PersistedResult,
|
|
build_persisted_output_message,
|
|
enforce_turn_budget,
|
|
generate_preview,
|
|
maybe_persist_tool_result,
|
|
persist_large_result,
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
# generate_preview
|
|
# ------------------------------------------------------------------ #
|
|
|
|
class TestGeneratePreview:
    """Checks on how ``generate_preview`` shortens tool output."""

    def test_short_content_unchanged(self):
        """Input below the limit is returned verbatim and flagged as complete."""
        original = "hello world"
        shown, truncated = generate_preview(original)
        assert shown == original
        assert truncated is False

    def test_truncates_at_newline_boundary(self):
        """Oversized multi-line input is cut at a newline inside the budget."""
        # Fifty numbered lines; the guard assert below confirms the total
        # really is larger than the preview budget.
        body = "\n".join(f"line {i}: " + "x" * 80 for i in range(50))
        assert len(body) > PREVIEW_SIZE_BYTES

        shown, truncated = generate_preview(body)
        assert truncated is True
        assert len(shown) <= PREVIEW_SIZE_BYTES
        # The cut is expected to land directly after a line break.
        assert shown.endswith("\n")

    def test_single_line_truncates_at_max(self):
        """With no newline to cut at, the preview is exactly max_bytes long."""
        one_long_line = "x" * 5000
        shown, truncated = generate_preview(one_long_line, max_bytes=100)
        assert truncated is True
        assert len(shown) == 100

    def test_empty_content(self):
        """An empty string yields an empty preview and has_more False."""
        shown, truncated = generate_preview("")
        assert shown == ""
        assert truncated is False

    def test_exact_boundary(self):
        """Input whose length equals the default budget is not truncated."""
        at_limit = "x" * PREVIEW_SIZE_BYTES
        shown, truncated = generate_preview(at_limit)
        assert shown == at_limit
        assert truncated is False

    def test_newline_only_used_if_past_halfway(self):
        """A newline before the halfway mark does not become the cut point."""
        # The only newline sits at index 10, far before the halfway point
        # (50) of the 100-character budget used here.
        early_break = "a" * 10 + "\n" + "b" * 200
        shown, truncated = generate_preview(early_break, max_bytes=100)
        assert truncated is True
        # Cutting at index 10 would give a much shorter preview than this.
        assert len(shown) == 100
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
# persist_large_result
|
|
# ------------------------------------------------------------------ #
|
|
|
|
class TestPersistLargeResult:
    """Checks on when ``persist_large_result`` writes a file and what it returns."""

    def test_small_returns_none(self, tmp_path):
        """Output below DEFAULT_MAX_RESULT_SIZE_CHARS is not persisted."""
        outcome = persist_large_result("small output", "tool_1", tmp_path)
        assert outcome is None

    def test_large_writes_file(self, tmp_path):
        """Output above the threshold is written and described by a PersistedResult."""
        payload = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        expected_file = tmp_path / "tool_2.txt"

        persisted = persist_large_result(payload, "tool_2", tmp_path)

        assert persisted is not None
        assert isinstance(persisted, PersistedResult)
        assert persisted.tool_use_id == "tool_2"
        assert persisted.original_size == len(payload)
        assert persisted.file_path == str(expected_file)
        assert expected_file.exists()

    def test_dedup_via_exclusive_create(self, tmp_path):
        """Persisting the same tool_use_id twice succeeds both times."""
        payload = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        first = persist_large_result(payload, "tool_dup", tmp_path)
        second = persist_large_result(payload, "tool_dup", tmp_path)
        assert first is not None
        assert second is not None
        # Both calls must describe the same tool invocation.
        assert first.tool_use_id == second.tool_use_id

    def test_file_contains_full_content(self, tmp_path):
        """The file on disk holds the complete, unmodified original text."""
        # Repeated enough times to be well over the persistence threshold.
        payload = "line1\nline2\nline3\n" * 10000
        persisted = persist_large_result(payload, "tool_full", tmp_path)
        assert persisted is not None
        written = (tmp_path / "tool_full.txt").read_text(encoding="utf-8")
        assert written == payload

    def test_exactly_at_threshold_returns_none(self, tmp_path):
        """Output exactly DEFAULT_MAX_RESULT_SIZE_CHARS long is left inline."""
        at_limit = "x" * DEFAULT_MAX_RESULT_SIZE_CHARS
        outcome = persist_large_result(at_limit, "tool_exact", tmp_path)
        assert outcome is None
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
# build_persisted_output_message
|
|
# ------------------------------------------------------------------ #
|
|
|
|
class TestBuildPersistedOutputMessage:
    """Checks on the replacement text built by ``build_persisted_output_message``."""

    @pytest.fixture
    def sample_result(self):
        """A 100,000-character persisted result with a two-line preview."""
        fields = {
            "tool_use_id": "test_id",
            "original_size": 100_000,
            "file_path": "/tmp/test_id.txt",
            "preview": "first line\nsecond line\n",
            "has_more": True,
        }
        return PersistedResult(**fields)

    def test_contains_file_path(self, sample_result):
        """The message names the file the full output was written to."""
        rendered = build_persisted_output_message(sample_result)
        assert sample_result.file_path in rendered

    def test_contains_preview(self, sample_result):
        """The preview text appears verbatim in the message."""
        rendered = build_persisted_output_message(sample_result)
        assert "first line\nsecond line\n" in rendered

    def test_contains_size_info(self, sample_result):
        """The size is reported both as a character count and in KB."""
        rendered = build_persisted_output_message(sample_result)
        assert "100,000 characters" in rendered
        assert "97.7 KB" in rendered

    def test_contains_tags(self, sample_result):
        """The message opens and closes with the persisted-output tags."""
        rendered = build_persisted_output_message(sample_result)
        assert rendered.startswith(PERSISTED_OUTPUT_TAG)
        assert rendered.endswith(PERSISTED_OUTPUT_CLOSING_TAG)

    def test_has_more_false_no_ellipsis(self):
        """No ellipsis line is emitted when the preview is complete."""
        complete = PersistedResult(
            tool_use_id="t",
            original_size=60_000,
            file_path="/tmp/t.txt",
            preview="all content",
            has_more=False,
        )
        rendered = build_persisted_output_message(complete)
        assert "\n..." not in rendered

    def test_has_more_true_shows_ellipsis(self, sample_result):
        """An ellipsis line is emitted when the preview was truncated."""
        rendered = build_persisted_output_message(sample_result)
        assert "\n..." in rendered

    def test_large_mb_size(self):
        """A 2,000,000-character result has its size reported in MB."""
        huge = PersistedResult(
            tool_use_id="big",
            original_size=2_000_000,
            file_path="/tmp/big.txt",
            preview="preview",
            has_more=True,
        )
        rendered = build_persisted_output_message(huge)
        assert "MB" in rendered
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
# maybe_persist_tool_result
|
|
# ------------------------------------------------------------------ #
|
|
|
|
class TestMaybePersistToolResult:
    """Checks on the per-result gate ``maybe_persist_tool_result``.

    Every test stubs the registry's threshold lookup so the outcome depends
    only on the content length and the threshold chosen by the test.
    """

    @staticmethod
    def _stub_threshold(monkeypatch, limit):
        """Make the shared registry report *limit* for every tool name.

        The leading underscore keeps pytest from collecting this as a test.
        """
        monkeypatch.setattr(
            "tools.registry.registry.get_max_result_size",
            lambda name: limit,
        )

    def test_small_passes_through(self, tmp_path, monkeypatch):
        """Under threshold, returns original content."""
        self._stub_threshold(monkeypatch, DEFAULT_MAX_RESULT_SIZE_CHARS)
        content = "small output"
        result = maybe_persist_tool_result(content, "test_tool", "id_1", tmp_path)
        assert result == content

    def test_large_returns_persisted_block(self, tmp_path, monkeypatch):
        """Over threshold, returns a persisted-output block and writes the file."""
        self._stub_threshold(monkeypatch, DEFAULT_MAX_RESULT_SIZE_CHARS)
        content = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        result = maybe_persist_tool_result(content, "test_tool", "id_2", tmp_path)
        assert PERSISTED_OUTPUT_TAG in result
        assert PERSISTED_OUTPUT_CLOSING_TAG in result
        # The full output is stored under the tool-use id.
        assert (tmp_path / "id_2.txt").exists()

    def test_read_file_never_persisted(self, tmp_path, monkeypatch):
        """With an infinite threshold, even very large content passes through."""
        self._stub_threshold(monkeypatch, float('inf'))
        content = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS * 3)
        result = maybe_persist_tool_result(content, "read_file", "id_3", tmp_path)
        assert result == content  # Unchanged

    def test_unknown_tool_uses_default(self, tmp_path, monkeypatch):
        """Content is persisted only above DEFAULT_MAX_RESULT_SIZE_CHARS.

        The registry lookup is stubbed to return the default, so this checks
        both sides of the default threshold for an unregistered tool name;
        it does not exercise the registry's own fallback.
        """
        self._stub_threshold(monkeypatch, DEFAULT_MAX_RESULT_SIZE_CHARS)

        # One character under the default: passes through.
        content_under = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS - 1)
        result = maybe_persist_tool_result(content_under, "no_such_tool", "id_4", tmp_path)
        assert result == content_under

        # One character over the default: persisted.
        content_over = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        result = maybe_persist_tool_result(content_over, "no_such_tool", "id_5", tmp_path)
        assert PERSISTED_OUTPUT_TAG in result

    def test_custom_threshold_via_registry(self, tmp_path, monkeypatch):
        """A lower custom threshold alone does not persist small content.

        The content exceeds the custom limit (1000) but is expected to come
        back unchanged.
        """
        custom_limit = 1000
        self._stub_threshold(monkeypatch, custom_limit)
        content = "x" * (custom_limit + 1)
        # Per the original author's notes: the Layer 2 threshold gates the
        # call, but persist_large_result checks DEFAULT_MAX_RESULT_SIZE_CHARS
        # internally as its own floor. The content is over custom_limit yet
        # under that floor, so persist_large_result returns None and the
        # original content is handed back.
        # NOTE(review): the implementation is not visible from this file;
        # confirm this floor is intended rather than a gap in per-tool limits.
        result = maybe_persist_tool_result(content, "small_tool", "id_6", tmp_path)
        assert result == content
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
# enforce_turn_budget
|
|
# ------------------------------------------------------------------ #
|
|
|
|
class TestEnforceTurnBudget:
    """Checks on the per-turn budget pass ``enforce_turn_budget``.

    Messages are plain dicts with ``content`` and ``tool_call_id`` keys.
    """

    def test_under_budget_no_changes(self, tmp_path):
        """All messages fit, nothing changed."""
        messages = [
            {"content": "short result", "tool_call_id": "t1"},
            {"content": "another short", "tool_call_id": "t2"},
        ]
        result = enforce_turn_budget(messages, tmp_path)
        assert result[0]["content"] == "short result"
        assert result[1]["content"] == "another short"

    def test_over_budget_persists_largest(self, tmp_path):
        """Total above MAX_TURN_BUDGET_CHARS: the largest result is persisted."""
        small = "s" * 50_001
        large = "L" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 100_001)
        messages = [
            {"content": small, "tool_call_id": "small_1"},
            {"content": large, "tool_call_id": "large_1"},
        ]
        # Guard: the fixture must really exceed the default turn budget.
        total_before = len(small) + len(large)
        assert total_before > MAX_TURN_BUDGET_CHARS

        result = enforce_turn_budget(messages, tmp_path)
        # The large one should be persisted.
        assert PERSISTED_OUTPUT_TAG in result[1]["content"]
        # The small one should be unchanged.
        assert result[0]["content"] == small
        # The file is named after the tool_call_id.
        assert (tmp_path / "large_1.txt").exists()

    def test_already_persisted_skipped(self, tmp_path):
        """Messages already wrapped in the persisted-output tag are not re-persisted."""
        already = f"{PERSISTED_OUTPUT_TAG}\nalready persisted\n{PERSISTED_OUTPUT_CLOSING_TAG}"
        large = "x" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 1)
        messages = [
            {"content": already, "tool_call_id": "p1"},
            {"content": large, "tool_call_id": "new_1"},
        ]
        # A tiny budget forces the pass to look for something to persist.
        result = enforce_turn_budget(messages, tmp_path, budget=100)
        # The already-persisted message keeps equal content.
        assert result[0]["content"] == already
        # The new large one gets persisted.
        assert PERSISTED_OUTPUT_TAG in result[1]["content"]
        assert result[1]["content"] != already  # Different from the first

    def test_parallel_80k_results(self, tmp_path):
        """5 messages of 80K each (400K total): some are persisted and the total shrinks."""
        messages = [
            {"content": "x" * 80_000, "tool_call_id": f"par_{i}"}
            for i in range(5)
        ]
        total_before = sum(len(m["content"]) for m in messages)
        assert total_before == 400_000
        assert total_before > MAX_TURN_BUDGET_CHARS

        result = enforce_turn_budget(messages, tmp_path)

        persisted_count = sum(
            1 for m in result if PERSISTED_OUTPUT_TAG in m["content"]
        )
        # At least some must be persisted to get under budget.
        # Assumes each 80K result exceeds persist_large_result's threshold
        # (DEFAULT_MAX_RESULT_SIZE_CHARS) - TODO confirm if that default changes.
        assert persisted_count >= 1

        # Only a reduction is asserted, not "under budget": the replacement
        # text adds some characters back.
        total_after = sum(len(m["content"]) for m in result)
        assert total_after < total_before

    def test_empty_messages(self, tmp_path):
        """Empty list returns empty list."""
        result = enforce_turn_budget([], tmp_path)
        assert result == []

    def test_budget_parameter_respected(self, tmp_path):
        """Custom budget parameter is used instead of default."""
        # Two messages, each 100 characters over DEFAULT_MAX_RESULT_SIZE_CHARS.
        # Their combined size exceeds the explicit 50_000 budget, so the pass
        # should trigger persistence.
        messages = [
            {"content": "a" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 100), "tool_call_id": "b1"},
            {"content": "b" * (DEFAULT_MAX_RESULT_SIZE_CHARS + 100), "tool_call_id": "b2"},
        ]
        result = enforce_turn_budget(messages, tmp_path, budget=50_000)
        # At least one should be persisted.
        persisted_count = sum(
            1 for m in result if PERSISTED_OUTPUT_TAG in m["content"]
        )
        assert persisted_count >= 1
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
# Registry integration: get_max_result_size
|
|
# ------------------------------------------------------------------ #
|
|
|
|
class TestRegistryGetMaxResultSize:
    """Checks on ``ToolRegistry.get_max_result_size`` using fresh registries."""

    @staticmethod
    def _new_registry():
        """Return a brand-new ToolRegistry, imported at call time."""
        from tools.registry import ToolRegistry

        return ToolRegistry()

    @staticmethod
    def _add_tool(registry, tool_name, **extra):
        """Register a trivial tool, forwarding any extra keyword arguments."""
        registry.register(
            name=tool_name,
            toolset="test",
            schema={"description": "test"},
            handler=lambda args: "ok",
            **extra,
        )

    def test_default_for_unknown_tool(self):
        """A name that was never registered gets DEFAULT_MAX_RESULT_SIZE_CHARS."""
        registry = self._new_registry()
        limit = registry.get_max_result_size("nonexistent")
        assert limit == DEFAULT_MAX_RESULT_SIZE_CHARS

    def test_custom_threshold(self):
        """A tool registered with max_result_size_chars reports that value."""
        registry = self._new_registry()
        self._add_tool(registry, "custom_tool", max_result_size_chars=10_000)
        limit = registry.get_max_result_size("custom_tool")
        assert limit == 10_000

    def test_inf_threshold(self):
        """A tool registered with an infinite threshold reports infinity."""
        registry = self._new_registry()
        self._add_tool(registry, "read_file", max_result_size_chars=float('inf'))
        limit = registry.get_max_result_size("read_file")
        assert limit == float('inf')

    def test_none_falls_back_to_default(self):
        """A tool registered without max_result_size_chars gets the default."""
        registry = self._new_registry()
        self._add_tool(registry, "plain_tool")
        limit = registry.get_max_result_size("plain_tool")
        assert limit == DEFAULT_MAX_RESULT_SIZE_CHARS
|