mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-02 16:57:36 +08:00
fix: address PR review — alias expansion + L3 budget enforcement
- Add `shopt -s expand_aliases` to snapshot so aliases captured by `alias -p` actually work under `bash -c` (review comment #2) - Pass threshold=0 in enforce_turn_budget() so L3 can force-persist results below the 50K default when aggregate budget is exceeded (review comment #3) - Add regression test: 6x42K results (each under 50K) exceeding 200K budget are now correctly persisted
This commit is contained in:
@@ -7,7 +7,7 @@ from tools.tool_result_storage import (
|
||||
MAX_TURN_BUDGET_CHARS,
|
||||
PERSISTED_OUTPUT_CLOSING_TAG,
|
||||
PERSISTED_OUTPUT_TAG,
|
||||
PREVIEW_SIZE_BYTES,
|
||||
PREVIEW_SIZE_CHARS,
|
||||
PersistedResult,
|
||||
build_persisted_output_message,
|
||||
enforce_turn_budget,
|
||||
@@ -34,18 +34,18 @@ class TestGeneratePreview:
|
||||
# Build content with lines that exceed the budget
|
||||
lines = [f"line {i}: " + "x" * 80 for i in range(50)]
|
||||
content = "\n".join(lines)
|
||||
assert len(content) > PREVIEW_SIZE_BYTES
|
||||
assert len(content) > PREVIEW_SIZE_CHARS
|
||||
|
||||
preview, has_more = generate_preview(content)
|
||||
assert has_more is True
|
||||
assert len(preview) <= PREVIEW_SIZE_BYTES
|
||||
assert len(preview) <= PREVIEW_SIZE_CHARS
|
||||
# Should end at a newline boundary
|
||||
assert preview.endswith("\n")
|
||||
|
||||
def test_single_line_truncates_at_max(self):
|
||||
"""Single long line truncated at max_bytes exactly."""
|
||||
content = "x" * 5000 # No newlines
|
||||
preview, has_more = generate_preview(content, max_bytes=100)
|
||||
preview, has_more = generate_preview(content, max_chars=100)
|
||||
assert has_more is True
|
||||
assert len(preview) == 100
|
||||
|
||||
@@ -57,7 +57,7 @@ class TestGeneratePreview:
|
||||
|
||||
def test_exact_boundary(self):
|
||||
"""Content exactly at max_bytes returns as-is."""
|
||||
content = "x" * PREVIEW_SIZE_BYTES
|
||||
content = "x" * PREVIEW_SIZE_CHARS
|
||||
preview, has_more = generate_preview(content)
|
||||
assert preview == content
|
||||
assert has_more is False
|
||||
@@ -66,7 +66,7 @@ class TestGeneratePreview:
|
||||
"""Newline before halfway mark is ignored; truncation at max_bytes."""
|
||||
# Newline at position 10 out of 100 — way before halfway (50)
|
||||
content = "a" * 10 + "\n" + "b" * 200
|
||||
preview, has_more = generate_preview(content, max_bytes=100)
|
||||
preview, has_more = generate_preview(content, max_chars=100)
|
||||
assert has_more is True
|
||||
# Should NOT truncate at position 10 since it's before halfway
|
||||
assert len(preview) == 100
|
||||
@@ -241,15 +241,11 @@ class TestMaybePersistToolResult:
|
||||
lambda name: custom_limit,
|
||||
)
|
||||
content = "x" * (custom_limit + 1)
|
||||
# persist_large_result checks DEFAULT_MAX_RESULT_SIZE_CHARS internally,
|
||||
# but maybe_persist_tool_result should still create the persisted message.
|
||||
# Since content > custom_limit but <= DEFAULT_MAX_RESULT_SIZE_CHARS,
|
||||
# persist_large_result returns None and we get original content back.
|
||||
# This is expected behavior — Layer 2 threshold gates the call,
|
||||
# but persist_large_result has its own floor.
|
||||
result = maybe_persist_tool_result(content, "small_tool", "id_6", tmp_path)
|
||||
# Content is 1001 chars, well under 50K, so persist_large_result returns None
|
||||
assert result == content
|
||||
# Content exceeds the per-tool threshold (1001 > 1000) so it should
|
||||
# be persisted even though it's well below the 50K default.
|
||||
assert PERSISTED_OUTPUT_TAG in result
|
||||
assert (tmp_path / "id_6.txt").exists()
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
@@ -332,6 +328,35 @@ class TestEnforceTurnBudget:
|
||||
result = enforce_turn_budget([], tmp_path)
|
||||
assert result == []
|
||||
|
||||
def test_medium_results_under_default_threshold_still_persisted(self, tmp_path):
|
||||
"""Multiple 42K results (under 50K default) exceed 200K budget.
|
||||
|
||||
Regression test: L3 must force-persist even when individual results
|
||||
are below DEFAULT_MAX_RESULT_SIZE_CHARS. Without threshold=0 in the
|
||||
enforce_turn_budget call, these would all be skipped.
|
||||
"""
|
||||
# 6 x 42K = 252K > 200K budget, but each is under 50K default
|
||||
messages = [
|
||||
{"content": "x" * 42_000, "tool_call_id": f"med_{i}"}
|
||||
for i in range(6)
|
||||
]
|
||||
total_before = sum(len(m["content"]) for m in messages)
|
||||
assert total_before == 252_000
|
||||
assert total_before > MAX_TURN_BUDGET_CHARS
|
||||
# Each individual result is under the default 50K threshold
|
||||
assert all(len(m["content"]) < DEFAULT_MAX_RESULT_SIZE_CHARS for m in messages)
|
||||
|
||||
result = enforce_turn_budget(messages, tmp_path)
|
||||
|
||||
persisted_count = sum(
|
||||
1 for m in result if PERSISTED_OUTPUT_TAG in m["content"]
|
||||
)
|
||||
# At least one must be persisted to bring total under budget
|
||||
assert persisted_count >= 1
|
||||
|
||||
total_after = sum(len(m["content"]) for m in result)
|
||||
assert total_after < total_before
|
||||
|
||||
def test_budget_parameter_respected(self, tmp_path):
|
||||
"""Custom budget parameter is used instead of default."""
|
||||
# Two messages each 100 chars, budget=150 should trigger persistence
|
||||
|
||||
@@ -19,6 +19,11 @@ logger = logging.getLogger(__name__)
|
||||
# process can extract the remote shell's cwd without a separate round-trip.
|
||||
_CWD_MARKER = "__HERMES_CWD__"
|
||||
|
||||
# Min seconds between file-sync checks in _before_execute hooks.
|
||||
# Remote backends (SSH, Modal, Daytona) skip re-walking the skills
|
||||
# directory and re-statting credential files within this window.
|
||||
_SYNC_INTERVAL_SECONDS: float = 5.0
|
||||
|
||||
|
||||
def get_sandbox_dir() -> Path:
|
||||
"""Return the host-side root for all sandbox storage (Docker workspaces,
|
||||
@@ -65,24 +70,23 @@ class _ThreadedProcessHandle:
|
||||
self.stdin = None
|
||||
|
||||
def _run():
|
||||
# Open the write end exactly once to avoid double-close races.
|
||||
writer = os.fdopen(self._write_fd, "w")
|
||||
try:
|
||||
output, exit_code = exec_fn()
|
||||
writer = os.fdopen(self._write_fd, "w")
|
||||
writer.write(output)
|
||||
writer.close()
|
||||
self._returncode = exit_code
|
||||
except Exception as e:
|
||||
try:
|
||||
writer = os.fdopen(self._write_fd, "w")
|
||||
writer.write(str(e))
|
||||
writer.close()
|
||||
except Exception:
|
||||
try:
|
||||
os.close(self._write_fd)
|
||||
except Exception:
|
||||
pass
|
||||
pass
|
||||
self._returncode = 1
|
||||
finally:
|
||||
try:
|
||||
writer.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._done.set()
|
||||
|
||||
self._thread = threading.Thread(target=_run, daemon=True)
|
||||
@@ -192,6 +196,7 @@ class BaseEnvironment(ABC):
|
||||
f"if type declare >/dev/null 2>&1; then "
|
||||
f"declare -f >> {self._snapshot_path} 2>/dev/null; fi\n"
|
||||
f"alias -p >> {self._snapshot_path} 2>/dev/null || true\n"
|
||||
f"echo 'shopt -s expand_aliases' >> {self._snapshot_path}\n"
|
||||
f"echo 'set +e' >> {self._snapshot_path}\n"
|
||||
f"echo 'set +u' >> {self._snapshot_path}\n"
|
||||
f"printf '{_CWD_MARKER}%s{_CWD_MARKER}' \"$(pwd -P)\"\n"
|
||||
|
||||
@@ -28,7 +28,6 @@ logger = logging.getLogger(__name__)
|
||||
DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
|
||||
MAX_TURN_BUDGET_CHARS: int = 200_000
|
||||
PREVIEW_SIZE_CHARS: int = 2000
|
||||
PREVIEW_SIZE_BYTES = PREVIEW_SIZE_CHARS # deprecated alias
|
||||
PERSISTED_OUTPUT_TAG = "<persisted-output>"
|
||||
PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"
|
||||
|
||||
@@ -179,7 +178,8 @@ def enforce_turn_budget(
|
||||
content = msg["content"]
|
||||
tool_use_id = msg.get("tool_call_id", f"budget_{idx}")
|
||||
|
||||
result = persist_large_result(content, tool_use_id, storage_dir)
|
||||
result = persist_large_result(content, tool_use_id, storage_dir,
|
||||
threshold=0)
|
||||
if result:
|
||||
replacement = build_persisted_output_message(result)
|
||||
total_size -= size
|
||||
|
||||
Reference in New Issue
Block a user