fix: address PR review — alias expansion + L3 budget enforcement

- Add `shopt -s expand_aliases` to snapshot so aliases captured by
  `alias -p` actually work under `bash -c` (review comment #2)
- Pass threshold=0 in enforce_turn_budget() so L3 can force-persist
  results below the 50K default when aggregate budget is exceeded
  (review comment #3)
- Add regression test: 6x42K results (each under 50K) exceeding 200K
  budget are now correctly persisted
This commit is contained in:
alt-glitch
2026-04-03 01:03:02 +05:30
committed by Hermes Agent
parent 72bd14e09d
commit 51cf4e0bbc
3 changed files with 54 additions and 24 deletions

View File

@@ -7,7 +7,7 @@ from tools.tool_result_storage import (
MAX_TURN_BUDGET_CHARS,
PERSISTED_OUTPUT_CLOSING_TAG,
PERSISTED_OUTPUT_TAG,
PREVIEW_SIZE_BYTES,
PREVIEW_SIZE_CHARS,
PersistedResult,
build_persisted_output_message,
enforce_turn_budget,
@@ -34,18 +34,18 @@ class TestGeneratePreview:
# Build content with lines that exceed the budget
lines = [f"line {i}: " + "x" * 80 for i in range(50)]
content = "\n".join(lines)
assert len(content) > PREVIEW_SIZE_BYTES
assert len(content) > PREVIEW_SIZE_CHARS
preview, has_more = generate_preview(content)
assert has_more is True
assert len(preview) <= PREVIEW_SIZE_BYTES
assert len(preview) <= PREVIEW_SIZE_CHARS
# Should end at a newline boundary
assert preview.endswith("\n")
def test_single_line_truncates_at_max(self):
"""Single long line truncated at max_bytes exactly."""
content = "x" * 5000 # No newlines
preview, has_more = generate_preview(content, max_bytes=100)
preview, has_more = generate_preview(content, max_chars=100)
assert has_more is True
assert len(preview) == 100
@@ -57,7 +57,7 @@ class TestGeneratePreview:
def test_exact_boundary(self):
"""Content exactly at max_bytes returns as-is."""
content = "x" * PREVIEW_SIZE_BYTES
content = "x" * PREVIEW_SIZE_CHARS
preview, has_more = generate_preview(content)
assert preview == content
assert has_more is False
@@ -66,7 +66,7 @@ class TestGeneratePreview:
"""Newline before halfway mark is ignored; truncation at max_bytes."""
# Newline at position 10 out of 100 — way before halfway (50)
content = "a" * 10 + "\n" + "b" * 200
preview, has_more = generate_preview(content, max_bytes=100)
preview, has_more = generate_preview(content, max_chars=100)
assert has_more is True
# Should NOT truncate at position 10 since it's before halfway
assert len(preview) == 100
@@ -241,15 +241,11 @@ class TestMaybePersistToolResult:
lambda name: custom_limit,
)
content = "x" * (custom_limit + 1)
# persist_large_result checks DEFAULT_MAX_RESULT_SIZE_CHARS internally,
# but maybe_persist_tool_result should still create the persisted message.
# Since content > custom_limit but <= DEFAULT_MAX_RESULT_SIZE_CHARS,
# persist_large_result returns None and we get original content back.
# This is expected behavior — Layer 2 threshold gates the call,
# but persist_large_result has its own floor.
result = maybe_persist_tool_result(content, "small_tool", "id_6", tmp_path)
# Content is 1001 chars, well under 50K, so persist_large_result returns None
assert result == content
# Content exceeds the per-tool threshold (1001 > 1000) so it should
# be persisted even though it's well below the 50K default.
assert PERSISTED_OUTPUT_TAG in result
assert (tmp_path / "id_6.txt").exists()
# ------------------------------------------------------------------ #
@@ -332,6 +328,35 @@ class TestEnforceTurnBudget:
result = enforce_turn_budget([], tmp_path)
assert result == []
def test_medium_results_under_default_threshold_still_persisted(self, tmp_path):
"""Multiple 42K results (under 50K default) exceed 200K budget.
Regression test: L3 must force-persist even when individual results
are below DEFAULT_MAX_RESULT_SIZE_CHARS. Without threshold=0 in the
enforce_turn_budget call, these would all be skipped.
"""
# 6 x 42K = 252K > 200K budget, but each is under 50K default
messages = [
{"content": "x" * 42_000, "tool_call_id": f"med_{i}"}
for i in range(6)
]
total_before = sum(len(m["content"]) for m in messages)
assert total_before == 252_000
assert total_before > MAX_TURN_BUDGET_CHARS
# Each individual result is under the default 50K threshold
assert all(len(m["content"]) < DEFAULT_MAX_RESULT_SIZE_CHARS for m in messages)
result = enforce_turn_budget(messages, tmp_path)
persisted_count = sum(
1 for m in result if PERSISTED_OUTPUT_TAG in m["content"]
)
# At least one must be persisted to bring total under budget
assert persisted_count >= 1
total_after = sum(len(m["content"]) for m in result)
assert total_after < total_before
def test_budget_parameter_respected(self, tmp_path):
"""Custom budget parameter is used instead of default."""
# Two messages each 100 chars, budget=150 should trigger persistence

View File

@@ -19,6 +19,11 @@ logger = logging.getLogger(__name__)
# process can extract the remote shell's cwd without a separate round-trip.
_CWD_MARKER = "__HERMES_CWD__"
# Min seconds between file-sync checks in _before_execute hooks.
# Remote backends (SSH, Modal, Daytona) skip re-walking the skills
# directory and re-statting credential files within this window.
_SYNC_INTERVAL_SECONDS: float = 5.0
def get_sandbox_dir() -> Path:
"""Return the host-side root for all sandbox storage (Docker workspaces,
@@ -65,24 +70,23 @@ class _ThreadedProcessHandle:
self.stdin = None
def _run():
# Open the write end exactly once to avoid double-close races.
writer = os.fdopen(self._write_fd, "w")
try:
output, exit_code = exec_fn()
writer = os.fdopen(self._write_fd, "w")
writer.write(output)
writer.close()
self._returncode = exit_code
except Exception as e:
try:
writer = os.fdopen(self._write_fd, "w")
writer.write(str(e))
writer.close()
except Exception:
try:
os.close(self._write_fd)
except Exception:
pass
pass
self._returncode = 1
finally:
try:
writer.close()
except Exception:
pass
self._done.set()
self._thread = threading.Thread(target=_run, daemon=True)
@@ -192,6 +196,7 @@ class BaseEnvironment(ABC):
f"if type declare >/dev/null 2>&1; then "
f"declare -f >> {self._snapshot_path} 2>/dev/null; fi\n"
f"alias -p >> {self._snapshot_path} 2>/dev/null || true\n"
f"echo 'shopt -s expand_aliases' >> {self._snapshot_path}\n"
f"echo 'set +e' >> {self._snapshot_path}\n"
f"echo 'set +u' >> {self._snapshot_path}\n"
f"printf '{_CWD_MARKER}%s{_CWD_MARKER}' \"$(pwd -P)\"\n"

View File

@@ -28,7 +28,6 @@ logger = logging.getLogger(__name__)
DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
MAX_TURN_BUDGET_CHARS: int = 200_000
PREVIEW_SIZE_CHARS: int = 2000
PREVIEW_SIZE_BYTES = PREVIEW_SIZE_CHARS # deprecated alias
PERSISTED_OUTPUT_TAG = "<persisted-output>"
PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"
@@ -179,7 +178,8 @@ def enforce_turn_budget(
content = msg["content"]
tool_use_id = msg.get("tool_call_id", f"budget_{idx}")
result = persist_large_result(content, tool_use_id, storage_dir)
result = persist_large_result(content, tool_use_id, storage_dir,
threshold=0)
if result:
replacement = build_persisted_output_message(result)
total_size -= size