mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 01:07:31 +08:00
Add spawn-per-call execution model to BaseEnvironment: - ProcessHandle protocol for backend abstraction - Shell snapshot creation (bash -l once, capture env to file) - Command wrapping template (source snapshot + cd + eval + pwd tracking) - Unified _wait_for_process with interrupt/timeout handling - CWD tracking via cwdfile read after process exit Migrate LocalEnvironment to unified model: - Remove PersistentShellMixin inheritance - Implement _run_bash (Popen + process group kill) - Remove fence markers, oneshot method, shell noise cleanup - Session snapshot captures env vars, functions, aliases New test capabilities validated: - CWD persists across execute() calls (was 37% manual cd prefix) - stdin_data piping works - Exit codes preserved through wrapper - Single quotes survive eval escaping - Snapshot fallback works when creation fails 42/42 tests passing on local backend. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
433 lines
15 KiB
Python
433 lines
15 KiB
Python
"""
|
|
Cross-environment backend compatibility tests.
|
|
|
|
Derived from analysis of 218 real Hermes agent sessions (590 terminal calls).
|
|
Tests the command execution patterns the agent actually uses, against any
|
|
environment backend (local, docker, ssh, modal, daytona, singularity).
|
|
|
|
Usage:
|
|
# Local (default)
|
|
uv run pytest tests/test_env_backend_compat.py -v
|
|
|
|
# Docker
|
|
TERMINAL_ENV=docker TERMINAL_DOCKER_IMAGE=ubuntu:24.04 \
|
|
uv run pytest tests/test_env_backend_compat.py -v
|
|
|
|
# SSH
|
|
TERMINAL_ENV=ssh TERMINAL_SSH_HOST=... TERMINAL_SSH_USER=... \
|
|
uv run pytest tests/test_env_backend_compat.py -v
|
|
|
|
# Modal
|
|
TERMINAL_ENV=modal uv run pytest tests/test_env_backend_compat.py -v
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
import pytest
|
|
|
|
ENV_TYPE = os.getenv("TERMINAL_ENV", "local")
|
|
|
|
|
|
def _get_env():
    """Build the environment backend selected by ``TERMINAL_ENV``.

    Settings are read from ``TERMINAL_*`` environment variables:
    ``TERMINAL_DOCKER_IMAGE`` (default ``ubuntu:24.04``), ``TERMINAL_CWD``
    (default ``/tmp``), ``TERMINAL_TIMEOUT`` (default ``30``), and the
    container sizing variables ``TERMINAL_CPU`` / ``TERMINAL_MEMORY`` /
    ``TERMINAL_DISK``.  For the ``ssh`` backend, ``TERMINAL_SSH_HOST`` and
    ``TERMINAL_SSH_USER`` are mandatory (a missing one raises ``KeyError``);
    ``TERMINAL_SSH_PORT`` and ``TERMINAL_SSH_KEY`` are optional.

    Returns:
        Whatever ``tools.terminal_tool._create_environment`` returns for
        these settings.
    """
    # Imported lazily, so the project module is only loaded when a backend
    # is actually requested.
    from tools.terminal_tool import _create_environment

    settings = {
        "env_type": ENV_TYPE,
        "image": os.getenv("TERMINAL_DOCKER_IMAGE", "ubuntu:24.04"),
        "cwd": os.getenv("TERMINAL_CWD", "/tmp"),
        "timeout": int(os.getenv("TERMINAL_TIMEOUT", "30")),
    }

    # SSH connection details are only assembled for the ssh backend; every
    # other backend receives ``None``.
    if ENV_TYPE == "ssh":
        settings["ssh_config"] = {
            "host": os.environ["TERMINAL_SSH_HOST"],
            "user": os.environ["TERMINAL_SSH_USER"],
            "port": int(os.getenv("TERMINAL_SSH_PORT", "22")),
            "key": os.getenv("TERMINAL_SSH_KEY", ""),
        }
    else:
        settings["ssh_config"] = None

    settings["container_config"] = {
        "container_cpu": int(os.getenv("TERMINAL_CPU", "1")),
        "container_memory": int(os.getenv("TERMINAL_MEMORY", "2048")),
        "container_disk": int(os.getenv("TERMINAL_DISK", "10240")),
        "container_persistent": True,
    }

    return _create_environment(task_id="test_env_compat", **settings)
|
|
|
|
|
|
@pytest.fixture(scope="module")
def env():
    """Provide one backend instance shared by every test in this module.

    The backend is built once by ``_get_env()``.  After the last test, its
    ``cleanup`` attribute (when present) is invoked on a best-effort basis:
    any exception raised during cleanup is deliberately ignored.
    """
    backend = _get_env()
    yield backend

    # Teardown: not every backend is required to expose ``cleanup``.
    if not hasattr(backend, "cleanup"):
        return
    try:
        backend.cleanup()
    except Exception:
        pass
|
|
|
|
|
|
def _exec(env, command: str, timeout: int = 30) -> dict:
    """Run ``command`` on ``env`` and return its result mapping.

    Args:
        env: Backend object exposing ``execute(command, timeout=...)``.
        command: Shell command text handed to the backend unchanged.
        timeout: Timeout value forwarded to ``execute``.

    Returns:
        The dict returned by ``env.execute``.

    Raises:
        AssertionError: If the backend returns something other than a dict.
    """
    outcome = env.execute(command, timeout=timeout)
    # Every backend must honour the dict-shaped result contract.
    assert isinstance(outcome, dict), f"Expected dict, got {type(outcome)}"
    return outcome
|
|
|
|
|
|
def _output(result: dict) -> str:
    """Return the ``"output"`` entry of ``result``, or ``""`` when absent."""
    if "output" in result:
        return result["output"]
    return ""
|
|
|
|
|
|
def _rc(result: dict) -> int:
    """Return the exit status stored in ``result``.

    ``"returncode"`` takes precedence over ``"exit_code"``; when neither key
    is present the sentinel ``-999`` is returned.
    """
    for key in ("returncode", "exit_code"):
        if key in result:
            return result[key]
    return -999
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 1: Basic execution
|
|
# From session data: simple single commands are the foundation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestBasicExecution:
    """Category 1: single simple commands, their output and exit status."""

    def test_echo(self, env):
        """The baseline: a command runs and its output comes back."""
        result = _exec(env, "echo hello")
        assert "hello" in _output(result)
        assert _rc(result) == 0

    def test_exit_code_success(self, env):
        """``true`` reports a zero exit status."""
        assert _rc(_exec(env, "true")) == 0

    def test_exit_code_failure(self, env):
        """``false`` reports a non-zero exit status."""
        assert _rc(_exec(env, "false")) != 0

    def test_stderr_captured(self, env):
        """Writing to stderr must not break the result contract.

        The agent relies on ``2>&1`` patterns.  Stderr may be merged into
        the output or kept separate, so only the output type is checked.
        """
        result = _exec(env, "echo err >&2")
        captured = _output(result)
        assert isinstance(captured, str)

    def test_multiline_output(self, env):
        """Output spanning several lines keeps its line breaks."""
        result = _exec(env, "printf 'line1\\nline2\\nline3'")
        text = _output(result).strip()
        assert len(text.split("\n")) >= 3
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 2: cd && command chains
|
|
# 37% of all terminal commands use this pattern. CRITICAL.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCdAndChain:
    """Category 2: ``cd``-prefixed and chained commands, plus cwd persistence."""

    def test_cd_and_command(self, env):
        """cd /tmp && ls — the most common pattern in session data."""
        r = _exec(env, "cd /tmp && echo 'in_tmp'")
        assert "in_tmp" in _output(r)

    def test_chained_and(self, env):
        """Multiple && chains: agent does cd X && source Y && cmd Z."""
        r = _exec(env, "echo a && echo b && echo c")
        out = _output(r)
        assert "a" in out and "b" in out and "c" in out

    def test_chained_semicolon(self, env):
        """Semicolon chains: agent uses '; echo "---"' as separators."""
        r = _exec(env, "echo first; echo '---'; echo second")
        out = _output(r)
        assert "first" in out and "---" in out and "second" in out

    def test_cd_nonexistent_and_fails(self, env):
        """cd to bad dir && cmd should fail (not run cmd)."""
        r = _exec(env, "cd /nonexistent_dir_xyz && echo should_not_see")
        assert "should_not_see" not in _output(r)
        # The failed ``cd`` short-circuits the chain, so the command as a
        # whole must report failure as well.
        assert _rc(r) != 0

    def test_cwd_persists_across_calls(self, env):
        """CWD persists across execute() calls via cwdfile tracking.

        Previously: 37% of commands needed 'cd X &&' prefix. Now automatic.

        The backend is created with ``/tmp`` as its default working
        directory, so changing into ``/tmp`` would pass even without any
        persistence.  A dedicated directory is used instead; its name
        matches the ``/tmp/hermes_test_*`` pattern removed by the module
        cleanup fixture.
        """
        try:
            moved = _exec(
                env, "mkdir -p /tmp/hermes_test_cwd && cd /tmp/hermes_test_cwd"
            )
            assert _rc(moved) == 0
            r = _exec(env, "pwd")
            # Substring match also accepts a symlink-resolved prefix.
            assert "/tmp/hermes_test_cwd" in _output(r)
        finally:
            # Leave the shared module-scoped backend in /tmp, exactly where
            # the previous version of this test left it.
            _exec(env, "cd /tmp")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 3: Pipes
|
|
# 46% of commands use pipes. Essential.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestPipes:
    """Category 3: pipelines, which 46% of recorded commands rely on."""

    def test_simple_pipe(self, env):
        """A two-stage pipeline delivers the final stage's output."""
        result = _exec(env, "echo 'hello world' | wc -w")
        assert "2" in _output(result)

    def test_multi_pipe(self, env):
        """Three stages, mirroring the agent's find | grep | head chains."""
        result = _exec(env, "echo -e 'a\\nb\\nc\\nd\\ne' | grep -v c | wc -l")
        assert "4" in _output(result)

    def test_pipe_with_grep(self, env):
        """Filtering through grep keeps matches and drops everything else."""
        result = _exec(env, "echo -e 'foo\\nbar\\nbaz' | grep ba")
        text = _output(result)
        assert all(word in text for word in ("bar", "baz"))
        assert "foo" not in text
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 4: Environment variables and source
|
|
# 19% of commands use source. Agent does: source ~/.bashrc && cmd
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEnvAndSource:
    """Category 4: shell variables, ``export`` and ``source``."""

    def test_inline_env_var(self, env):
        """A variable assigned earlier in the command is visible later in it."""
        result = _exec(env, "MY_VAR=hello && echo $MY_VAR")
        assert "hello" in _output(result)

    def test_export_and_use(self, env):
        """An exported variable is visible later in the same command."""
        result = _exec(env, "export FOO=bar && echo $FOO")
        assert "bar" in _output(result)

    def test_env_does_not_persist(self, env):
        """A variable exported in one execute() call is gone in the next."""
        _exec(env, "export HERMES_TEST_VAR=1234")
        followup = _exec(env, "echo ${HERMES_TEST_VAR:-unset}")
        assert "unset" in _output(followup)

    def test_source_inline_script(self, env):
        """Write a script, source it, then read the variable it exported."""
        steps = [
            "echo 'export TEST_SOURCED=yes' > /tmp/hermes_test_source.sh",
            "source /tmp/hermes_test_source.sh",
            "echo $TEST_SOURCED",
        ]
        result = _exec(env, " && ".join(steps))
        assert "yes" in _output(result)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 5: File I/O via shell
|
|
# Agent uses cat, heredoc, find, ls extensively
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestFileIO:
    """Category 5: file reads and writes done through shell commands."""

    def test_write_and_read(self, env):
        """Redirect text into a file and read it back with ``cat``."""
        target = "/tmp/hermes_test_file.txt"
        result = _exec(env, f"echo 'test content' > {target} && cat {target}")
        assert "test content" in _output(result)

    def test_heredoc_write(self, env):
        """Heredocs are rare (0.5% of commands) but matter for config files."""
        script = "\n".join(
            [
                "cat > /tmp/hermes_heredoc_test.txt << 'EOF'",
                "line one",
                "line two",
                "line three",
                "EOF",
                "cat /tmp/hermes_heredoc_test.txt",
            ]
        )
        text = _output(_exec(env, script))
        assert all(line in text for line in ("line one", "line three"))

    def test_mkdir_p(self, env):
        """``mkdir -p`` creates nested directories that ``ls`` can then list."""
        result = _exec(
            env,
            "mkdir -p /tmp/hermes_test_deep/a/b/c && ls /tmp/hermes_test_deep/a/b/",
        )
        assert "c" in _output(result)

    def test_find(self, env):
        """``find -name`` locates a file created earlier in the same command."""
        steps = [
            "mkdir -p /tmp/hermes_find_test",
            "touch /tmp/hermes_find_test/a.py /tmp/hermes_find_test/b.txt",
            "find /tmp/hermes_find_test -name '*.py'",
        ]
        result = _exec(env, " && ".join(steps))
        assert "a.py" in _output(result)

    def test_file_persistence_within_session(self, env):
        """A file written and read back inside a single execute() call."""
        target = "/tmp/hermes_persist_test.txt"
        result = _exec(env, f"echo 'persistent' > {target} && cat {target}")
        assert "persistent" in _output(result)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 6: Multiline commands
|
|
# 6% of commands are multiline. Agent sends literal newlines.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestMultiline:
    """Category 6: commands containing literal newlines (6% of commands)."""

    def test_multiline_script(self, env):
        """Each line of a newline-separated script is executed."""
        # The last entry keeps the trailing space present in the command.
        script = "\n".join(['echo "step 1"', 'echo "step 2"', 'echo "step 3" '])
        text = _output(_exec(env, script))
        assert all(marker in text for marker in ("step 1", "step 3"))

    def test_multiline_with_variable(self, env):
        """A variable set on one line is expanded on the following line."""
        script = "\n".join(["X=42", 'echo "value is $X" '])
        assert "value is 42" in _output(_exec(env, script))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 7: Timeouts
|
|
# 50% of terminal calls specify a timeout. Some go up to 1800s.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestTimeouts:
    """Category 7: per-call timeouts, set on 50% of terminal calls."""

    def test_fast_command_with_timeout(self, env):
        """A command finishing well inside its timeout behaves normally."""
        result = _exec(env, "echo fast", timeout=5)
        assert "fast" in _output(result)

    def test_slow_command_timeout(self, env):
        """A command running past its timeout is cut short."""
        began = time.time()
        _exec(env, "sleep 60", timeout=3)
        elapsed = time.time() - began
        # Generous bound: the call must return near the 3s timeout rather
        # than after the full 60s sleep.
        assert elapsed < 15, f"Command took {elapsed}s, should have timed out at ~3s"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 8: Output handling
|
|
# Verifying the contract: {output: str, exit_code/returncode: int, error: ...}
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestOutputContract:
    """Category 8: the shape of the dict returned by ``execute()``."""

    def test_result_has_output_key(self, env):
        """The result always carries an ``output`` key."""
        assert "output" in _exec(env, "echo test")

    def test_result_has_returncode(self, env):
        """The exit status is exposed as ``returncode`` or ``exit_code``."""
        result = _exec(env, "echo test")
        assert any(key in result for key in ("returncode", "exit_code"))

    def test_large_output_not_truncated_at_execute_level(self, env):
        """env.execute() hands back raw output.

        Truncation belongs to terminal_tool.py, not to the backend.
        """
        result = _exec(env, "seq 1 5000")
        lines = _output(result).strip().split("\n")
        # The backend itself should deliver essentially all 5000 lines.
        assert len(lines) >= 4900, f"Expected ~5000 lines, got {len(lines)}"

    def test_binary_output_doesnt_crash(self, env):
        """Control bytes in the output must not break the result contract."""
        result = _exec(env, "echo -e '\\x00\\x01\\x02hello\\x03'")
        # Only the absence of a crash and the output type are checked.
        captured = _output(result)
        assert isinstance(captured, str)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 9: Package/tool availability
|
|
# Agent frequently checks for tools before using them
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestToolAvailability:
    """Category 9: probing for tools before using them."""

    def test_which_pattern(self, env):
        """The ``which X 2>/dev/null || echo 'not found'`` probe gives an answer."""
        text = _output(_exec(env, "which bash 2>/dev/null || echo 'not found'"))
        assert any(marker in text for marker in ("bash", "not found"))

    def test_python_available(self, env):
        """python3 either reports a version or is reported as missing."""
        text = _output(
            _exec(
                env,
                "which python3 2>/dev/null && python3 --version || echo 'no python3'",
            )
        )
        assert "Python" in text or "no python3" in text

    def test_git_available(self, env):
        """git either reports a version or is reported as missing."""
        text = _output(
            _exec(env, "which git 2>/dev/null && git --version || echo 'no git'")
        )
        assert "git" in text.lower() or "no git" in text
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 10: Error handling edge cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEdgeCases:
    """Category 10: error handling and unusual inputs."""

    def test_empty_command(self, env):
        """An empty command may succeed or raise; either is acceptable."""
        try:
            _exec(env, "")
        except Exception:
            # Raising is an acceptable way to reject an empty command.
            pass

    def test_command_not_found(self, env):
        """An unknown command reports a non-zero exit status."""
        result = _exec(env, "nonexistent_command_xyz_123 2>&1")
        assert _rc(result) != 0

    def test_special_characters_in_output(self, env):
        """JSON-style punctuation survives the round trip."""
        result = _exec(env, """echo '{"key": "value", "list": [1,2,3]}'""")
        assert '"key"' in _output(result)

    def test_long_command_string(self, env):
        """A command carrying a 500-character argument is not truncated."""
        command = "echo " + "A" * 500 + " | wc -c"
        reported = _output(_exec(env, command)).strip()
        assert int(reported) >= 500
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category 11: Unified execution model — new capabilities
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestUnifiedExecution:
    """Category 11: capabilities introduced by the unified execution model."""

    def test_cwd_tracking_updates_env(self, env):
        """env.cwd should update after cd command.

        The backend is created with ``/tmp`` as its default working
        directory, so ``cd /tmp`` could not reveal a missing update.  A
        dedicated directory is used instead; its name matches the
        ``/tmp/hermes_test_*`` pattern removed by the module cleanup
        fixture.
        """
        try:
            moved = _exec(
                env,
                "mkdir -p /tmp/hermes_test_cwd_track && cd /tmp/hermes_test_cwd_track",
            )
            assert _rc(moved) == 0
            # Suffix match also accepts a symlink-resolved prefix.
            assert env.cwd.endswith("/tmp/hermes_test_cwd_track")
        finally:
            # Leave the shared module-scoped backend in /tmp, exactly where
            # the previous version of this test left it.
            _exec(env, "cd /tmp")

    def test_stdin_data(self, env):
        """stdin_data should be piped to the command."""
        r = env.execute("cat", stdin_data="hello from stdin\n")
        assert "hello from stdin" in _output(r)

    def test_snapshot_fallback(self, env):
        """Commands work even when snapshot is missing/broken."""
        old_snapshot = env._snapshot_ready
        old_path = env._snapshot_path
        # Simulate a backend whose snapshot was never created.
        env._snapshot_ready = False
        env._snapshot_path = None
        try:
            r = _exec(env, "echo still_works")
            assert "still_works" in _output(r)
        finally:
            # Restore the snapshot state for the other tests sharing ``env``.
            env._snapshot_ready = old_snapshot
            env._snapshot_path = old_path

    def test_exit_code_preserved_through_wrapper(self, env):
        """Exit code from the user command should pass through the wrapper."""
        r = _exec(env, "exit 42")
        assert _rc(r) == 42

    def test_single_quotes_in_command(self, env):
        """Commands with single quotes must survive the eval wrapper."""
        r = _exec(env, "echo 'it'\\''s a test'")
        assert "it's a test" in _output(r)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cleanup
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.fixture(autouse=True, scope="module")
def cleanup_test_files(env):
    """Remove the files and directories the tests created under ``/tmp``.

    Runs once after the module's tests.  Removal is best-effort: any
    exception raised by the backend is ignored.
    """
    yield

    leftovers = [
        "/tmp/hermes_test_*",
        "/tmp/hermes_find_test",
        "/tmp/hermes_heredoc_test.txt",
        "/tmp/hermes_persist_test.txt",
        "/tmp/hermes_test_source.sh",
    ]
    try:
        env.execute("rm -rf " + " ".join(leftovers), timeout=5)
    except Exception:
        pass
|