Files
hermes-agent/tests/test_env_backend_compat.py
alt-glitch fe2e1c16c6 refactor(environments): unify execution layer — Phase 0+1 (base + local)
Add spawn-per-call execution model to BaseEnvironment:
- ProcessHandle protocol for backend abstraction
- Shell snapshot creation (bash -l once, capture env to file)
- Command wrapping template (source snapshot + cd + eval + pwd tracking)
- Unified _wait_for_process with interrupt/timeout handling
- CWD tracking via cwdfile read after process exit

Migrate LocalEnvironment to unified model:
- Remove PersistentShellMixin inheritance
- Implement _run_bash (Popen + process group kill)
- Remove fence markers, oneshot method, shell noise cleanup
- Session snapshot captures env vars, functions, aliases

New test capabilities validated:
- CWD persists across execute() calls (previously 37% of commands needed a manual `cd` prefix)
- stdin_data piping works
- Exit codes preserved through wrapper
- Single quotes survive eval escaping
- Snapshot fallback works when creation fails

42/42 tests passing on local backend.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 01:31:08 +00:00

433 lines
15 KiB
Python

"""
Cross-environment backend compatibility tests.
Derived from analysis of 218 real Hermes agent sessions (590 terminal calls).
Tests the command execution patterns the agent actually uses, against any
environment backend (local, docker, ssh, modal, daytona, singularity).
Usage:
# Local (default)
uv run pytest tests/test_env_backend_compat.py -v
# Docker
TERMINAL_ENV=docker TERMINAL_DOCKER_IMAGE=ubuntu:24.04 \
uv run pytest tests/test_env_backend_compat.py -v
# SSH
TERMINAL_ENV=ssh TERMINAL_SSH_HOST=... TERMINAL_SSH_USER=... \
uv run pytest tests/test_env_backend_compat.py -v
# Modal
TERMINAL_ENV=modal uv run pytest tests/test_env_backend_compat.py -v
"""
import json
import os
import time
import pytest
# Name of the backend under test, read once at import time from the
# TERMINAL_ENV environment variable; falls back to "local" when unset.
ENV_TYPE = os.getenv("TERMINAL_ENV", "local")
def _get_env():
    """Build the environment backend selected by ``TERMINAL_ENV``.

    All settings come from ``TERMINAL_*`` environment variables, with
    defaults for everything except the SSH host and user, which are
    required (``KeyError`` if missing) when the backend type is ``"ssh"``.
    """
    from tools.terminal_tool import _create_environment

    def _int_setting(name, fallback):
        # Numeric settings arrive as strings; a non-numeric value raises ValueError.
        return int(os.getenv(name, fallback))

    backend = ENV_TYPE
    docker_image = os.getenv("TERMINAL_DOCKER_IMAGE", "ubuntu:24.04")
    start_dir = os.getenv("TERMINAL_CWD", "/tmp")
    default_timeout = _int_setting("TERMINAL_TIMEOUT", "30")

    # SSH connection details are only read when the SSH backend is requested.
    ssh_settings = None
    if backend == "ssh":
        ssh_settings = dict(
            host=os.environ["TERMINAL_SSH_HOST"],
            user=os.environ["TERMINAL_SSH_USER"],
            port=_int_setting("TERMINAL_SSH_PORT", "22"),
            key=os.getenv("TERMINAL_SSH_KEY", ""),
        )

    # Resource settings are always passed, whatever the backend type.
    resources = dict(
        container_cpu=_int_setting("TERMINAL_CPU", "1"),
        container_memory=_int_setting("TERMINAL_MEMORY", "2048"),
        container_disk=_int_setting("TERMINAL_DISK", "10240"),
        container_persistent=True,
    )

    return _create_environment(
        env_type=backend,
        image=docker_image,
        cwd=start_dir,
        timeout=default_timeout,
        ssh_config=ssh_settings,
        container_config=resources,
        task_id="test_env_compat",
    )
@pytest.fixture(scope="module")
def env():
    """Provide one backend instance shared by every test in this module.

    After the last test the backend's ``cleanup()`` is invoked if it has
    one; any error raised during that teardown is ignored.
    """
    backend = _get_env()
    yield backend

    if not hasattr(backend, "cleanup"):
        return
    try:
        backend.cleanup()
    except Exception:
        # Teardown is best-effort: a failing cleanup must not fail the run.
        pass
def _exec(env, command: str, timeout: int = 30) -> dict:
    """Run ``command`` on ``env`` and return the backend's result dict.

    Fails with an assertion error if the backend returns anything other
    than a ``dict``.
    """
    outcome = env.execute(command, timeout=timeout)
    assert isinstance(outcome, dict), f"Expected dict, got {type(outcome)}"
    return outcome
def _output(result: dict) -> str:
    """Return the ``"output"`` entry of a result dict, or ``""`` if absent."""
    if "output" in result:
        return result["output"]
    return ""
def _rc(result: dict) -> int:
    """Return the exit status stored in a result dict.

    ``"returncode"`` takes precedence over ``"exit_code"``; when neither
    key is present the sentinel ``-999`` is returned.
    """
    if "returncode" in result:
        return result["returncode"]
    return result.get("exit_code", -999)
# ---------------------------------------------------------------------------
# Category 1: Basic execution
# From session data: simple single commands are the foundation
# ---------------------------------------------------------------------------
class TestBasicExecution:
    """Smoke tests: run one simple command and inspect output and exit status."""

    def test_echo(self, env):
        """The simplest case: a command runs and its output comes back."""
        result = _exec(env, "echo hello")
        assert "hello" in _output(result)
        assert _rc(result) == 0

    def test_exit_code_success(self, env):
        """``true`` reports exit status 0."""
        assert _rc(_exec(env, "true")) == 0

    def test_exit_code_failure(self, env):
        """``false`` reports a non-zero exit status."""
        assert _rc(_exec(env, "false")) != 0

    def test_stderr_captured(self, env):
        """Writing to stderr must not break result handling.

        stderr may be merged into the output or kept separate, so this
        only checks that the output is still a string.
        """
        captured = _output(_exec(env, "echo err >&2"))
        assert isinstance(captured, str)

    def test_multiline_output(self, env):
        """Output spanning several lines comes back with its line breaks."""
        text = _output(_exec(env, "printf 'line1\\nline2\\nline3'"))
        pieces = text.strip().split("\n")
        assert len(pieces) >= 3
# ---------------------------------------------------------------------------
# Category 2: cd && command chains
# 37% of all terminal commands use this pattern. CRITICAL.
# ---------------------------------------------------------------------------
class TestCdAndChain:
    """Command chaining (``&&``, ``;``) and working-directory behaviour."""

    def test_cd_and_command(self, env):
        """cd /tmp && ls — the most common pattern in session data."""
        r = _exec(env, "cd /tmp && echo 'in_tmp'")
        assert "in_tmp" in _output(r)

    def test_chained_and(self, env):
        """Multiple && chains: agent does cd X && source Y && cmd Z."""
        r = _exec(env, "echo a && echo b && echo c")
        out = _output(r)
        assert "a" in out and "b" in out and "c" in out

    def test_chained_semicolon(self, env):
        """Semicolon chains: agent uses '; echo "---"' as separators."""
        r = _exec(env, "echo first; echo '---'; echo second")
        out = _output(r)
        assert "first" in out and "---" in out and "second" in out

    def test_cd_nonexistent_and_fails(self, env):
        """cd to bad dir && cmd should fail (not run cmd)."""
        r = _exec(env, "cd /nonexistent_dir_xyz && echo should_not_see")
        assert "should_not_see" not in _output(r)

    def test_cwd_persists_across_calls(self, env):
        """CWD set by one execute() call is still in effect for the next one.

        The backend is created with ``TERMINAL_CWD`` (default ``/tmp``) as
        its start directory, so changing into ``/tmp`` would pass even if
        the working directory were never carried over. A dedicated
        directory distinct from the start directory is used instead.
        """
        target = "/tmp/hermes_test_cwd_persist"
        try:
            _exec(env, f"mkdir -p {target} && cd {target}")
            r = _exec(env, "pwd")
            assert target in _output(r)
        finally:
            # The env fixture is module-scoped: put the shared backend back
            # in /tmp (where this test has always left it) so later tests
            # are unaffected.
            _exec(env, "cd /tmp")
# ---------------------------------------------------------------------------
# Category 3: Pipes
# 46% of commands use pipes. Essential.
# ---------------------------------------------------------------------------
class TestPipes:
    """Shell pipelines, from a single pipe to multi-stage filters."""

    def test_simple_pipe(self, env):
        """One pipe: count the words of an echoed string."""
        counted = _output(_exec(env, "echo 'hello world' | wc -w"))
        assert "2" in counted

    def test_multi_pipe(self, env):
        """Agent chains: find X | grep Y | head -N"""
        counted = _output(
            _exec(env, "echo -e 'a\\nb\\nc\\nd\\ne' | grep -v c | wc -l")
        )
        assert "4" in counted

    def test_pipe_with_grep(self, env):
        """Common pattern: cmd 2>&1 | grep pattern"""
        matched = _output(_exec(env, "echo -e 'foo\\nbar\\nbaz' | grep ba"))
        assert "bar" in matched and "baz" in matched
        assert "foo" not in matched
# ---------------------------------------------------------------------------
# Category 4: Environment variables and source
# 19% of commands use source. Agent does: source ~/.bashrc && cmd
# ---------------------------------------------------------------------------
class TestEnvAndSource:
    """Shell variables, ``export`` and ``source`` within and across calls."""

    def test_inline_env_var(self, env):
        """A variable assigned in a command is visible later in that command."""
        shown = _output(_exec(env, "MY_VAR=hello && echo $MY_VAR"))
        assert "hello" in shown

    def test_export_and_use(self, env):
        """An exported variable is visible later in the same command."""
        shown = _output(_exec(env, "export FOO=bar && echo $FOO"))
        assert "bar" in shown

    def test_env_does_not_persist(self, env):
        """Env vars don't persist across execute() calls."""
        _exec(env, "export HERMES_TEST_VAR=1234")
        shown = _output(_exec(env, "echo ${HERMES_TEST_VAR:-unset}"))
        assert "unset" in shown

    def test_source_inline_script(self, env):
        """Agent pattern: write a file, source it, use its vars."""
        steps = [
            "echo 'export TEST_SOURCED=yes' > /tmp/hermes_test_source.sh",
            "source /tmp/hermes_test_source.sh",
            "echo $TEST_SOURCED",
        ]
        shown = _output(_exec(env, " && ".join(steps)))
        assert "yes" in shown
# ---------------------------------------------------------------------------
# Category 5: File I/O via shell
# Agent uses cat, heredoc, find, ls extensively
# ---------------------------------------------------------------------------
class TestFileIO:
    """File creation, reading and discovery done purely through shell commands."""

    def test_write_and_read(self, env):
        """Redirect into a file, then read it back with cat."""
        steps = [
            "echo 'test content' > /tmp/hermes_test_file.txt",
            "cat /tmp/hermes_test_file.txt",
        ]
        contents = _output(_exec(env, " && ".join(steps)))
        assert "test content" in contents

    def test_heredoc_write(self, env):
        """0.5% of commands use heredoc — rare but important for config files."""
        # Joined with literal newlines: the heredoc terminator must sit at
        # the very start of its own line.
        script = "\n".join([
            "cat > /tmp/hermes_heredoc_test.txt << 'EOF'",
            "line one",
            "line two",
            "line three",
            "EOF",
            "cat /tmp/hermes_heredoc_test.txt",
        ])
        contents = _output(_exec(env, script))
        assert "line one" in contents and "line three" in contents

    def test_mkdir_p(self, env):
        """Nested directories are created and the leaf shows up in a listing."""
        listing = _output(_exec(
            env,
            "mkdir -p /tmp/hermes_test_deep/a/b/c && ls /tmp/hermes_test_deep/a/b/",
        ))
        assert "c" in listing

    def test_find(self, env):
        """find with a name filter locates the matching file."""
        steps = [
            "mkdir -p /tmp/hermes_find_test",
            "touch /tmp/hermes_find_test/a.py /tmp/hermes_find_test/b.txt",
            "find /tmp/hermes_find_test -name '*.py'",
        ]
        found = _output(_exec(env, " && ".join(steps)))
        assert "a.py" in found

    def test_file_persistence_within_session(self, env):
        """Files written and read in a single execute() call."""
        steps = [
            "echo 'persistent' > /tmp/hermes_persist_test.txt",
            "cat /tmp/hermes_persist_test.txt",
        ]
        contents = _output(_exec(env, " && ".join(steps)))
        assert "persistent" in contents
# ---------------------------------------------------------------------------
# Category 6: Multiline commands
# 6% of commands are multiline. Agent sends literal newlines.
# ---------------------------------------------------------------------------
class TestMultiline:
    """Commands that contain literal newlines between statements."""

    def test_multiline_script(self, env):
        """Each line of a newline-separated script is executed."""
        script = "\n".join([
            'echo "step 1"',
            'echo "step 2"',
            'echo "step 3" ',
        ])
        printed = _output(_exec(env, script))
        assert "step 1" in printed and "step 3" in printed

    def test_multiline_with_variable(self, env):
        """A variable set on one line is usable on the next line."""
        script = "\n".join([
            "X=42",
            'echo "value is $X" ',
        ])
        printed = _output(_exec(env, script))
        assert "value is 42" in printed
# ---------------------------------------------------------------------------
# Category 7: Timeouts
# 50% of terminal calls specify a timeout. Some go up to 1800s.
# ---------------------------------------------------------------------------
class TestTimeouts:
    """Per-call timeouts: fast commands finish, slow ones are cut off."""

    def test_fast_command_with_timeout(self, env):
        """A quick command completes normally under a short timeout."""
        r = _exec(env, "echo fast", timeout=5)
        assert "fast" in _output(r)

    def test_slow_command_timeout(self, env):
        """Command that exceeds timeout should be killed."""
        # Use the monotonic clock: wall-clock adjustments (NTP, manual
        # changes) during the run cannot distort the measured duration.
        start = time.monotonic()
        _exec(env, "sleep 60", timeout=3)
        elapsed = time.monotonic() - start
        # Should return in roughly timeout seconds, not 60
        assert elapsed < 15, f"Command took {elapsed}s, should have timed out at ~3s"
# ---------------------------------------------------------------------------
# Category 8: Output handling
# Verifying the contract: {output: str, exit_code/returncode: int, error: ...}
# ---------------------------------------------------------------------------
class TestOutputContract:
    """Shape of the result dict returned by ``env.execute()``."""

    def test_result_has_output_key(self, env):
        """The result always carries an ``output`` entry."""
        assert "output" in _exec(env, "echo test")

    def test_result_has_returncode(self, env):
        """The exit status is exposed as ``returncode`` or ``exit_code``."""
        result = _exec(env, "echo test")
        assert "returncode" in result or "exit_code" in result

    def test_large_output_not_truncated_at_execute_level(self, env):
        """The env.execute() should return raw output.

        Truncation happens in terminal_tool.py, not in the backend.
        """
        text = _output(_exec(env, "seq 1 5000"))
        lines = text.strip().split("\n")
        # The backend itself is expected to hand back all 5000 lines.
        assert len(lines) >= 4900, f"Expected ~5000 lines, got {len(lines)}"

    def test_binary_output_doesnt_crash(self, env):
        """Agent sometimes runs commands that produce partial binary output."""
        text = _output(_exec(env, "echo -e '\\x00\\x01\\x02hello\\x03'"))
        # Only checks that control bytes do not break result handling.
        assert isinstance(text, str)
# ---------------------------------------------------------------------------
# Category 9: Package/tool availability
# Agent frequently checks for tools before using them
# ---------------------------------------------------------------------------
class TestToolAvailability:
    """Probe-then-fallback patterns used to check whether a tool is installed."""

    def test_which_pattern(self, env):
        """Agent pattern: which X 2>/dev/null || echo 'not found'"""
        reply = _output(_exec(env, "which bash 2>/dev/null || echo 'not found'"))
        assert "bash" in reply or "not found" in reply

    def test_python_available(self, env):
        """Agent uses python3 extensively."""
        reply = _output(_exec(
            env,
            "which python3 2>/dev/null && python3 --version || echo 'no python3'",
        ))
        assert "Python" in reply or "no python3" in reply

    def test_git_available(self, env):
        """52 git operations in session data."""
        reply = _output(_exec(
            env,
            "which git 2>/dev/null && git --version || echo 'no git'",
        ))
        assert "git" in reply.lower() or "no git" in reply
# ---------------------------------------------------------------------------
# Category 10: Error handling edge cases
# ---------------------------------------------------------------------------
class TestEdgeCases:
    """Unusual inputs: empty commands, unknown programs, special characters, long strings."""

    def test_empty_command(self, env):
        """Empty or whitespace command shouldn't crash."""
        try:
            # Either outcome is fine: empty output or a graceful failure.
            _exec(env, "")
        except Exception:
            # Raising is an acceptable way to reject an empty command.
            pass

    def test_command_not_found(self, env):
        """An unknown program yields a non-zero exit status."""
        assert _rc(_exec(env, "nonexistent_command_xyz_123 2>&1")) != 0

    def test_special_characters_in_output(self, env):
        """Agent processes JSON, YAML, code — special chars must survive."""
        echoed = _output(_exec(env, 'echo \'{"key": "value", "list": [1,2,3]}\''))
        assert '"key"' in echoed

    def test_long_command_string(self, env):
        """Agent sends commands up to ~500 chars. Verify no truncation on input."""
        payload = "A" * 500
        reply = _output(_exec(env, "echo " + payload + " | wc -c"))
        byte_count = int(reply.strip())
        assert byte_count >= 500
# ---------------------------------------------------------------------------
# Category 11: Unified execution model — new capabilities
# ---------------------------------------------------------------------------
class TestUnifiedExecution:
    """Capabilities introduced by the unified (spawn-per-call) execution model."""

    def test_cwd_tracking_updates_env(self, env):
        """env.cwd should update after cd command.

        The backend is created with ``TERMINAL_CWD`` (default ``/tmp``) as
        its start directory, so a ``cd /tmp`` would leave ``env.cwd``
        unchanged and prove nothing. A dedicated directory distinct from
        the start directory is used instead.
        """
        target = "/tmp/hermes_test_cwd_tracking"
        try:
            _exec(env, f"mkdir -p {target} && cd {target}")
            assert env.cwd == target
        finally:
            # The env fixture is module-scoped: put the shared backend back
            # in /tmp (where this test has always left it) so later tests
            # are unaffected.
            _exec(env, "cd /tmp")

    def test_stdin_data(self, env):
        """stdin_data should be piped to the command."""
        r = env.execute("cat", stdin_data="hello from stdin\n")
        assert "hello from stdin" in _output(r)

    def test_snapshot_fallback(self, env):
        """Commands work even when snapshot is missing/broken."""
        # Temporarily mark the snapshot as unavailable, then restore the
        # saved state even if the assertion fails, because the backend is
        # shared with every other test in the module.
        old_snapshot = env._snapshot_ready
        old_path = env._snapshot_path
        env._snapshot_ready = False
        env._snapshot_path = None
        try:
            r = _exec(env, "echo still_works")
            assert "still_works" in _output(r)
        finally:
            env._snapshot_ready = old_snapshot
            env._snapshot_path = old_path

    def test_exit_code_preserved_through_wrapper(self, env):
        """Exit code from the user command should pass through the wrapper."""
        r = _exec(env, "exit 42")
        assert _rc(r) == 42

    def test_single_quotes_in_command(self, env):
        """Commands with single quotes must survive the eval wrapper."""
        r = _exec(env, "echo 'it'\\''s a test'")
        assert "it's a test" in _output(r)
# ---------------------------------------------------------------------------
# Cleanup
# ---------------------------------------------------------------------------
@pytest.fixture(autouse=True, scope="module")
def cleanup_test_files(env):
    """Remove the files and directories the tests created under /tmp.

    Runs once, after every test in the module has finished. Any error
    raised by the removal command is ignored.
    """
    yield

    leftovers = (
        "/tmp/hermes_test_*",
        "/tmp/hermes_find_test",
        "/tmp/hermes_heredoc_test.txt",
        "/tmp/hermes_persist_test.txt",
        "/tmp/hermes_test_source.sh",
    )
    try:
        env.execute("rm -rf " + " ".join(leftovers), timeout=5)
    except Exception:
        # Best-effort: leftover scratch files must not fail the run.
        pass