Files
hermes-agent/tools/environments/modal.py
alt-glitch ef8a985ee3 refactor(environments): migrate Modal + Daytona to unified model — Phase 5+6
ModalEnvironment:
- Create _ModalProcessHandle adapter (async SDK → ProcessHandle protocol)
- Routes async sandbox.exec through thread + OS pipe for stdout
- Remove BaseModalExecutionEnvironment inheritance, use BaseEnvironment
- Remove _start_modal_exec/_poll_modal_exec/_cancel_modal_exec
- Move file sync to _before_execute hook
- Preserve _AsyncWorker, sandbox lifecycle, snapshot management

DaytonaEnvironment:
- Create _DaytonaProcessHandle adapter (blocking SDK → ProcessHandle)
- Preserves shell timeout wrapper (SDK timeout unreliable)
- Add _run_bash with heredoc stdin embedding
- Move file sync to _before_execute hook
- Preserve sandbox lifecycle, persistent resume logic

ManagedModal left unchanged (HTTP-based, keeps modal_common.py dep).
42/42 local tests passing. SDK backends untested (require credentials).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 01:31:41 +00:00

460 lines
16 KiB
Python

"""Modal cloud execution environment using the native Modal SDK directly.
Uses ``Sandbox.create()`` + ``Sandbox.exec()`` instead of the older runtime
wrapper, while preserving Hermes' persistent snapshot behavior across sessions.
"""
import asyncio
import json
import logging
import os
import shlex
import threading
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional
from hermes_constants import get_hermes_home
from tools.environments.base import BaseEnvironment
logger = logging.getLogger(__name__)
_SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
_DIRECT_SNAPSHOT_NAMESPACE = "direct"
def _load_snapshots() -> Dict[str, str]:
"""Load snapshot ID mapping from disk."""
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _save_snapshots(data: Dict[str, str]) -> None:
"""Persist snapshot ID mapping to disk."""
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
def _direct_snapshot_key(task_id: str) -> str:
return f"{_DIRECT_SNAPSHOT_NAMESPACE}:{task_id}"
def _get_snapshot_restore_candidate(task_id: str) -> tuple[str | None, bool]:
"""Return a snapshot id and whether it came from the legacy key format."""
snapshots = _load_snapshots()
namespaced_key = _direct_snapshot_key(task_id)
snapshot_id = snapshots.get(namespaced_key)
if isinstance(snapshot_id, str) and snapshot_id:
return snapshot_id, False
legacy_snapshot_id = snapshots.get(task_id)
if isinstance(legacy_snapshot_id, str) and legacy_snapshot_id:
return legacy_snapshot_id, True
return None, False
def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
"""Persist the direct Modal snapshot id under the direct namespace."""
snapshots = _load_snapshots()
snapshots[_direct_snapshot_key(task_id)] = snapshot_id
snapshots.pop(task_id, None)
_save_snapshots(snapshots)
def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> None:
"""Remove direct Modal snapshot entries for a task, including legacy keys."""
snapshots = _load_snapshots()
updated = False
for key in (_direct_snapshot_key(task_id), task_id):
value = snapshots.get(key)
if value is None:
continue
if snapshot_id is None or value == snapshot_id:
snapshots.pop(key, None)
updated = True
if updated:
_save_snapshots(snapshots)
def _resolve_modal_image(image_spec: Any) -> Any:
"""Convert registry references or snapshot ids into Modal image objects."""
import modal as _modal
if not isinstance(image_spec, str):
return image_spec
if image_spec.startswith("im-"):
return _modal.Image.from_id(image_spec)
return _modal.Image.from_registry(
image_spec,
setup_dockerfile_commands=[
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
],
)
class _AsyncWorker:
"""Background thread with its own event loop for async-safe Modal calls."""
def __init__(self):
self._loop: Optional[asyncio.AbstractEventLoop] = None
self._thread: Optional[threading.Thread] = None
self._started = threading.Event()
def start(self):
self._thread = threading.Thread(target=self._run_loop, daemon=True)
self._thread.start()
self._started.wait(timeout=30)
def _run_loop(self):
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
self._started.set()
self._loop.run_forever()
def run_coroutine(self, coro, timeout=600):
if self._loop is None or self._loop.is_closed():
raise RuntimeError("AsyncWorker loop is not running")
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
return future.result(timeout=timeout)
def stop(self):
if self._loop and self._loop.is_running():
self._loop.call_soon_threadsafe(self._loop.stop)
if self._thread:
self._thread.join(timeout=10)
class _ModalProcessHandle:
"""Adapter making Modal's async sandbox.exec look like Popen."""
def __init__(self, worker, sandbox, cmd_string, timeout):
self._done = threading.Event()
self._returncode = None
self._read_fd, self._write_fd = os.pipe()
self.stdout = os.fdopen(self._read_fd, "r")
self.stdin = None
def _run():
try:
async def _exec():
process = await sandbox.exec.aio(
"bash", "-c", cmd_string, timeout=timeout,
)
stdout = await process.stdout.read.aio()
stderr = await process.stderr.read.aio()
exit_code = await process.wait.aio()
return stdout, stderr, exit_code
stdout, stderr, exit_code = worker.run_coroutine(
_exec(), timeout=timeout + 30,
)
writer = os.fdopen(self._write_fd, "w")
if stdout:
writer.write(
stdout if isinstance(stdout, str)
else stdout.decode("utf-8", errors="replace")
)
if stderr:
writer.write(
stderr if isinstance(stderr, str)
else stderr.decode("utf-8", errors="replace")
)
writer.close()
self._returncode = exit_code
except Exception as e:
try:
writer = os.fdopen(self._write_fd, "w")
writer.write(f"Modal execution error: {e}")
writer.close()
except Exception:
try:
os.close(self._write_fd)
except Exception:
pass
self._returncode = 1
finally:
self._done.set()
self._thread = threading.Thread(target=_run, daemon=True)
self._thread.start()
def poll(self):
return self._returncode if self._done.is_set() else None
def kill(self):
pass # Handled by sandbox termination in environment
def wait(self, timeout=None):
self._done.wait(timeout=timeout)
return self._returncode
@property
def returncode(self):
return self._returncode
class ModalEnvironment(BaseEnvironment):
"""Modal cloud execution via native Modal sandboxes.
Uses the unified spawn-per-call model: _run_bash() returns a
_ModalProcessHandle that wraps Modal's async SDK in a thread + OS pipe,
satisfying the ProcessHandle protocol for BaseEnvironment._wait_for_process().
"""
def __init__(
self,
image: str,
cwd: str = "/root",
timeout: int = 60,
modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
persistent_filesystem: bool = True,
task_id: str = "default",
):
super().__init__(cwd=cwd, timeout=timeout)
self._persistent = persistent_filesystem
self._task_id = task_id
self._base_image = image
self._sandbox = None
self._app = None
self._worker = _AsyncWorker()
self._synced_files: Dict[str, tuple] = {}
sandbox_kwargs = dict(modal_sandbox_kwargs or {})
restored_snapshot_id = None
restored_from_legacy_key = False
if self._persistent:
restored_snapshot_id, restored_from_legacy_key = _get_snapshot_restore_candidate(
self._task_id
)
if restored_snapshot_id:
logger.info("Modal: restoring from snapshot %s", restored_snapshot_id[:20])
import modal as _modal
cred_mounts = []
try:
from tools.credential_files import get_credential_file_mounts, iter_skills_files
for mount_entry in get_credential_file_mounts():
cred_mounts.append(
_modal.Mount.from_local_file(
mount_entry["host_path"],
remote_path=mount_entry["container_path"],
)
)
logger.info(
"Modal: mounting credential %s -> %s",
mount_entry["host_path"],
mount_entry["container_path"],
)
# Mount individual skill files (symlinks filtered out).
skills_files = iter_skills_files()
for entry in skills_files:
cred_mounts.append(
_modal.Mount.from_local_file(
entry["host_path"],
remote_path=entry["container_path"],
)
)
if skills_files:
logger.info("Modal: mounting %d skill files", len(skills_files))
except Exception as e:
logger.debug("Modal: could not load credential file mounts: %s", e)
self._worker.start()
async def _create_sandbox(image_spec: Any):
app = await _modal.App.lookup.aio("hermes-agent", create_if_missing=True)
create_kwargs = dict(sandbox_kwargs)
if cred_mounts:
existing_mounts = list(create_kwargs.pop("mounts", []))
existing_mounts.extend(cred_mounts)
create_kwargs["mounts"] = existing_mounts
sandbox = await _modal.Sandbox.create.aio(
"sleep",
"infinity",
image=image_spec,
app=app,
timeout=int(create_kwargs.pop("timeout", 3600)),
**create_kwargs,
)
return app, sandbox
try:
target_image_spec = restored_snapshot_id or image
try:
# _resolve_modal_image keeps the Modal bootstrap fix together:
# it applies setup_dockerfile_commands with ensurepip before
# Modal builds registry images, while snapshot ids restore via
# modal.Image.from_id() without rebuilding.
effective_image = _resolve_modal_image(target_image_spec)
self._app, self._sandbox = self._worker.run_coroutine(
_create_sandbox(effective_image),
timeout=300,
)
except Exception as exc:
if not restored_snapshot_id:
raise
logger.warning(
"Modal: failed to restore snapshot %s, retrying with base image: %s",
restored_snapshot_id[:20],
exc,
)
_delete_direct_snapshot(self._task_id, restored_snapshot_id)
base_image = _resolve_modal_image(image)
self._app, self._sandbox = self._worker.run_coroutine(
_create_sandbox(base_image),
timeout=300,
)
else:
if restored_snapshot_id and restored_from_legacy_key:
_store_direct_snapshot(self._task_id, restored_snapshot_id)
logger.info(
"Modal: migrated legacy snapshot entry for task %s",
self._task_id,
)
except Exception:
self._worker.stop()
raise
logger.info("Modal: sandbox created (task=%s)", self._task_id)
# Capture login-shell environment into a snapshot for the unified model
self.init_session()
# ------------------------------------------------------------------
# File sync
# ------------------------------------------------------------------
def _push_file_to_sandbox(self, host_path: str, container_path: str) -> bool:
"""Push a single file into the sandbox if changed. Returns True if synced."""
hp = Path(host_path)
try:
stat = hp.stat()
file_key = (stat.st_mtime, stat.st_size)
except OSError:
return False
if self._synced_files.get(container_path) == file_key:
return False
try:
content = hp.read_bytes()
except Exception:
return False
import base64
b64 = base64.b64encode(content).decode("ascii")
container_dir = str(Path(container_path).parent)
cmd = (
f"mkdir -p {shlex.quote(container_dir)} && "
f"echo {shlex.quote(b64)} | base64 -d > {shlex.quote(container_path)}"
)
async def _write():
proc = await self._sandbox.exec.aio("bash", "-c", cmd)
await proc.wait.aio()
self._worker.run_coroutine(_write(), timeout=15)
self._synced_files[container_path] = file_key
return True
def _sync_files(self) -> None:
"""Push credential files and skill files into the running sandbox.
Runs before each command. Uses mtime+size caching so only changed
files are pushed (~13us overhead in the no-op case).
"""
try:
from tools.credential_files import get_credential_file_mounts, iter_skills_files
for entry in get_credential_file_mounts():
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
logger.debug("Modal: synced credential %s", entry["container_path"])
for entry in iter_skills_files():
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
logger.debug("Modal: synced skill file %s", entry["container_path"])
except Exception as e:
logger.debug("Modal: file sync failed: %s", e)
# ------------------------------------------------------------------
# Unified execution hooks
# ------------------------------------------------------------------
def _before_execute(self) -> None:
self._sync_files()
def _run_bash(self, cmd_string: str, *, stdin_data: str | None = None):
"""Spawn ``bash -c <cmd_string>`` inside the Modal sandbox.
Returns a _ModalProcessHandle (satisfies the ProcessHandle protocol).
stdin_data is embedded as a heredoc since Modal cannot pipe stdin.
"""
if stdin_data is not None:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
while marker in stdin_data:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
cmd_string = f"{cmd_string} << '{marker}'\n{stdin_data}\n{marker}"
return _ModalProcessHandle(
self._worker, self._sandbox, cmd_string, self.timeout,
)
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
if self._sandbox is None:
return
if self._persistent:
try:
async def _snapshot():
img = await self._sandbox.snapshot_filesystem.aio()
return img.object_id
try:
snapshot_id = self._worker.run_coroutine(_snapshot(), timeout=60)
except Exception:
snapshot_id = None
if snapshot_id:
_store_direct_snapshot(self._task_id, snapshot_id)
logger.info(
"Modal: saved filesystem snapshot %s for task %s",
snapshot_id[:20],
self._task_id,
)
except Exception as e:
logger.warning("Modal: filesystem snapshot failed: %s", e)
try:
self._worker.run_coroutine(
self._sandbox.terminate.aio(),
timeout=15,
)
except Exception:
pass
finally:
self._worker.stop()
self._sandbox = None
self._app = None