# hermes-agent/tools/environments/modal.py

"""Modal cloud execution environment using the native Modal SDK directly.

Uses ``Sandbox.create()`` + ``Sandbox.exec()`` instead of the older runtime
wrapper, while preserving Hermes' persistent snapshot behavior across sessions.
"""

import asyncio
import json
import logging
import re
import shlex
import threading
from pathlib import Path
from typing import Any, Dict, Optional

from hermes_constants import get_hermes_home
from tools.environments.base import BaseEnvironment

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# JSON file inside the Hermes home directory that maps snapshot keys to
# Modal snapshot ids (read by _load_snapshots, written by _save_snapshots).
_SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
# Prefix that _direct_snapshot_key() puts in front of a task id, so entries
# written by this module are distinguishable from legacy bare-task-id keys.
_DIRECT_SNAPSHOT_NAMESPACE = "direct"

# Matches official Python Docker Hub tags: python:3.12, python:3.12-slim,
# python:3.12.3-bookworm, etc.  Rejects alpine (not Debian-based) and
# ambiguous tags like python:3 or python:latest.
# Group 1 captures the "major.minor" version; an optional patch component
# is matched but not captured, and group 2 holds the optional variant suffix.
_PYTHON_IMAGE_RE = re.compile(
    r"^python:(\d+\.\d+)(?:\.\d+)?(?:-(slim|bookworm|bullseye|slim-bookworm|slim-bullseye))?$"
)


def _load_snapshots() -> Dict[str, str]:
    """Load the snapshot ID mapping from disk.

    Returns:
        The JSON object stored in ``_SNAPSHOT_STORE``.  An empty dict is
        returned when the file is missing, cannot be read, is not valid
        JSON, or holds something other than a JSON object.  Every case
        except a missing file is logged, so a corrupted store no longer
        disappears silently.
    """
    try:
        data = json.loads(_SNAPSHOT_STORE.read_text())
    except FileNotFoundError:
        # First run / nothing persisted yet: not worth a log line.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(
            "Modal: could not load snapshot store %s: %s", _SNAPSHOT_STORE, exc
        )
        return {}

    if not isinstance(data, dict):
        # Callers rely on .get()/.pop(); a list or scalar would crash them.
        logger.warning(
            "Modal: ignoring snapshot store %s: expected a JSON object, got %s",
            _SNAPSHOT_STORE,
            type(data).__name__,
        )
        return {}
    return data


def _save_snapshots(data: Dict[str, str]) -> None:
    """Persist the snapshot ID mapping to disk atomically.

    The JSON is written to a sibling temporary file and then renamed over
    ``_SNAPSHOT_STORE``, so an interrupted write cannot leave a truncated
    store behind (which the loader would treat as "no snapshots").

    Args:
        data: Mapping of snapshot keys to Modal snapshot ids.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written or renamed.
    """
    import os

    _SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2)

    # The pid suffix keeps concurrent processes from sharing a temp file.
    tmp_path = _SNAPSHOT_STORE.with_name(
        f"{_SNAPSHOT_STORE.name}.{os.getpid()}.tmp"
    )
    try:
        tmp_path.write_text(payload)
        tmp_path.replace(_SNAPSHOT_STORE)
    except BaseException:
        # Best-effort removal of the partial temp file; the original error
        # is what the caller needs to see.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        raise


def _direct_snapshot_key(task_id: str) -> str:
    """Return the snapshot-store key for *task_id* in the direct namespace."""
    return "{}:{}".format(_DIRECT_SNAPSHOT_NAMESPACE, task_id)


def _get_snapshot_restore_candidate(task_id: str) -> tuple[str | None, bool]:
    """Look up the snapshot to restore for *task_id*.

    The namespaced key is preferred; the bare task id (legacy format) is the
    fallback.  Only non-empty string values count as a usable snapshot id.

    Returns:
        ``(snapshot_id, from_legacy_key)``, or ``(None, False)`` when no
        usable entry exists.
    """
    store = _load_snapshots()

    lookups = (
        (_direct_snapshot_key(task_id), False),
        (task_id, True),
    )
    for key, is_legacy in lookups:
        candidate = store.get(key)
        if isinstance(candidate, str) and candidate:
            return candidate, is_legacy

    return None, False


def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
    """Record *snapshot_id* for *task_id* under the direct namespace.

    Any entry stored under the legacy bare-task-id key is dropped in the
    same write.
    """
    store = _load_snapshots()
    namespaced_key = _direct_snapshot_key(task_id)
    store[namespaced_key] = snapshot_id
    if task_id in store:
        del store[task_id]
    _save_snapshots(store)


def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> None:
    """Drop the stored snapshot entries for *task_id* (namespaced and legacy).

    When *snapshot_id* is given, only entries whose value equals it are
    removed.  The store is rewritten only if something was actually removed.
    """
    store = _load_snapshots()

    def _is_stale(key: str) -> bool:
        stored = store.get(key)
        if stored is None:
            return False
        return snapshot_id is None or stored == snapshot_id

    stale_keys = [
        key for key in (_direct_snapshot_key(task_id), task_id) if _is_stale(key)
    ]
    if not stale_keys:
        return

    for key in stale_keys:
        store.pop(key, None)
    _save_snapshots(store)


def _resolve_modal_image(image_spec: Any) -> Any:
    """Convert registry references or snapshot ids into Modal image objects.

    Official ``python:X.Y*`` images are mapped to
    ``modal.Image.debian_slim(python_version=X.Y)`` because the stock
    Docker Hub images lack build tools (gcc/g++) required by Modal's
    internal pip bootstrap.

    Args:
        image_spec: One of
            * a non-string object, returned unchanged (assumed to already
              be a Modal image);
            * a string starting with ``"im-"``, treated as a Modal image /
              snapshot id and loaded with ``Image.from_id``;
            * a ``python:X.Y[...]`` tag matching ``_PYTHON_IMAGE_RE``;
            * any other string, treated as a registry reference.

    Returns:
        The resolved Modal image object (or *image_spec* itself when it is
        not a string).
    """
    if not isinstance(image_spec, str):
        # Pass-through needs nothing from the SDK, so don't import it.
        return image_spec

    import modal as _modal

    if image_spec.startswith("im-"):
        return _modal.Image.from_id(image_spec)

    m = _PYTHON_IMAGE_RE.match(image_spec)
    if m:
        # group(1) is the "major.minor" version captured by the regex.
        return _modal.Image.debian_slim(python_version=m.group(1))

    # Other registry images: drop any bundled pip and re-bootstrap it with
    # ensurepip; failures are tolerated ("|| true").
    return _modal.Image.from_registry(
        image_spec,
        setup_dockerfile_commands=[
            "RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
            "python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
        ],
    )


class _AsyncWorker:
    """Background thread with its own event loop for async-safe Modal calls.

    Synchronous callers hand coroutines to :meth:`run_coroutine`, which
    schedules them on the worker's loop and blocks for the result.
    """

    def __init__(self):
        # Loop owned by the worker thread; None until start() has run.
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._thread: Optional[threading.Thread] = None
        # Set by the worker thread once self._loop has been assigned.
        self._started = threading.Event()

    def start(self):
        """Start the worker thread and wait until its event loop exists.

        Calling this while the worker is already running is a no-op.

        Raises:
            RuntimeError: If the loop is not available within 30 seconds.
        """
        if self._thread is not None and self._thread.is_alive():
            return
        self._started.clear()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        if not self._started.wait(timeout=30):
            raise RuntimeError("AsyncWorker loop failed to start")

    def _run_loop(self):
        """Thread target: create the loop, run it until stopped, close it."""
        loop = asyncio.new_event_loop()
        self._loop = loop
        asyncio.set_event_loop(loop)
        self._started.set()
        try:
            loop.run_forever()
        finally:
            # Release the loop's selector / self-pipe resources; this also
            # makes run_coroutine() reject work submitted after stop().
            loop.close()

    def run_coroutine(self, coro, timeout=600):
        """Run *coro* on the worker loop and block for its result.

        Args:
            coro: Coroutine object to execute.
            timeout: Maximum number of seconds to wait for the result.

        Returns:
            Whatever the coroutine returns.

        Raises:
            RuntimeError: If the worker loop is not running.
            concurrent.futures.TimeoutError: If the result is not available
                within *timeout*; the coroutine is cancelled in that case.
            Exception: Any exception raised by the coroutine itself.
        """
        import concurrent.futures

        loop = self._loop
        if loop is None or loop.is_closed():
            # Close the coroutine so it does not trigger a
            # "coroutine ... was never awaited" warning.
            close = getattr(coro, "close", None)
            if callable(close):
                close()
            raise RuntimeError("AsyncWorker loop is not running")

        future = asyncio.run_coroutine_threadsafe(coro, loop)
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            # Don't leave the abandoned coroutine running on the loop.
            # cancel() is a no-op if the coroutine had already finished.
            future.cancel()
            raise

    def stop(self):
        """Stop the event loop and wait up to 10 seconds for the thread."""
        loop = self._loop
        if loop is not None and not loop.is_closed():
            try:
                # Also valid before run_forever() has begun: the loop then
                # exits after its first iteration.
                loop.call_soon_threadsafe(loop.stop)
            except RuntimeError:
                # The worker thread closed the loop in the meantime.
                pass
        if self._thread:
            self._thread.join(timeout=10)


class ModalEnvironment(BaseEnvironment):
    """Modal cloud execution via native Modal sandboxes.

    Uses the unified spawn-per-call model: _run_bash() returns a
    _ThreadedProcessHandle that wraps Modal's async SDK in a thread + OS pipe,
    satisfying the ProcessHandle protocol for BaseEnvironment._wait_for_process().

    All Modal coroutines are executed on a private ``_AsyncWorker`` loop.
    """

    _snapshot_timeout = 60  # Modal sandbox cold start can be slow

    def __init__(
        self,
        image: str,
        cwd: str = "/root",
        timeout: int = 60,
        modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
        persistent_filesystem: bool = True,
        task_id: str = "default",
    ):
        """Create (or restore) the Modal sandbox for this environment.

        Args:
            image: Base image spec, resolved via ``_resolve_modal_image``.
            cwd: Working directory, forwarded to ``BaseEnvironment``.
            timeout: Default command timeout, forwarded to ``BaseEnvironment``.
            modal_sandbox_kwargs: Extra keyword arguments for
                ``Sandbox.create``.  ``"mounts"`` is merged with the
                credential/skill mounts; ``"timeout"`` (default 3600) is
                passed as the sandbox timeout.
            persistent_filesystem: When true, restore from a previously
                stored snapshot for *task_id* and save a new snapshot in
                :meth:`cleanup`.
            task_id: Key under which snapshots are stored.

        Raises:
            Exception: Whatever sandbox creation raises; the async worker
                is stopped before the exception propagates.
        """
        super().__init__(cwd=cwd, timeout=timeout)

        self._persistent = persistent_filesystem
        self._task_id = task_id
        self._base_image = image
        self._sandbox = None
        self._app = None
        self._worker = _AsyncWorker()
        # container_path -> (mtime, size) of the last version pushed.
        self._synced_files: Dict[str, tuple] = {}

        sandbox_kwargs = dict(modal_sandbox_kwargs or {})

        restored_snapshot_id = None
        restored_from_legacy_key = False
        if self._persistent:
            restored_snapshot_id, restored_from_legacy_key = _get_snapshot_restore_candidate(
                self._task_id
            )
            if restored_snapshot_id:
                logger.info("Modal: restoring from snapshot %s", restored_snapshot_id[:20])

        import modal as _modal

        # Best-effort: any failure while building mounts is logged at debug
        # level and the sandbox is created without them.
        cred_mounts = []
        try:
            from tools.credential_files import get_credential_file_mounts, iter_skills_files

            for mount_entry in get_credential_file_mounts():
                cred_mounts.append(
                    _modal.Mount.from_local_file(
                        mount_entry["host_path"],
                        remote_path=mount_entry["container_path"],
                    )
                )
                logger.info(
                    "Modal: mounting credential %s -> %s",
                    mount_entry["host_path"],
                    mount_entry["container_path"],
                )

            # Mount individual skill files (symlinks filtered out).
            skills_files = iter_skills_files()
            for entry in skills_files:
                cred_mounts.append(
                    _modal.Mount.from_local_file(
                        entry["host_path"],
                        remote_path=entry["container_path"],
                    )
                )
            if skills_files:
                logger.info("Modal: mounting %d skill files", len(skills_files))
        except Exception as e:
            logger.debug("Modal: could not load credential file mounts: %s", e)

        self._worker.start()

        async def _create_sandbox(image_spec: Any):
            """Look up the app and start a long-lived ``sleep infinity`` sandbox."""
            app = await _modal.App.lookup.aio("hermes-agent", create_if_missing=True)
            # Copy so that a retry sees the caller's kwargs unmodified.
            create_kwargs = dict(sandbox_kwargs)
            if cred_mounts:
                existing_mounts = list(create_kwargs.pop("mounts", []))
                existing_mounts.extend(cred_mounts)
                create_kwargs["mounts"] = existing_mounts
            sandbox = await _modal.Sandbox.create.aio(
                "sleep",
                "infinity",
                image=image_spec,
                app=app,
                timeout=int(create_kwargs.pop("timeout", 3600)),
                **create_kwargs,
            )
            return app, sandbox

        try:
            target_image_spec = restored_snapshot_id or image
            try:
                # _resolve_modal_image routes python:X.Y images to
                # debian_slim (avoiding build-tool failures) and applies
                # setup_dockerfile_commands with ensurepip for other
                # registry images.  Snapshot ids restore via from_id().
                effective_image = _resolve_modal_image(target_image_spec)
                self._app, self._sandbox = self._worker.run_coroutine(
                    _create_sandbox(effective_image),
                    timeout=300,
                )
            except Exception as exc:
                if not restored_snapshot_id:
                    raise

                # The snapshot could not be restored: forget it and start
                # over from the base image.
                logger.warning(
                    "Modal: failed to restore snapshot %s, retrying with base image: %s",
                    restored_snapshot_id[:20],
                    exc,
                )
                _delete_direct_snapshot(self._task_id, restored_snapshot_id)
                base_image = _resolve_modal_image(image)
                self._app, self._sandbox = self._worker.run_coroutine(
                    _create_sandbox(base_image),
                    timeout=300,
                )
            else:
                if restored_snapshot_id and restored_from_legacy_key:
                    # Restore succeeded from a legacy entry: rewrite it
                    # under the namespaced key.
                    _store_direct_snapshot(self._task_id, restored_snapshot_id)
                    logger.info(
                        "Modal: migrated legacy snapshot entry for task %s",
                        self._task_id,
                    )
        except Exception:
            self._worker.stop()
            raise

        logger.info("Modal: sandbox created (task=%s)", self._task_id)

        # Capture login-shell environment into a snapshot for the unified model
        self.init_session()

    # ------------------------------------------------------------------
    # File sync
    # ------------------------------------------------------------------

    def _push_file_to_sandbox(self, host_path: str, container_path: str) -> bool:
        """Push a single file into the sandbox if changed.

        The file is considered unchanged when its ``(mtime, size)`` matches
        the value cached from the last successful push to *container_path*.

        Args:
            host_path: Path of the file on the host.
            container_path: Destination path inside the sandbox.

        Returns:
            True if the file was written into the sandbox; False if it was
            unchanged, could not be read, or the remote write exited with a
            non-zero status (so the next sync retries it).
        """
        hp = Path(host_path)
        try:
            stat = hp.stat()
            file_key = (stat.st_mtime, stat.st_size)
        except OSError:
            return False

        if self._synced_files.get(container_path) == file_key:
            return False

        try:
            content = hp.read_bytes()
        except Exception:
            return False

        import base64
        b64 = base64.b64encode(content).decode("ascii")
        container_dir = str(Path(container_path).parent)
        # NOTE(review): the payload travels inline on the command line, so
        # very large files may exceed the argument size limit — confirm.
        cmd = (
            f"mkdir -p {shlex.quote(container_dir)} && "
            f"echo {shlex.quote(b64)} | base64 -d > {shlex.quote(container_path)}"
        )

        async def _write():
            proc = await self._sandbox.exec.aio("bash", "-c", cmd)
            return await proc.wait.aio()

        exit_code = self._worker.run_coroutine(_write(), timeout=15)
        if exit_code:
            # Do not cache a failed write, otherwise it is never retried.
            logger.debug(
                "Modal: writing %s failed with exit code %s",
                container_path,
                exit_code,
            )
            return False

        self._synced_files[container_path] = file_key
        return True

    def _sync_files(self) -> None:
        """Push credential files and skill files into the running sandbox.

        Runs before each command. Uses mtime+size caching so only changed
        files are pushed (~13us overhead in the no-op case).

        Best-effort: any exception aborts the remaining pushes and is
        logged at debug level.
        """
        try:
            from tools.credential_files import get_credential_file_mounts, iter_skills_files

            for entry in get_credential_file_mounts():
                if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
                    logger.debug("Modal: synced credential %s", entry["container_path"])

            for entry in iter_skills_files():
                if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
                    logger.debug("Modal: synced skill file %s", entry["container_path"])
        except Exception as e:
            logger.debug("Modal: file sync failed: %s", e)

    # ------------------------------------------------------------------
    # Unified execution hooks
    # ------------------------------------------------------------------

    def _before_execute(self) -> None:
        """Sync credential and skill files into the sandbox."""
        self._sync_files()

    def _run_bash(self, cmd_string: str, *, timeout: int | None = None,
                  stdin_data: str | None = None):
        """Spawn ``bash -c <cmd_string>`` inside the Modal sandbox.

        Args:
            cmd_string: Shell command to run.
            timeout: Command timeout in seconds; a falsy value falls back
                to ``self.timeout``.
            stdin_data: Optional stdin payload, embedded into the command
                via ``_embed_stdin_heredoc``.

        Returns:
            The process handle produced by :meth:`_modal_exec`.
        """
        effective_timeout = timeout or self.timeout
        if stdin_data is not None:
            cmd_string = self._embed_stdin_heredoc(cmd_string, stdin_data)
        return self._modal_exec(cmd_string, effective_timeout, login=False)

    def _run_bash_login(self, cmd_string: str, *,
                        timeout: int | None = None):
        """Spawn ``bash -l -c <cmd_string>`` for snapshot creation.

        A falsy *timeout* falls back to ``self._snapshot_timeout``.
        """
        effective_timeout = timeout or self._snapshot_timeout
        return self._modal_exec(cmd_string, effective_timeout, login=True)

    def _modal_exec(self, cmd_string: str, timeout: int, login: bool):
        """Create a _ThreadedProcessHandle wrapping Modal's async exec.

        Args:
            cmd_string: Shell command passed to ``bash -c``.
            timeout: Timeout in seconds given to the sandbox exec; the wait
                on the worker allows 30 seconds more.
            login: When true, run bash as a login shell (``-l``).

        Returns:
            A ``_ThreadedProcessHandle`` whose function yields
            ``(output, exit_code)``, where *output* is stdout followed by
            stderr.
        """
        from tools.environments.base import _ThreadedProcessHandle
        worker = self._worker
        sandbox = self._sandbox

        def _as_text(chunk) -> str:
            """Return *chunk* as text; bytes are decoded leniently as UTF-8."""
            if chunk is None:
                return ""
            if isinstance(chunk, str):
                return chunk
            return chunk.decode("utf-8", errors="replace")

        def exec_fn():
            async def _exec():
                args = ["bash", "-l", "-c", cmd_string] if login else ["bash", "-c", cmd_string]
                process = await sandbox.exec.aio(*args, timeout=timeout)
                stdout = await process.stdout.read.aio()
                stderr = await process.stderr.read.aio()
                exit_code = await process.wait.aio()
                return stdout, stderr, exit_code

            stdout, stderr, exit_code = worker.run_coroutine(
                _exec(), timeout=timeout + 30,
            )
            output = _as_text(stdout)
            if stderr:
                output += _as_text(stderr)
            return output, exit_code

        return _ThreadedProcessHandle(exec_fn)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def cleanup(self):
        """Snapshot the filesystem (if persistent) then stop the sandbox.

        Snapshot and terminate failures never propagate: a snapshot failure
        is logged as a warning, a terminate failure is ignored.  The worker
        is always stopped and the sandbox reference cleared, so a second
        call is a no-op.
        """
        if self._sandbox is None:
            return

        if self._persistent:
            async def _snapshot():
                img = await self._sandbox.snapshot_filesystem.aio()
                return img.object_id

            try:
                snapshot_id = self._worker.run_coroutine(_snapshot(), timeout=60)
                if snapshot_id:
                    _store_direct_snapshot(self._task_id, snapshot_id)
                    logger.info(
                        "Modal: saved filesystem snapshot %s for task %s",
                        snapshot_id[:20],
                        self._task_id,
                    )
            except Exception as e:
                # Covers both taking the snapshot and recording its id.
                logger.warning("Modal: filesystem snapshot failed: %s", e)

        try:
            self._worker.run_coroutine(
                self._sandbox.terminate.aio(),
                timeout=15,
            )
        except Exception:
            # Best-effort: the sandbox has its own timeout as a backstop.
            pass
        finally:
            self._worker.stop()
            self._sandbox = None
            self._app = None