diff --git a/README.md b/README.md index bab19f1a6a..d38ac51aa8 100644 --- a/README.md +++ b/README.md @@ -248,8 +248,12 @@ This backend is under active development and must be explicitly enabled. uv pip install "hermes-agent[nomad]" # Start Nomad (dev mode) +# Docker driver: nomad agent -dev -config=nomad-dev.hcl +# Apptainer/raw_exec driver (draft): +# nomad agent -dev -config=nomad-singularity.hcl + # Build sandbox-server image docker build -t hermes-sandbox:local -f tools/sandbox/Dockerfile . @@ -258,9 +262,14 @@ export TERMINAL_ENV=nomad export TERMINAL_NOMAD_ADDRESS=http://localhost:4646 export TERMINAL_NOMAD_JOB_ID=hermes-sandbox export TERMINAL_NOMAD_IMAGE=hermes-sandbox:local +export TERMINAL_NOMAD_DRIVER=docker # or: raw_exec (Apptainer draft) export TERMINAL_NOMAD_SLOTS=10 export TERMINAL_NOMAD_MIN=1 export TERMINAL_NOMAD_MAX=10 + +# raw_exec only: +# export TERMINAL_NOMAD_APPTAINER_IMAGE=/path/to/hermes-sandbox.sif +# export TERMINAL_NOMAD_RAW_EXEC_PORT=8080 ``` **Sudo Support:** If a command needs sudo, you'll be prompted for your password (cached for the session). Or set `SUDO_PASSWORD` in `~/.hermes/.env`. diff --git a/nomad-singularity.hcl b/nomad-singularity.hcl new file mode 100644 index 0000000000..97150de4db --- /dev/null +++ b/nomad-singularity.hcl @@ -0,0 +1,25 @@ +# Nomad Configuration for Singularity/Apptainer Sandbox +# Run with: nomad agent -dev -config=nomad-singularity.hcl +# +# This enables the raw_exec driver, which can be used to run Apptainer +# commands on hosts where Docker is unavailable. +# +# NOTE: Hermes-Agent's Nomad backend support is draft; this file is provided +# as a starting point for local testing. + +client { + enabled = true + + options { + "driver.raw_exec.enable" = "1" + } +} + +plugin "raw_exec" { + config { + enabled = true + } +} + +# If you have a dedicated Nomad Singularity/Apptainer driver plugin installed, +# you can configure that instead of raw_exec. diff --git a/tools/nomad_slotpool.py b/tools/nomad_slotpool.py index 75581cf9dd..d8070874ef 100644 --- a/tools/nomad_slotpool.py +++ b/tools/nomad_slotpool.py @@ -173,8 +173,62 @@ def create_sandbox_job( memory: int, port: int = 8080, datacenter: str = "dc1", + driver: str = "docker", ) -> Dict[str, Any]: - """Create a basic sandbox-server Nomad job spec (docker driver).""" + """Create a sandbox-server Nomad job spec. + + driver: + - docker: runs tools/sandbox_server.py inside a Docker container image. + - raw_exec: runs Apptainer/Singularity via raw_exec on the host. + + raw_exec is provided as a draft option for HPC-like hosts without Docker. + """ + + if driver == "raw_exec": + # For raw_exec, we bind to a fixed port. This is intended for dev/testing + # on a single host. Scaling to multiple allocations requires more robust + # service discovery. + apptainer_image = os.getenv("TERMINAL_NOMAD_APPTAINER_IMAGE", "") + if not apptainer_image: + raise RuntimeError( + "TERMINAL_NOMAD_APPTAINER_IMAGE is required for raw_exec driver" + ) + + cmd = ( + "apptainer exec " + "--bind \"$NOMAD_ALLOC_DIR/data:/data\" " + f"{apptainer_image} " + f"python /sandbox_server.py --port {port} --slots {slots_per_container} --data-dir /data" + ) + + return { + "ID": job_id, + "Name": job_id, + "Type": "service", + "Datacenters": [datacenter], + "TaskGroups": [ + { + "Name": "sandbox", + "Count": count, + "Update": {"HealthCheck": "task_states", "MinHealthyTime": 0}, + "Networks": [{"Mode": "host"}], + "Tasks": [ + { + "Name": "sandbox-server", + "Driver": "raw_exec", + "Config": { + "command": "bash", + "args": ["-lc", cmd], + }, + "Env": {"PYTHONUNBUFFERED": "1", "NOMAD_ALLOC_DIR": "${NOMAD_ALLOC_DIR}"}, + "Resources": {"CPU": cpu, "MemoryMB": memory}, + } + ], + } + ], + } + + # Default: docker return { "ID": job_id, "Name": job_id, @@ -328,6 +382,10 @@ class SlotPoolConfig: job_id: str = "hermes-sandbox" datacenter: str = "dc1" image: str = "hermes-sandbox:local" + + # Driver selection: docker (default) or raw_exec (for Apptainer/Singularity) + driver: str = "docker" # docker | raw_exec + slots_per_container: int = 10 privileged: bool = False cpu: int = 500 @@ -336,6 +394,9 @@ class SlotPoolConfig: max_containers: int = 10 acquire_timeout: float = 30.0 + # raw_exec: fixed port (dynamic ports are harder to discover reliably) + raw_exec_port: int = 8080 + class SlotPool: def __init__(self, cfg: SlotPoolConfig): @@ -365,6 +426,8 @@ class SlotPool: cpu=self.cfg.cpu, memory=self.cfg.memory, datacenter=self.cfg.datacenter, + driver=self.cfg.driver, + port=self.cfg.raw_exec_port, ) res = await self.nomad.submit_job(spec) if "error" in res: @@ -387,16 +450,29 @@ class SlotPool: net = (detail.get("Resources") or {}).get("Networks") or [] address = None port = None - if net and isinstance(net, list): - n0 = net[0] - address = n0.get("IP") - ports = n0.get("DynamicPorts") or [] - for p in ports: - if p.get("Label") == "http": - port = p.get("Value") - if not address or not port: - # Fall back: allocation has an Address field - address = detail.get("NodeName") or detail.get("NodeID") + + if self.cfg.driver == "raw_exec": + # raw_exec: use fixed port and best-effort node address discovery + port = self.cfg.raw_exec_port + address = ( + detail.get("NodeAddress") + or detail.get("NodeName") + or detail.get("NodeID") + ) + else: + # docker: discover dynamic port mapping + if net and isinstance(net, list): + n0 = net[0] + address = n0.get("IP") + ports = n0.get("DynamicPorts") or [] + for p in ports: + if p.get("Label") == "http": + port = p.get("Value") + + if not address or not port: + # Fall back: allocation node identity + address = detail.get("NodeAddress") or detail.get("NodeName") or detail.get("NodeID") + if not address or not port: # Can't use this alloc continue diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index b775f9948f..2774061732 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -1273,6 +1273,8 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int, task_ nomad_address=os.getenv("TERMINAL_NOMAD_ADDRESS", "http://localhost:4646"), job_id=os.getenv("TERMINAL_NOMAD_JOB_ID", "hermes-sandbox"), image=os.getenv("TERMINAL_NOMAD_IMAGE", "hermes-sandbox:local"), + driver=os.getenv("TERMINAL_NOMAD_DRIVER", "docker"), + raw_exec_port=int(os.getenv("TERMINAL_NOMAD_RAW_EXEC_PORT", "8080")), slots_per_container=int(os.getenv("TERMINAL_NOMAD_SLOTS", "10")), min_containers=int(os.getenv("TERMINAL_NOMAD_MIN", "1")), max_containers=int(os.getenv("TERMINAL_NOMAD_MAX", "10")),