mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
Nomad backend: draft raw_exec/Apptainer driver support + docs
This commit is contained in:
@@ -248,8 +248,12 @@ This backend is under active development and must be explicitly enabled.
|
||||
uv pip install "hermes-agent[nomad]"
|
||||
|
||||
# Start Nomad (dev mode)
|
||||
# Docker driver:
|
||||
nomad agent -dev -config=nomad-dev.hcl
|
||||
|
||||
# Apptainer/raw_exec driver (draft):
|
||||
# nomad agent -dev -config=nomad-singularity.hcl
|
||||
|
||||
# Build sandbox-server image
|
||||
docker build -t hermes-sandbox:local -f tools/sandbox/Dockerfile .
|
||||
|
||||
@@ -258,9 +262,14 @@ export TERMINAL_ENV=nomad
|
||||
export TERMINAL_NOMAD_ADDRESS=http://localhost:4646
|
||||
export TERMINAL_NOMAD_JOB_ID=hermes-sandbox
|
||||
export TERMINAL_NOMAD_IMAGE=hermes-sandbox:local
|
||||
export TERMINAL_NOMAD_DRIVER=docker # or: raw_exec (Apptainer draft)
|
||||
export TERMINAL_NOMAD_SLOTS=10
|
||||
export TERMINAL_NOMAD_MIN=1
|
||||
export TERMINAL_NOMAD_MAX=10
|
||||
|
||||
# raw_exec only:
|
||||
# export TERMINAL_NOMAD_APPTAINER_IMAGE=/path/to/hermes-sandbox.sif
|
||||
# export TERMINAL_NOMAD_RAW_EXEC_PORT=8080
|
||||
```
|
||||
|
||||
**Sudo Support:** If a command needs sudo, you'll be prompted for your password (cached for the session). Or set `SUDO_PASSWORD` in `~/.hermes/.env`.
|
||||
|
||||
25
nomad-singularity.hcl
Normal file
@@ -0,0 +1,25 @@
|
||||
# Nomad Configuration for Singularity/Apptainer Sandbox
# Run with: nomad agent -dev -config=nomad-singularity.hcl
#
# This enables the raw_exec driver, which can be used to run Apptainer
# commands on hosts where Docker is unavailable.
#
# NOTE: Hermes-Agent's Nomad backend support is draft; this file is provided
# as a starting point for local testing.

client {
  enabled = true

  # Legacy client option kept alongside the plugin stanza below so the
  # driver is enabled regardless of which mechanism this Nomad version reads.
  options {
    "driver.raw_exec.enable" = "1"
  }
}

# Modern (Nomad 0.9+) way to enable the raw_exec task driver.
plugin "raw_exec" {
  config {
    enabled = true
  }
}

# If you have a dedicated Nomad Singularity/Apptainer driver plugin installed,
# you can configure that instead of raw_exec.
|
||||
@@ -173,8 +173,62 @@ def create_sandbox_job(
|
||||
memory: int,
|
||||
port: int = 8080,
|
||||
datacenter: str = "dc1",
|
||||
driver: str = "docker",
|
||||
) -> Dict[str, Any]:
|
||||
"""Create a basic sandbox-server Nomad job spec (docker driver)."""
|
||||
"""Create a sandbox-server Nomad job spec.
|
||||
|
||||
driver:
|
||||
- docker: runs tools/sandbox_server.py inside a Docker container image.
|
||||
- raw_exec: runs Apptainer/Singularity via raw_exec on the host.
|
||||
|
||||
raw_exec is provided as a draft option for HPC-like hosts without Docker.
|
||||
"""
|
||||
|
||||
if driver == "raw_exec":
|
||||
# For raw_exec, we bind to a fixed port. This is intended for dev/testing
|
||||
# on a single host. Scaling to multiple allocations requires more robust
|
||||
# service discovery.
|
||||
apptainer_image = os.getenv("TERMINAL_NOMAD_APPTAINER_IMAGE", "")
|
||||
if not apptainer_image:
|
||||
raise RuntimeError(
|
||||
"TERMINAL_NOMAD_APPTAINER_IMAGE is required for raw_exec driver"
|
||||
)
|
||||
|
||||
cmd = (
|
||||
"apptainer exec "
|
||||
"--bind \"$NOMAD_ALLOC_DIR/data:/data\" "
|
||||
f"{apptainer_image} "
|
||||
f"python /sandbox_server.py --port {port} --slots {slots_per_container} --data-dir /data"
|
||||
)
|
||||
|
||||
return {
|
||||
"ID": job_id,
|
||||
"Name": job_id,
|
||||
"Type": "service",
|
||||
"Datacenters": [datacenter],
|
||||
"TaskGroups": [
|
||||
{
|
||||
"Name": "sandbox",
|
||||
"Count": count,
|
||||
"Update": {"HealthCheck": "task_states", "MinHealthyTime": 0},
|
||||
"Networks": [{"Mode": "host"}],
|
||||
"Tasks": [
|
||||
{
|
||||
"Name": "sandbox-server",
|
||||
"Driver": "raw_exec",
|
||||
"Config": {
|
||||
"command": "bash",
|
||||
"args": ["-lc", cmd],
|
||||
},
|
||||
"Env": {"PYTHONUNBUFFERED": "1", "NOMAD_ALLOC_DIR": "${NOMAD_ALLOC_DIR}"},
|
||||
"Resources": {"CPU": cpu, "MemoryMB": memory},
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Default: docker
|
||||
return {
|
||||
"ID": job_id,
|
||||
"Name": job_id,
|
||||
@@ -328,6 +382,10 @@ class SlotPoolConfig:
|
||||
job_id: str = "hermes-sandbox"
|
||||
datacenter: str = "dc1"
|
||||
image: str = "hermes-sandbox:local"
|
||||
|
||||
# Driver selection: docker (default) or raw_exec (for Apptainer/Singularity)
|
||||
driver: str = "docker" # docker | raw_exec
|
||||
|
||||
slots_per_container: int = 10
|
||||
privileged: bool = False
|
||||
cpu: int = 500
|
||||
@@ -336,6 +394,9 @@ class SlotPoolConfig:
|
||||
max_containers: int = 10
|
||||
acquire_timeout: float = 30.0
|
||||
|
||||
# raw_exec: fixed port (dynamic ports are harder to discover reliably)
|
||||
raw_exec_port: int = 8080
|
||||
|
||||
|
||||
class SlotPool:
|
||||
def __init__(self, cfg: SlotPoolConfig):
|
||||
@@ -365,6 +426,8 @@ class SlotPool:
|
||||
cpu=self.cfg.cpu,
|
||||
memory=self.cfg.memory,
|
||||
datacenter=self.cfg.datacenter,
|
||||
driver=self.cfg.driver,
|
||||
port=self.cfg.raw_exec_port,
|
||||
)
|
||||
res = await self.nomad.submit_job(spec)
|
||||
if "error" in res:
|
||||
@@ -387,16 +450,29 @@ class SlotPool:
|
||||
net = (detail.get("Resources") or {}).get("Networks") or []
|
||||
address = None
|
||||
port = None
|
||||
if net and isinstance(net, list):
|
||||
n0 = net[0]
|
||||
address = n0.get("IP")
|
||||
ports = n0.get("DynamicPorts") or []
|
||||
for p in ports:
|
||||
if p.get("Label") == "http":
|
||||
port = p.get("Value")
|
||||
if not address or not port:
|
||||
# Fall back: allocation has an Address field
|
||||
address = detail.get("NodeName") or detail.get("NodeID")
|
||||
|
||||
if self.cfg.driver == "raw_exec":
|
||||
# raw_exec: use fixed port and best-effort node address discovery
|
||||
port = self.cfg.raw_exec_port
|
||||
address = (
|
||||
detail.get("NodeAddress")
|
||||
or detail.get("NodeName")
|
||||
or detail.get("NodeID")
|
||||
)
|
||||
else:
|
||||
# docker: discover dynamic port mapping
|
||||
if net and isinstance(net, list):
|
||||
n0 = net[0]
|
||||
address = n0.get("IP")
|
||||
ports = n0.get("DynamicPorts") or []
|
||||
for p in ports:
|
||||
if p.get("Label") == "http":
|
||||
port = p.get("Value")
|
||||
|
||||
if not address or not port:
|
||||
# Fall back: allocation node identity
|
||||
address = detail.get("NodeAddress") or detail.get("NodeName") or detail.get("NodeID")
|
||||
|
||||
if not address or not port:
|
||||
# Can't use this alloc
|
||||
continue
|
||||
|
||||
@@ -1273,6 +1273,8 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int, task_
|
||||
nomad_address=os.getenv("TERMINAL_NOMAD_ADDRESS", "http://localhost:4646"),
|
||||
job_id=os.getenv("TERMINAL_NOMAD_JOB_ID", "hermes-sandbox"),
|
||||
image=os.getenv("TERMINAL_NOMAD_IMAGE", "hermes-sandbox:local"),
|
||||
driver=os.getenv("TERMINAL_NOMAD_DRIVER", "docker"),
|
||||
raw_exec_port=int(os.getenv("TERMINAL_NOMAD_RAW_EXEC_PORT", "8080")),
|
||||
slots_per_container=int(os.getenv("TERMINAL_NOMAD_SLOTS", "10")),
|
||||
min_containers=int(os.getenv("TERMINAL_NOMAD_MIN", "1")),
|
||||
max_containers=int(os.getenv("TERMINAL_NOMAD_MAX", "10")),
|
||||
|
||||
Reference in New Issue
Block a user