diff --git a/.env.example b/.env.example
index 1f2cba1da5..908f7ce40e 100644
--- a/.env.example
+++ b/.env.example
@@ -22,9 +22,9 @@ HERMES_BACKEND=openai
 # of OpenRouter.
 #
 # Local server convenience (base URL without /v1):
-# llama.cpp example (see `Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh`):
+# llama.cpp example (see `Hermes-Agent/scripts/launch_llama_cpp_hermes_4_36b.sh`):
 # ATROPOS_SERVER_BASE_URL=http://127.0.0.1:8080
-# ATROPOS_SERVER_MODEL=glm-4.7-flash
+# ATROPOS_SERVER_MODEL=hermes-4-36b
 # ATROPOS_SERVER_API_KEY=local
 #
 # Generic OpenAI-compatible (base URL should include /v1):
diff --git a/atropos/envs/agent_env.py b/atropos/envs/agent_env.py
index 789ea88b7c..ecd012b100 100644
--- a/atropos/envs/agent_env.py
+++ b/atropos/envs/agent_env.py
@@ -51,6 +51,13 @@ class AgentEnvConfig(BaseEnvConfig):
     max_containers: int = Field(default=10, description="Nomad mode: maximum containers")
     privileged: bool = Field(default=False, description="Nomad mode: run container privileged")
     acquire_timeout_s: float = Field(default=30.0, description="Slot acquisition timeout (seconds)")
+    purge_job_on_start: bool = Field(
+        default=False,
+        description=(
+            "Nomad mode: stop/purge the sandbox job on startup. This is helpful in local dev and training runs "
+            "to recover from previous crashes that leave the job in a restart backoff state."
+        ),
+    )
     purge_job_on_shutdown: bool = Field(default=True, description="Nomad mode: stop/purge job on shutdown")

     # basic agent defaults
@@ -229,6 +236,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
                 max_containers=self.config.max_containers,
                 privileged=self.config.privileged,
                 acquire_timeout=self.config.acquire_timeout_s,
+                purge_job_on_start=bool(self.config.purge_job_on_start),
             )
         )
         await pool.start()
diff --git a/atropos/envs/hermes_compat_test_env.py b/atropos/envs/hermes_compat_test_env.py
index 93b0fe2dd9..be9cc40e0b 100644
--- a/atropos/envs/hermes_compat_test_env.py
+++ b/atropos/envs/hermes_compat_test_env.py
@@ -54,7 +54,7 @@ class HermesCompatTestEnvConfig(AgentEnvConfig):
         default="http://127.0.0.1:8080",
         description="Base URL for an OpenAI-compatible chat server (without /v1).",
     )
-    server_model: str = Field(default="glm-4.7-flash", description="Model name")
+    server_model: str = Field(default="hermes-4-36b", description="Model name")


 class HermesCompatTestEnv(AgentEnv[HermesCompatTestEnvConfig]):
@@ -79,7 +79,7 @@ class HermesCompatTestEnv(AgentEnv[HermesCompatTestEnvConfig]):
             or os.getenv("LLM_BASE_URL")
             or "http://127.0.0.1:8080"
         )
-        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "hermes-4-36b"
         api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"

         env_config = HermesCompatTestEnvConfig(
@@ -97,6 +97,8 @@ class HermesCompatTestEnv(AgentEnv[HermesCompatTestEnvConfig]):
             disabled_toolsets=[],
             # Default to Nomad sandboxing; users can override via --env.* args.
             sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+            # In local dev it's common for a previous crash to leave the job in backoff.
+            purge_job_on_start=True,
             purge_job_on_shutdown=True,
         )
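A quick sanity sketch of the new field's defaults: the library-level default stays `False` (safe for shared clusters), while the smoke/test envs opt in explicitly. This assumes the remaining `BaseEnvConfig` fields all have defaults, which is not shown in this diff:

```python
from atropos.envs.agent_env import AgentEnvConfig

# Default: never purge a pre-existing job on start.
cfg = AgentEnvConfig()
assert cfg.purge_job_on_start is False
assert cfg.purge_job_on_shutdown is True

# Opt-in, as the smoke envs below do: a crash-looped job left over from a
# previous run is purged instead of blocking startup in Nomad restart backoff.
cfg = AgentEnvConfig(purge_job_on_start=True)
assert cfg.purge_job_on_start is True
```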
diff --git a/atropos/envs/sandbox_terminal_smoke_env.py b/atropos/envs/sandbox_terminal_smoke_env.py
index 9c140a30b6..d850817730 100644
--- a/atropos/envs/sandbox_terminal_smoke_env.py
+++ b/atropos/envs/sandbox_terminal_smoke_env.py
@@ -55,7 +55,7 @@ class SandboxTerminalSmokeEnvConfig(AgentEnvConfig):
         default="http://127.0.0.1:8080",
         description="Base URL for an OpenAI-compatible chat server (without /v1).",
     )
-    server_model: str = Field(default="glm-4.7-flash", description="Model name")
+    server_model: str = Field(default="hermes-4-36b", description="Model name")


 class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]):
@@ -80,7 +80,7 @@ class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]):
             or os.getenv("LLM_BASE_URL")
             or "http://127.0.0.1:8080"
         )
-        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "hermes-4-36b"
         api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"

         env_config = SandboxTerminalSmokeEnvConfig(
@@ -98,6 +98,7 @@ class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]):
             disabled_toolsets=[],
             # Default to Nomad sandboxing; users can override via --env.* args.
             sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+            purge_job_on_start=True,
             purge_job_on_shutdown=True,
         )
diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py
index 79f384495c..fa08a37633 100644
--- a/atropos/envs/swe_smith_oracle_env.py
+++ b/atropos/envs/swe_smith_oracle_env.py
@@ -74,7 +74,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
             or os.getenv("LLM_BASE_URL")
             or "http://127.0.0.1:8080"
         )
-        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "hermes-4-36b"
         api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"

         env_config = SweSmithOracleEnvConfig(
diff --git a/atropos/envs/test_env.py b/atropos/envs/test_env.py
index 7c7f08e011..e078943b30 100644
--- a/atropos/envs/test_env.py
+++ b/atropos/envs/test_env.py
@@ -65,7 +65,7 @@ class SimpleTestEnvConfig(AgentEnvConfig):
         description="Base URL for an OpenAI-compatible server (without /v1)",
     )
     server_model: str = Field(
-        default="glm-4.7-flash",
+        default="hermes-4-36b",
         description="Model name",
     )

@@ -104,7 +104,7 @@ class SimpleTestEnv(AgentEnv[SimpleTestEnvConfig]):
             or os.getenv("LLM_BASE_URL")
             or "http://127.0.0.1:8080"
         )
-        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "hermes-4-36b"
         api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"

         env_config = SimpleTestEnvConfig(
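Each of the envs above repeats the same inline resolution chain. A hypothetical helper (not part of this diff) capturing that shared precedence, for reference: Atropos-specific variables win over generic ones, with local llama.cpp defaults last:

```python
import os

def resolve_server(default_model: str = "hermes-4-36b") -> tuple[str, str, str]:
    # Mirrors the precedence the smoke envs repeat inline (illustrative only).
    base_url = (
        os.getenv("ATROPOS_SERVER_BASE_URL")
        or os.getenv("LLM_BASE_URL")
        or "http://127.0.0.1:8080"  # llama.cpp default, without /v1
    )
    model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or default_model
    api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
    return base_url, model, api_key
```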
diff --git a/atropos/envs/toolserver_smoke_env.py b/atropos/envs/toolserver_smoke_env.py
index 4b39af468d..234e0d7452 100644
--- a/atropos/envs/toolserver_smoke_env.py
+++ b/atropos/envs/toolserver_smoke_env.py
@@ -33,7 +33,7 @@ class ToolServerSmokeEnvConfig(AgentEnvConfig):
         default="http://127.0.0.1:8080",
         description="Base URL for an OpenAI-compatible chat server (without /v1).",
     )
-    server_model: str = Field(default="glm-4.7-flash", description="Model name")
+    server_model: str = Field(default="hermes-4-36b", description="Model name")


 class ToolServerSmokeEnv(AgentEnv[ToolServerSmokeEnvConfig]):
@@ -58,7 +58,7 @@ class ToolServerSmokeEnv(AgentEnv[ToolServerSmokeEnvConfig]):
             or os.getenv("LLM_BASE_URL")
             or "http://127.0.0.1:8080"
         )
-        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "hermes-4-36b"
         api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"

         env_config = ToolServerSmokeEnvConfig(
@@ -76,6 +76,7 @@ class ToolServerSmokeEnv(AgentEnv[ToolServerSmokeEnvConfig]):
             # Self-contained ToolServer for local smoke.
             tool_server_url="inprocess",
             sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+            purge_job_on_start=True,
             purge_job_on_shutdown=True,
         )
diff --git a/atropos/slots/pool.py b/atropos/slots/pool.py
index d6ace16b7d..1ee741cafa 100644
--- a/atropos/slots/pool.py
+++ b/atropos/slots/pool.py
@@ -50,6 +50,9 @@ class SlotPoolConfig:
     health_check_interval: float = 30.0  # Seconds between health checks
     scale_cooldown: float = 60.0  # Seconds between scale operations

+    # Job lifecycle
+    purge_job_on_start: bool = False  # Purge any pre-existing job before starting (local dev/training friendly)
+

 class SlotPool:
     """
@@ -144,7 +147,11 @@
         if not await self.nomad.is_healthy():
             raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")

-        # Check if job exists
+        if self.config.purge_job_on_start:
+            logger.info(f"Purging any existing Nomad job: {self.config.job_id}")
+            await self.nomad.stop_job(self.config.job_id, purge=True)
+
+        # Check if job exists (after optional purge)
         job = await self.nomad.get_job(self.config.job_id)

         if job is None:
@@ -397,11 +404,30 @@
             for task_name, st in task_states.items():
                 events = (st or {}).get("Events") or []
                 if isinstance(events, list) and events:
-                    last = events[-1]
-                    desc = last.get("DisplayMessage") or last.get("Message") or last.get("Type") or ""
-                    if desc:
-                        parts.append(f"{task_name}: {desc}")
+                    # Include a few recent events; the latest can be a generic restart message
+                    # while the true root cause is slightly earlier (e.g. image pull failure).
+                    recent = events[-3:]
+                    msgs: List[str] = []
+                    for ev in recent:
+                        desc = ev.get("DisplayMessage") or ev.get("Message") or ev.get("Type") or ""
+                        if desc:
+                            msgs.append(desc)
+                    if msgs:
+                        parts.append(f"{task_name}: " + " | ".join(msgs))
             return "; ".join(parts)
+
+        def _alloc_events_lower(detail: Dict[str, Any]) -> str:
+            task_states = detail.get("TaskStates") or {}
+            texts: List[str] = []
+            if isinstance(task_states, dict):
+                for _task_name, st in task_states.items():
+                    events = (st or {}).get("Events") or []
+                    if isinstance(events, list):
+                        for ev in events[-10:]:
+                            desc = ev.get("DisplayMessage") or ev.get("Message") or ev.get("Type") or ""
+                            if desc:
+                                texts.append(desc)
+            return " ".join(texts).lower()

         while time.time() - start < timeout:
             allocs = await self.nomad.get_job_allocations(self.config.job_id)
@@ -417,13 +443,23 @@
                     detail = await self.nomad.get_allocation(alloc.id)
                     if isinstance(detail, dict):
                         summary = _summarize_alloc_detail(detail)
-                        lowered = summary.lower()
+                        lowered = _alloc_events_lower(detail) or summary.lower()
                         if "failed to pull" in lowered or "pull access denied" in lowered:
                             raise RuntimeError(
                                 "Nomad allocation failed to start due to a Docker image pull error. "
                                 f"Allocation {alloc.id[:8]}: {summary}\n"
                                 "If you're using a local image tag (e.g. `atropos-sandbox:local`) on macOS, "
-                                "make sure the image is loaded into Docker (build with `docker buildx build --load ...`)."
+                                "make sure the image is loaded into Docker, e.g.:\n"
+                                "  docker buildx build --load -t atropos-sandbox:local -f Hermes-Agent/atropos/Dockerfile Hermes-Agent/atropos"
+                            )
+                        if "exceeded allowed attempts" in lowered:
+                            raise RuntimeError(
+                                "Nomad allocation is crash-looping and has entered restart backoff. "
+                                f"Allocation {alloc.id[:8]}: {summary}\n"
+                                "Inspect logs with:\n"
+                                f"  nomad alloc logs -stderr -task sandbox-server {alloc.id}\n"
+                                "Common causes include: missing local Docker image tag, container entrypoint error, "
+                                "or sandbox-server startup failure."
                             )

                 if healthy_count >= min_count:
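To see why `_summarize_alloc_detail` now keeps the last three events rather than only the newest: in a crash loop the newest event is often a generic restart notice, while the root cause sits just before it. A standalone mirror of the closure with illustrative event text (the field names `DisplayMessage`/`Message`/`Type` are the ones the diff reads; the message strings are made up):

```python
from typing import Any, Dict, List

def summarize(detail: Dict[str, Any]) -> str:
    # Standalone mirror of _summarize_alloc_detail above, for illustration only.
    parts: List[str] = []
    for task_name, st in (detail.get("TaskStates") or {}).items():
        events = (st or {}).get("Events") or []
        if isinstance(events, list) and events:
            msgs = [
                ev.get("DisplayMessage") or ev.get("Message") or ev.get("Type") or ""
                for ev in events[-3:]
            ]
            msgs = [m for m in msgs if m]
            if msgs:
                parts.append(f"{task_name}: " + " | ".join(msgs))
    return "; ".join(parts)

detail = {
    "TaskStates": {
        "sandbox-server": {
            "Events": [
                {"DisplayMessage": "Failed to pull image atropos-sandbox:local"},
                {"DisplayMessage": "Exceeded allowed attempts 2 in interval 5m0s"},
                {"DisplayMessage": "Restarting task in 30s"},
            ]
        }
    }
}

# The pull failure survives in the summary, and the lowered event text still
# matches the "failed to pull" / "exceeded allowed attempts" classifiers above.
print(summarize(detail))
# sandbox-server: Failed to pull image atropos-sandbox:local | Exceeded allowed attempts 2 in interval 5m0s | Restarting task in 30s
```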
diff --git a/cli.py b/cli.py
index c1d0b9f880..e2e026a337 100755
--- a/cli.py
+++ b/cli.py
@@ -472,14 +472,33 @@
         self.console = Console()
         self.compact = compact if compact is not None else CLI_CONFIG["display"].get("compact", False)
         self.verbose = verbose if verbose is not None else CLI_CONFIG["agent"].get("verbose", False)
+
+        self.backend = (backend or os.getenv("HERMES_BACKEND") or "openai").strip().lower()
+        if self.backend not in {"openai", "atropos"}:
+            self.console.print(
+                f"[bold yellow]Warning:[/] unknown backend '{self.backend}', falling back to 'openai'"
+            )
+            self.backend = "openai"

         # Configuration - priority: CLI args > env vars > config file
-        self.model = (
-            model
-            or os.getenv("LLM_MODEL")
-            or os.getenv("ATROPOS_SERVER_MODEL")
-            or CLI_CONFIG["model"]["default"]
-        )
+        #
+        # Note: For the Atropos backend we intentionally prefer `ATROPOS_SERVER_MODEL`
+        # over `LLM_MODEL`, because `LLM_MODEL` is commonly an OpenRouter-style ID
+        # (e.g. "anthropic/claude-sonnet-4") and will not exist on local servers.
+        if model:
+            self.model = model
+        elif self.backend == "atropos":
+            self.model = (
+                os.getenv("ATROPOS_SERVER_MODEL")
+                or os.getenv("LLM_MODEL")
+                or CLI_CONFIG["model"]["default"]
+            )
+        else:
+            self.model = (
+                os.getenv("LLM_MODEL")
+                or os.getenv("ATROPOS_SERVER_MODEL")
+                or CLI_CONFIG["model"]["default"]
+            )

         env_openai_base_url = os.getenv("OPENAI_BASE_URL")
         if env_openai_base_url:
@@ -507,13 +526,6 @@
             or os.getenv("OPENROUTER_API_KEY")
         )
         self.max_turns = max_turns if max_turns != 20 else CLI_CONFIG["agent"].get("max_turns", 20)
-
-        self.backend = (backend or os.getenv("HERMES_BACKEND") or "openai").strip().lower()
-        if self.backend not in {"openai", "atropos"}:
-            self.console.print(
-                f"[bold yellow]Warning:[/] unknown backend '{self.backend}', falling back to 'openai'"
-            )
-            self.backend = "openai"

         # Parse and validate toolsets
         self.enabled_toolsets = toolsets
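Why the reordering in `cli.py` matters, as a runnable sketch. The resolver below is a hypothetical standalone mirror of the new precedence, not the diff's code, and the default is a stand-in for `CLI_CONFIG["model"]["default"]`:

```python
import os

DEFAULT = "hermes-4-36b"  # stand-in for CLI_CONFIG["model"]["default"]

def resolve_model(backend: str, cli_model: str | None) -> str:
    # Explicit --model always wins; the atropos backend prefers the local
    # server's model id; the openai backend keeps the old LLM_MODEL-first order.
    if cli_model:
        return cli_model
    if backend == "atropos":
        return os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or DEFAULT
    return os.getenv("LLM_MODEL") or os.getenv("ATROPOS_SERVER_MODEL") or DEFAULT

os.environ["LLM_MODEL"] = "anthropic/claude-sonnet-4"  # OpenRouter-style id
os.environ["ATROPOS_SERVER_MODEL"] = "hermes-4-36b"    # local llama.cpp alias

assert resolve_model("atropos", None) == "hermes-4-36b"              # local id wins
assert resolve_model("openai", None) == "anthropic/claude-sonnet-4"  # old behavior kept
assert resolve_model("atropos", "my-model") == "my-model"            # --model always wins
```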
diff --git a/scripts/launch_llama_cpp_hermes_4_36b.sh b/scripts/launch_llama_cpp_hermes_4_36b.sh
new file mode 100755
index 0000000000..8e6f14be4e
--- /dev/null
+++ b/scripts/launch_llama_cpp_hermes_4_36b.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Launch a local llama.cpp OpenAI-compatible server running Hermes 4.3 36B (GGUF).
+#
+# Requires:
+#   - `llama-server` installed (e.g. `brew install llama.cpp`)
+#
+# Note: Port choice can conflict with other local dev servers. If 8080 is already
+# in use, override via `LLAMA_CPP_PORT=...`.
+#
+# Usage:
+#   Hermes-Agent/scripts/launch_llama_cpp_hermes_4_36b.sh
+#
+# Override defaults:
+#   LLAMA_CPP_HOST=127.0.0.1 LLAMA_CPP_PORT=8082 \
+#   LLAMA_CPP_HF_REPO=NousResearch/Hermes-4.3-36B-GGUF \
+#   LLAMA_CPP_HF_FILE=hermes-4_3_36b-Q4_K_M.gguf \
+#   LLAMA_CPP_ALIAS=hermes-4-36b \
+#   Hermes-Agent/scripts/launch_llama_cpp_hermes_4_36b.sh
+
+HOST="${LLAMA_CPP_HOST:-127.0.0.1}"
+PORT="${LLAMA_CPP_PORT:-8080}"
+HF_REPO="${LLAMA_CPP_HF_REPO:-NousResearch/Hermes-4.3-36B-GGUF}"
+HF_FILE="${LLAMA_CPP_HF_FILE:-hermes-4_3_36b-Q4_K_M.gguf}"
+ALIAS="${LLAMA_CPP_ALIAS:-hermes-4-36b}"
+
+if ! command -v llama-server >/dev/null 2>&1; then
+  echo "Error: llama-server not found in PATH."
+  echo "Install via Homebrew: brew install llama.cpp"
+  exit 1
+fi
+
+echo "Launching llama.cpp server..."
+echo "  host:  $HOST"
+echo "  port:  $PORT"
+echo "  repo:  $HF_REPO"
+echo "  file:  $HF_FILE"
+echo "  alias: $ALIAS"
+echo
+echo "Suggested env vars for Hermes/Atropos integration:"
+echo "  export ATROPOS_SERVER_BASE_URL=http://${HOST}:${PORT}"
+echo "  export ATROPOS_SERVER_MODEL=${ALIAS}"
+echo "  export ATROPOS_SERVER_API_KEY=local"
+echo
+
+if command -v lsof >/dev/null 2>&1; then
+  if lsof -nP -iTCP:"$PORT" -sTCP:LISTEN >/dev/null 2>&1; then
+    echo "Error: port $PORT is already in use."
+    echo "Pick a different port, e.g.:"
+    echo "  LLAMA_CPP_PORT=8082 Hermes-Agent/scripts/launch_llama_cpp_hermes_4_36b.sh"
+    exit 1
+  fi
+fi
+
+exec llama-server \
+  --host "$HOST" \
+  --port "$PORT" \
+  --hf-repo "$HF_REPO" \
+  --hf-file "$HF_FILE" \
+  --alias "$ALIAS" \
+  -c 32768 \
+  -n -1
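Once the script is up, a minimal smoke check against its defaults (llama-server exposes OpenAI-compatible routes, so `/v1/chat/completions` should answer; adjust host, port, and alias if you overrode them):

```python
import json
import urllib.request

payload = {
    "model": "hermes-4-36b",  # must match the --alias the script passed
    "messages": [{"role": "user", "content": "Reply with the single word: ok"}],
    "max_tokens": 8,
}
req = urllib.request.Request(
    "http://127.0.0.1:8080/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json", "Authorization": "Bearer local"},
)
# First call may be slow while the GGUF weights load; allow a generous timeout.
with urllib.request.urlopen(req, timeout=120) as resp:
    body = json.load(resp)
print(body["choices"][0]["message"]["content"])
```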
diff --git a/tests/test_data/checkpoint_test_dataset.jsonl b/tests/test_data/checkpoint_test_dataset.jsonl
new file mode 100644
index 0000000000..150e1c0afc
--- /dev/null
+++ b/tests/test_data/checkpoint_test_dataset.jsonl
@@ -0,0 +1,15 @@
+{"prompt": "Test prompt 0: What is 2+2? Just answer briefly.", "test_id": 0}
+{"prompt": "Test prompt 1: What is 2+2? Just answer briefly.", "test_id": 1}
+{"prompt": "Test prompt 2: What is 2+2? Just answer briefly.", "test_id": 2}
+{"prompt": "Test prompt 3: What is 2+2? Just answer briefly.", "test_id": 3}
+{"prompt": "Test prompt 4: What is 2+2? Just answer briefly.", "test_id": 4}
+{"prompt": "Test prompt 5: What is 2+2? Just answer briefly.", "test_id": 5}
+{"prompt": "Test prompt 6: What is 2+2? Just answer briefly.", "test_id": 6}
+{"prompt": "Test prompt 7: What is 2+2? Just answer briefly.", "test_id": 7}
+{"prompt": "Test prompt 8: What is 2+2? Just answer briefly.", "test_id": 8}
+{"prompt": "Test prompt 9: What is 2+2? Just answer briefly.", "test_id": 9}
+{"prompt": "Test prompt 10: What is 2+2? Just answer briefly.", "test_id": 10}
+{"prompt": "Test prompt 11: What is 2+2? Just answer briefly.", "test_id": 11}
+{"prompt": "Test prompt 12: What is 2+2? Just answer briefly.", "test_id": 12}
+{"prompt": "Test prompt 13: What is 2+2? Just answer briefly.", "test_id": 13}
+{"prompt": "Test prompt 14: What is 2+2? Just answer briefly.", "test_id": 14}
diff --git a/tests/test_data/checkpoint_test_resume_partial.jsonl b/tests/test_data/checkpoint_test_resume_partial.jsonl
new file mode 100644
index 0000000000..e001d382b5
--- /dev/null
+++ b/tests/test_data/checkpoint_test_resume_partial.jsonl
@@ -0,0 +1,5 @@
+{"prompt": "Test prompt 0: What is 2+2? Just answer briefly.", "test_id": 0}
+{"prompt": "Test prompt 1: What is 2+2? Just answer briefly.", "test_id": 1}
+{"prompt": "Test prompt 2: What is 2+2? Just answer briefly.", "test_id": 2}
+{"prompt": "Test prompt 3: What is 2+2? Just answer briefly.", "test_id": 3}
+{"prompt": "Test prompt 4: What is 2+2? Just answer briefly.", "test_id": 4}
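How these fixtures fit together (illustrative; the checkpoint test harness itself is not part of this diff): the partial file is exactly the first five rows of the full dataset, so a resume pass should pick up at `test_id` 5 and re-run nothing earlier.

```python
import json
from pathlib import Path

def load_jsonl(path: str) -> list[dict]:
    return [json.loads(ln) for ln in Path(path).read_text().splitlines() if ln.strip()]

full = load_jsonl("tests/test_data/checkpoint_test_dataset.jsonl")
done = {row["test_id"] for row in load_jsonl("tests/test_data/checkpoint_test_resume_partial.jsonl")}

# A resume should only process what the checkpoint hasn't covered yet.
remaining = [row for row in full if row["test_id"] not in done]
assert [row["test_id"] for row in remaining] == list(range(5, 15))
```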