README: document optional Modal pooling (TERMINAL_MODAL_MODE=pool)

Add optional Modal sandbox pooling backend (TERMINAL_MODAL_MODE=pool)
- Default Modal behavior unchanged - New pooled mode reuses warm Modal environments across tasks - Per-task working directory (/root/hermes_tasks/<task_id>) with best-effort cleanup - Minimal integration: terminal_tool _create_environment now accepts task_id and selects pooled modal wrapper when configured
2026-04-28 23:11:37 +08:00 · 2026-02-14 13:36:28 +10:00 · 2026-02-14 13:18:59 +10:00 · 2026-02-14 13:13:00 +10:00
6 changed files with 383 additions and 41 deletions
--- a/README.md
+++ b/README.md
@@ -240,6 +240,19 @@ modal setup                    # Authenticate with Modal
 hermes config set terminal.backend modal
 ```

+**Modal pooling (optional):** reuse warm Modal sandboxes across tasks.
+
+This is opt-in and does not change the default Modal behavior.
+
+```bash
+# Enable pooled mode
+export TERMINAL_ENV=modal
+export TERMINAL_MODAL_MODE=pool
+
+# Pool size (default: 4)
+export TERMINAL_MODAL_POOL_MAX=4
+```
+
 **Sudo Support:** If a command needs sudo, you'll be prompted for your password (cached for the session). Or set `SUDO_PASSWORD` in `~/.hermes/.env`.

 ### 📱 Messaging Gateway
--- a/environments/agent_loop.py
+++ b/environments/agent_loop.py
@@ -73,6 +73,12 @@ class AgentResult:
    # Tool errors encountered during the loop
    tool_errors: List[ToolError] = field(default_factory=list)

+    # Tool-call metrics (debugging / optional reward shaping)
+    tool_calls_attempted: int = 0
+    tool_calls_schema_valid: int = 0
+    tool_calls_executed_ok: int = 0
+    tool_calls_exec_error: int = 0
+

 def _extract_reasoning_from_message(message) -> Optional[str]:
    """
@@ -136,6 +142,8 @@ class HermesAgentLoop:
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        extra_body: Optional[Dict[str, Any]] = None,
+        tool_handler=None,
+        max_context_tokens: Optional[int] = None,
    ):
        """
        Initialize the agent loop.
@@ -152,6 +160,13 @@ class HermesAgentLoop:
            extra_body: Extra parameters passed to the OpenAI client's create() call.
                        Used for OpenRouter provider preferences, transforms, etc.
                        e.g. {"provider": {"ignore": ["DeepInfra"]}}
+            tool_handler: Optional async callable(tool_name, args, task_id) -> str.
+                         When provided, used INSTEAD of handle_function_call() for
+                         tool dispatch. This allows sandbox backends (Modal, Nomad)
+                         to route tool calls through their slot-based execution.
+            max_context_tokens: Maximum prompt tokens before truncation.
+                               If None, no truncation is applied.
+                               Recommended: set to max_model_len - max_tokens - 512 (safety margin).
        """
        self.server = server
        self.tool_schemas = tool_schemas
@@ -161,6 +176,123 @@ class HermesAgentLoop:
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.extra_body = extra_body
+        self.tool_handler = tool_handler
+        self.max_context_tokens = max_context_tokens
+
+    def _truncate_context(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Truncate conversation history to fit within max_context_tokens.
+
+        Strategy:
+        - Keep system message (index 0) and initial user message (index 1) always
+        - Keep last 6 messages (recent context) always
+        - For everything in between, progressively truncate tool result content
+        - If still too long, drop oldest middle messages entirely
+
+        Uses rough char/4 token estimate (fast, no tokenizer needed).
+
+        NOTE: This function mutates the provided list (it may pop/replace entries).
+        Call it on a copy when you want to preserve the full trajectory.
+        """
+        if self.max_context_tokens is None:
+            return messages
+
+        def estimate_tokens(msgs):
+            total = 0
+            for m in msgs:
+                content = m.get("content", "") or ""
+                total += len(content) // 4 + 10  # ~4 chars per token + overhead
+                if "tool_calls" in m:
+                    total += 50 * len(m["tool_calls"])  # tool call overhead
+            return total
+
+        if estimate_tokens(messages) <= self.max_context_tokens:
+            return messages
+
+        protect_head = 2
+        protect_tail = max(0, min(6, len(messages) - protect_head))
+        middle_start = protect_head
+        middle_end = len(messages) - protect_tail
+
+        # Phase 1: truncate tool outputs in the middle
+        if middle_start < middle_end:
+            for i in range(middle_start, middle_end):
+                if messages[i].get("role") == "tool":
+                    content = messages[i].get("content", "") or ""
+                    if len(content) > 200:
+                        messages[i] = dict(messages[i])
+                        messages[i]["content"] = content[:100] + "\n...[truncated]...\n" + content[-50:]
+
+            if estimate_tokens(messages) <= self.max_context_tokens:
+                return messages
+
+        # Phase 2: drop oldest middle messages (try to keep assistant+tool pairs)
+        while middle_start < middle_end and estimate_tokens(messages) > self.max_context_tokens:
+            msg = messages[middle_start]
+            messages.pop(middle_start)
+            middle_end -= 1
+
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                tool_ids = {
+                    tc.get("id") or tc.get("tool_call_id", "")
+                    for tc in msg.get("tool_calls", [])
+                    if isinstance(tc, dict)
+                }
+                i = middle_start
+                while i < middle_end:
+                    if messages[i].get("role") == "tool" and messages[i].get("tool_call_id", "") in tool_ids:
+                        messages.pop(i)
+                        middle_end -= 1
+                    else:
+                        i += 1
+
+        return messages
+
+    def _normalize_tool_args(self, tool_name: str, tool_args_raw: str) -> (Dict[str, Any], bool):
+        """Normalize tool arguments into a dict.
+
+        Returns: (args_dict, schema_valid)
+
+        schema_valid is True only when arguments decode directly into a dict
+        (no double-decoding and no coercion/wrapping required).
+
+        Goal: keep environments robust (never crash on args format drift) while
+        still allowing reward functions to penalize malformed formats if desired.
+        """
+        try:
+            decoded = json.loads(tool_args_raw)
+        except json.JSONDecodeError:
+            # Not JSON at all — treat as a plain string
+            if tool_name == "terminal":
+                return {"command": tool_args_raw}, False
+            return {"input": tool_args_raw}, False
+
+        if isinstance(decoded, dict):
+            if tool_name == "terminal":
+                cmd = decoded.get("command")
+                if isinstance(cmd, str) and cmd.strip():
+                    return decoded, True
+                if isinstance(decoded.get("input"), str):
+                    return {"command": decoded.get("input")}, False
+                return decoded, False
+            return decoded, True
+
+        if isinstance(decoded, str):
+            s = decoded.strip()
+            if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
+                try:
+                    decoded2 = json.loads(s)
+                except json.JSONDecodeError:
+                    decoded2 = None
+                if isinstance(decoded2, dict):
+                    return decoded2, False
+
+            if tool_name == "terminal":
+                return {"command": decoded}, False
+            return {"input": decoded}, False
+
+        if tool_name == "terminal":
+            return {"command": str(decoded)}, False
+        return {"input": decoded}, False

    async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
        """
@@ -176,14 +308,22 @@ class HermesAgentLoop:
        reasoning_per_turn = []
        tool_errors: List[ToolError] = []

+        tool_calls_attempted = 0
+        tool_calls_schema_valid = 0
+        tool_calls_executed_ok = 0
+        tool_calls_exec_error = 0
+
        import time as _time

        for turn in range(self.max_turns):
            turn_start = _time.monotonic()

+            # Truncate prompt view on a copy (preserve full trajectory in `messages`)
+            prompt_messages = self._truncate_context(list(messages))
+
            # Build the chat_completion kwargs
            chat_kwargs = {
-                "messages": messages,
+                "messages": prompt_messages,
                "n": 1,
                "temperature": self.temperature,
            }
@@ -215,6 +355,10 @@ class HermesAgentLoop:
                    finished_naturally=False,
                    reasoning_per_turn=reasoning_per_turn,
                    tool_errors=tool_errors,
+                    tool_calls_attempted=tool_calls_attempted,
+                    tool_calls_schema_valid=tool_calls_schema_valid,
+                    tool_calls_executed_ok=tool_calls_executed_ok,
+                    tool_calls_exec_error=tool_calls_exec_error,
                )

            api_elapsed = _time.monotonic() - api_start
@@ -228,6 +372,10 @@ class HermesAgentLoop:
                    finished_naturally=False,
                    reasoning_per_turn=reasoning_per_turn,
                    tool_errors=tool_errors,
+                    tool_calls_attempted=tool_calls_attempted,
+                    tool_calls_schema_valid=tool_calls_schema_valid,
+                    tool_calls_executed_ok=tool_calls_executed_ok,
+                    tool_calls_exec_error=tool_calls_exec_error,
                )

            assistant_msg = response.choices[0].message
@@ -270,6 +418,7 @@ class HermesAgentLoop:

                    # Validate tool name
                    if tool_name not in self.valid_tool_names:
+                        tool_calls_exec_error += 1
                        tool_result = json.dumps(
                            {
                                "error": f"Unknown tool '{tool_name}'. "
@@ -287,35 +436,35 @@ class HermesAgentLoop:
                            tool_name, turn + 1,
                        )
                    else:
-                        # Parse arguments and dispatch
-                        try:
-                            args = json.loads(tool_args_raw)
-                        except json.JSONDecodeError:
-                            args = {}
-                            logger.warning(
-                                "Invalid JSON in tool call arguments for '%s': %s",
-                                tool_name, tool_args_raw[:200],
-                            )
+                        tool_calls_attempted += 1
+                        args, schema_valid = self._normalize_tool_args(tool_name, tool_args_raw)
+                        if schema_valid:
+                            tool_calls_schema_valid += 1

                        try:
                            if tool_name == "terminal":
                                backend = os.getenv("TERMINAL_ENV", "local")
-                                cmd_preview = args.get("command", "")[:80]
+                                cmd_preview = str(args.get("command", ""))[:80]
                                logger.info(
                                    "[%s] $ %s", self.task_id[:8], cmd_preview,
                                )

-                            # Run tool calls in a thread pool so backends that use
-                            # asyncio.run() internally (modal, docker) get a clean
-                            # event loop instead of deadlocking inside Atropos's loop.
                            tool_submit_time = _time.monotonic()
-                            loop = asyncio.get_event_loop()
-                            tool_result = await loop.run_in_executor(
-                                _tool_executor,
-                                lambda: handle_function_call(
-                                    tool_name, args, task_id=self.task_id
-                                ),
-                            )
+
+                            if self.tool_handler:
+                                tool_result = await self.tool_handler(tool_name, args, self.task_id)
+                            else:
+                                # Run tool calls in a thread pool so backends that use
+                                # asyncio.run() internally (modal, docker) get a clean
+                                # event loop instead of deadlocking inside Atropos's loop.
+                                loop = asyncio.get_event_loop()
+                                tool_result = await loop.run_in_executor(
+                                    _tool_executor,
+                                    lambda: handle_function_call(
+                                        tool_name, args, task_id=self.task_id
+                                    ),
+                                )
+
                            tool_elapsed = _time.monotonic() - tool_submit_time

                            # Log slow tools and thread pool stats for debugging
@@ -327,6 +476,7 @@ class HermesAgentLoop:
                                    tool_elapsed, pool_active,
                                )
                        except Exception as e:
+                            tool_calls_exec_error += 1
                            tool_result = json.dumps(
                                {"error": f"Tool execution failed: {type(e).__name__}: {str(e)}"}
                            )
@@ -340,22 +490,31 @@ class HermesAgentLoop:
                                "Tool '%s' execution failed on turn %d: %s",
                                tool_name, turn + 1, e,
                            )
+                        else:
+                            tool_err = False
+                            try:
+                                result_data = json.loads(tool_result)
+                                if isinstance(result_data, dict):
+                                    err = result_data.get("error")
+                                    if err:
+                                        tool_err = True

-                        # Also check if the tool returned an error in its JSON result
-                        try:
-                            result_data = json.loads(tool_result)
-                            if isinstance(result_data, dict):
-                                err = result_data.get("error")
-                                exit_code = result_data.get("exit_code")
-                                if err and exit_code and exit_code < 0:
-                                    tool_errors.append(ToolError(
-                                        turn=turn + 1, tool_name=tool_name,
-                                        arguments=tool_args_raw[:200],
-                                        error=str(err),
-                                        tool_result=tool_result[:500],
-                                    ))
-                        except (json.JSONDecodeError, TypeError):
-                            pass
+                                    exit_code = result_data.get("exit_code")
+                                    if exit_code is not None and isinstance(exit_code, int) and exit_code < 0:
+                                        tool_err = True
+                                        tool_errors.append(ToolError(
+                                            turn=turn + 1, tool_name=tool_name,
+                                            arguments=tool_args_raw[:200],
+                                            error=str(err) if err else "nonzero exit_code",
+                                            tool_result=tool_result[:500],
+                                        ))
+                            except (json.JSONDecodeError, TypeError):
+                                pass
+
+                            if tool_err:
+                                tool_calls_exec_error += 1
+                            else:
+                                tool_calls_executed_ok += 1

                    # Add tool response to conversation
                    messages.append(
@@ -396,6 +555,10 @@ class HermesAgentLoop:
                    finished_naturally=True,
                    reasoning_per_turn=reasoning_per_turn,
                    tool_errors=tool_errors,
+                    tool_calls_attempted=tool_calls_attempted,
+                    tool_calls_schema_valid=tool_calls_schema_valid,
+                    tool_calls_executed_ok=tool_calls_executed_ok,
+                    tool_calls_exec_error=tool_calls_exec_error,
                )

        # Hit max turns without the model stopping
@@ -407,6 +570,10 @@ class HermesAgentLoop:
            finished_naturally=False,
            reasoning_per_turn=reasoning_per_turn,
            tool_errors=tool_errors,
+            tool_calls_attempted=tool_calls_attempted,
+            tool_calls_schema_valid=tool_calls_schema_valid,
+            tool_calls_executed_ok=tool_calls_executed_ok,
+            tool_calls_exec_error=tool_calls_exec_error,
        )

    def _get_managed_state(self) -> Optional[Dict[str, Any]]:
--- a/environments/hermes_base_env.py
+++ b/environments/hermes_base_env.py
@@ -478,6 +478,7 @@ class HermesAgentBaseEnv(BaseEnv):
                    tokenizer=self.tokenizer,
                    tool_call_parser=tc_parser,
                ) as managed:
+                    _max_ctx = self.config.max_token_length if (self.config.max_token_length and self.config.max_token_length > 0) else None
                    agent = HermesAgentLoop(
                        server=managed,
                        tool_schemas=tools,
@@ -487,6 +488,7 @@ class HermesAgentBaseEnv(BaseEnv):
                        temperature=self.config.agent_temperature,
                        max_tokens=self.config.max_token_length,
                        extra_body=self.config.extra_body,
+                        max_context_tokens=_max_ctx,
                    )
                    result = await agent.run(messages)
            except NotImplementedError:
@@ -495,6 +497,7 @@ class HermesAgentBaseEnv(BaseEnv):
                    "ManagedServer not available (OpenAI server?). "
                    "Falling back to direct server mode."
                )
+                _max_ctx = self.config.max_token_length if (self.config.max_token_length and self.config.max_token_length > 0) else None
                agent = HermesAgentLoop(
                    server=self.server,
                    tool_schemas=tools,
@@ -504,10 +507,12 @@ class HermesAgentBaseEnv(BaseEnv):
                    temperature=self.config.agent_temperature,
                    max_tokens=self.config.max_token_length,
                    extra_body=self.config.extra_body,
+                    max_context_tokens=_max_ctx,
                )
                result = await agent.run(messages)
        else:
            # Phase 1: OpenAI server -- native tool_calls, placeholder tokens
+            _max_ctx = self.config.max_token_length if (self.config.max_token_length and self.config.max_token_length > 0) else None
            agent = HermesAgentLoop(
                server=self.server,
                tool_schemas=tools,
@@ -517,6 +522,7 @@ class HermesAgentBaseEnv(BaseEnv):
                temperature=self.config.agent_temperature,
                max_tokens=self.config.max_token_length,
                extra_body=self.config.extra_body,
+                max_context_tokens=_max_ctx,
            )
            result = await agent.run(messages)

--- a/environments/tool_call_parsers/hermes_parser.py
+++ b/environments/tool_call_parsers/hermes_parser.py
@@ -49,15 +49,22 @@ class HermesToolCallParser(ToolCallParser):
                    continue

                tc_data = json.loads(raw_json)
+                # Handle arguments: could be dict or already a JSON string
+                raw_args = tc_data.get("arguments", {})
+                if isinstance(raw_args, str):
+                    # Already a string — pass through as-is.
+                    # It may be a JSON string ("{...}") or a plain string ("ls").
+                    args_str = raw_args
+                else:
+                    # Dict — serialize to JSON
+                    args_str = json.dumps(raw_args, ensure_ascii=False)
                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=f"call_{uuid.uuid4().hex[:8]}",
                        type="function",
                        function=Function(
                            name=tc_data["name"],
-                            arguments=json.dumps(
-                                tc_data.get("arguments", {}), ensure_ascii=False
-                            ),
+                            arguments=args_str,
                        ),
                    )
                )
--- a/tools/modal_pool.py
+++ b/tools/modal_pool.py
@@ -0,0 +1,128 @@
+"""Modal sandbox pooling backend for terminal_tool.
+
+This module provides an OPTIONAL pooled Modal backend that is compatible with
+Hermes-Agent's existing terminal_tool interface.
+
+Goals:
+- Keep the default Modal path unchanged.
+- Allow switching to pooled behavior with minimal friction:
+    TERMINAL_ENV=modal
+    TERMINAL_MODAL_MODE=pool
+
+Design:
+- Pool stores warm `_ModalEnvironment` instances (each wraps a live ModalDeployment).
+- Each task acquires one environment exclusively, uses a task-specific working dir,
+  then releases it back to the pool.
+- Release attempts to remove the task working directory to reduce cross-task leakage.
+
+NOTE: This is intentionally conservative and self-contained. It does not change
+any tool schemas or model-facing behavior.
+"""
+
+from __future__ import annotations
+
+import os
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class _PooledEnv:
+    env: object
+    created_at: float
+
+
+class ModalEnvPool:
+    """Thread-safe pool of warm Modal environments."""
+
+    def __init__(self, max_size: int = 4):
+        self.max_size = max_size
+        self._lock = threading.Lock()
+        self._cond = threading.Condition(self._lock)
+        self._idle: list[_PooledEnv] = []
+        self._total = 0
+
+    def acquire(self, create_fn, *, wait_s: int = 300):
+        deadline = time.time() + wait_s
+        with self._cond:
+            while True:
+                if self._idle:
+                    return self._idle.pop().env
+
+                if self._total < self.max_size:
+                    self._total += 1
+                    break
+
+                remaining = deadline - time.time()
+                if remaining <= 0:
+                    # As a last resort, allow temporary oversubscription.
+                    self._total += 1
+                    break
+
+                self._cond.wait(timeout=min(5, remaining))
+
+        # Create outside lock (slow)
+        try:
+            return create_fn()
+        except Exception:
+            # Roll back count if create fails
+            with self._cond:
+                self._total -= 1
+                self._cond.notify()
+            raise
+
+    def release(self, env: object):
+        with self._cond:
+            self._idle.append(_PooledEnv(env=env, created_at=time.time()))
+            self._cond.notify()
+
+
+# Global pool (process-level)
+_global_pool: Optional[ModalEnvPool] = None
+
+
+def get_global_pool() -> ModalEnvPool:
+    global _global_pool
+    if _global_pool is None:
+        max_size = int(os.getenv("TERMINAL_MODAL_POOL_MAX", os.getenv("TERMINAL_MODAL_POOL_SIZE", "4")))
+        _global_pool = ModalEnvPool(max_size=max_size)
+    return _global_pool
+
+
+class ModalPooledTaskEnvironment:
+    """Per-task environment wrapper that leases a pooled Modal env."""
+
+    def __init__(self, *, inner, base_cwd: str, timeout: int, task_id: str):
+        self._inner = inner
+        self.timeout = timeout
+        self.task_id = task_id or str(uuid.uuid4())
+        self.base_cwd = base_cwd.rstrip("/") or "/root"
+        self.cwd = f"{self.base_cwd}/hermes_tasks/{self.task_id}"
+
+        # Ensure workdir exists and is empty-ish
+        self._inner.execute(f"mkdir -p {self.cwd} && rm -rf {self.cwd}/*", cwd="/", timeout=60)
+
+    @classmethod
+    def acquire(cls, *, image: str, base_cwd: str, timeout: int, task_id: str, create_modal_env_fn):
+        pool = get_global_pool()
+        inner = pool.acquire(create_modal_env_fn)
+        return cls(inner=inner, base_cwd=base_cwd, timeout=timeout, task_id=task_id)
+
+    def execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict:
+        # Always execute in the task workdir unless an explicit cwd is given.
+        workdir = cwd or self.cwd
+        return self._inner.execute(command, cwd=workdir, timeout=timeout or self.timeout)
+
+    def cleanup(self):
+        # Best-effort cleanup of task directory, then return to pool
+        try:
+            self._inner.execute(f"rm -rf {self.cwd}", cwd="/", timeout=60)
+        except Exception:
+            pass
+        get_global_pool().release(self._inner)
+
+    def stop(self):
+        self.cleanup()
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@@ -1223,7 +1223,7 @@ def _get_env_config() -> Dict[str, Any]:
    }


-def _create_environment(env_type: str, image: str, cwd: str, timeout: int, ssh_config: dict = None):
+def _create_environment(env_type: str, image: str, cwd: str, timeout: int, task_id: str = "", ssh_config: dict = None):
    """
    Create an execution environment from mini-swe-agent.
    
@@ -1250,7 +1250,27 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int, ssh_c
        return _SingularityEnvironment(image=image, cwd=cwd, timeout=timeout)
    
    elif env_type == "modal":
-        # Use custom Modal wrapper with sudo support
+        # Use Modal backend.
+        # Default is a dedicated Modal sandbox per task.
+        # Optional: set TERMINAL_MODAL_MODE=pool to reuse a pool of warm sandboxes.
+        mode = os.getenv("TERMINAL_MODAL_MODE", "default")
+        if mode == "pool":
+            # Lazy import to avoid overhead when not using the pool
+            from tools.modal_pool import ModalPooledTaskEnvironment, get_global_pool
+
+            pool = get_global_pool()
+
+            def _create_inner():
+                # Create a fresh Modal environment (expensive)
+                return _ModalEnvironment(image=image, cwd=cwd, timeout=timeout)
+
+            return ModalPooledTaskEnvironment.acquire(
+                image=image,
+                base_cwd=cwd,
+                timeout=timeout,
+                task_id=task_id or str(uuid.uuid4()),
+                create_modal_env_fn=_create_inner,
+            )
        return _ModalEnvironment(image=image, cwd=cwd, timeout=timeout)
    
    elif env_type == "ssh":
@@ -1578,6 +1598,7 @@ def terminal_tool(
                            image=image,
                            cwd=cwd,
                            timeout=effective_timeout,
+                            task_id=effective_task_id,
                            ssh_config=ssh_config
                        )
                    except ImportError as e: