diff --git a/environments/agent_loop.py b/environments/agent_loop.py index 891ce42f448..880f2ca797b 100644 --- a/environments/agent_loop.py +++ b/environments/agent_loop.py @@ -225,20 +225,35 @@ class HermesAgentLoop: chat_kwargs["extra_body"] = self.extra_body # Make the API call -- standard OpenAI spec + # Retry on timeout/connection errors (provider queuing, rate limits) api_start = _time.monotonic() - try: - response = await self.server.chat_completion(**chat_kwargs) - except Exception as e: - api_elapsed = _time.monotonic() - api_start - logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e) - return AgentResult( - messages=messages, - managed_state=self._get_managed_state(), - turns_used=turn + 1, - finished_naturally=False, - reasoning_per_turn=reasoning_per_turn, - tool_errors=tool_errors, - ) + response = None + max_retries = 3 + for attempt in range(max_retries): + try: + response = await self.server.chat_completion(**chat_kwargs) + break + except Exception as e: + api_elapsed = _time.monotonic() - api_start + is_retryable = "timeout" in type(e).__name__.lower() or "connection" in type(e).__name__.lower() + if is_retryable and attempt < max_retries - 1: + wait = 2 ** attempt + logger.warning( + "[%s] API call timed out on turn %d attempt %d (%.1fs), retrying in %ds: %s", + self.task_id[:8], turn + 1, attempt + 1, api_elapsed, wait, e, + ) + await asyncio.sleep(wait) + api_start = _time.monotonic() + continue + logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e) + return AgentResult( + messages=messages, + managed_state=self._get_managed_state(), + turns_used=turn + 1, + finished_naturally=False, + reasoning_per_turn=reasoning_per_turn, + tool_errors=tool_errors, + ) api_elapsed = _time.monotonic() - api_start diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml index eb675b12e70..4a781cc3212 100644 --- 
a/environments/benchmarks/terminalbench_2/default.yaml +++ b/environments/benchmarks/terminalbench_2/default.yaml @@ -19,11 +19,11 @@ env: max_token_length: 32000 agent_temperature: 0.8 terminal_backend: "modal" - terminal_timeout: 300 # 5 min per command (builds, pip install) - tool_pool_size: 128 # thread pool for 89 parallel tasks - dataset_name: "NousResearch/terminal-bench-2" + terminal_timeout: 300 # 5 min per command (builds, pip install) + tool_pool_size: 128 # thread pool for 89 parallel tasks + dataset_name: "sidbin/terminal-bench-2-verified-flattened" test_timeout: 600 - task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded + task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B" use_wandb: true wandb_name: "terminal-bench-2" @@ -36,7 +36,8 @@ env: openai: base_url: "https://openrouter.ai/api/v1" - model_name: "anthropic/claude-opus-4.6" + model_name: "openai/gpt-oss-120b:nitro" server_type: "openai" health_check: false + timeout: 300 # 5 min per API call (default 1200s causes 20min stalls) # api_key loaded from OPENROUTER_API_KEY in .env diff --git a/environments/benchmarks/terminalbench_2/run_eval.sh b/environments/benchmarks/terminalbench_2/run_eval.sh index ffbe4848065..27eeef41ce7 100755 --- a/environments/benchmarks/terminalbench_2/run_eval.sh +++ b/environments/benchmarks/terminalbench_2/run_eval.sh @@ -32,8 +32,8 @@ export PYTHONUNBUFFERED=1 # These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal export LOGLEVEL=INFO -python terminalbench2_env.py evaluate \ - --config default.yaml \ +uv run python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ + --config environments/benchmarks/terminalbench_2/default.yaml \ "$@" \ 2>&1 | tee "$LOG_FILE" diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py index c7eaff6c4c2..4036890aaa0 100644 --- 
@staticmethod
def _image_needs_add_python(item: Dict[str, Any]) -> bool:
    """Check if the task's base image lacks `python` on PATH.

    Decodes the base64 ``environment_tar`` blob as a gzipped tarball, takes
    the first regular member whose name contains "Dockerfile" and that holds
    at least one FROM instruction, and inspects the base image of that
    Dockerfile's *final* build stage.

    FROM parsing follows the Dockerfile grammar
    ``FROM [--platform=<p>] <image>[:<tag>|@<digest>] [AS <name>]``:
    ``--flag`` tokens are skipped, and a final stage that refers to an
    earlier stage by its alias is resolved to that stage's image.

    Args:
        item: Task record; only the "environment_tar" key is read.

    Returns:
        True when the final base image is not the official ``python`` image
        (ubuntu, debian, etc.), i.e. Modal's add_python parameter is needed.
        False for python images, when there is no tarball / Dockerfile /
        FROM line, or when the blob cannot be decoded.
    """
    environment_tar = item.get("environment_tar", "")
    if not environment_tar:
        return False

    def _is_python_image(ref: str) -> bool:
        # Accept "python", "python:<tag>", "python@<digest>" and the fully
        # qualified Docker Hub spellings of the official image.
        for prefix in ("docker.io/", "library/"):
            if ref.startswith(prefix):
                ref = ref[len(prefix):]
        repo = ref.split("@", 1)[0].split(":", 1)[0]
        return repo == "python"

    def _final_base_image(dockerfile_text: str) -> Optional[str]:
        # Walk every FROM so multi-stage builds are judged by their last
        # stage; `aliases` maps "AS <name>" stage names to resolved images.
        aliases: Dict[str, str] = {}
        final = None
        for line in dockerfile_text.splitlines():
            tokens = line.split()
            if len(tokens) < 2 or tokens[0].upper() != "FROM":
                continue
            # Drop options such as --platform=linux/amd64.
            operands = [tok for tok in tokens[1:] if not tok.startswith("--")]
            if not operands:
                continue
            image = operands[0].lower()
            final = aliases.get(image, image)
            if len(operands) >= 3 and operands[1].upper() == "AS":
                aliases[operands[2].lower()] = final
        return final

    try:
        raw = base64.b64decode(environment_tar)
        with tarfile.open(fileobj=io.BytesIO(raw), mode="r:gz") as tar:
            for member in tar:
                if not member.isfile() or "Dockerfile" not in member.name:
                    continue
                f = tar.extractfile(member)
                if not f:
                    continue
                text = f.read().decode("utf-8", errors="ignore")
                base = _final_base_image(text)
                if base is not None:
                    return not _is_python_image(base)
    except Exception:
        # Deliberately best-effort: an undecodable or corrupt blob must not
        # abort setup; fall back to "no add_python needed".
        return False
    return False
image = "" cwd = overrides.get("cwd") or config["cwd"] + add_python = overrides.get("add_python") logger.info("Creating new %s environment for task %s...", env_type, task_id[:8]) container_config = None @@ -252,6 +253,7 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations: local_config=local_config, task_id=task_id, host_cwd=config.get("host_cwd"), + add_python=add_python, ) with _env_lock: diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index 243127a2958..fb2f50071ab 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -458,6 +458,7 @@ def register_task_env_overrides(task_id: str, overrides: Dict[str, Any]): - modal_image: str -- Path to Dockerfile or Docker Hub image name - docker_image: str -- Docker image name - cwd: str -- Working directory inside the sandbox + - add_python: str -- Python version for Modal's add_python (for images without python on PATH) Args: task_id: The rollout's unique task identifier @@ -584,7 +585,8 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int, ssh_config: dict = None, container_config: dict = None, local_config: dict = None, task_id: str = "default", - host_cwd: str = None): + host_cwd: str = None, + add_python: str = None): """ Create an execution environment for sandboxed command execution. 
@@ -682,6 +684,7 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int, image=image, cwd=cwd, timeout=timeout, modal_sandbox_kwargs=sandbox_kwargs, persistent_filesystem=persistent, task_id=task_id, + add_python=add_python, ) elif env_type == "daytona": @@ -1057,6 +1060,7 @@ def terminal_tool( image = "" cwd = overrides.get("cwd") or config["cwd"] + add_python = overrides.get("add_python") default_timeout = config["timeout"] effective_timeout = timeout or default_timeout @@ -1133,6 +1137,7 @@ def terminal_tool( local_config=local_config, task_id=effective_task_id, host_cwd=config.get("host_cwd"), + add_python=add_python, ) except ImportError as e: return json.dumps({