diff --git a/environments/agent_loop.py b/environments/agent_loop.py
index 880f2ca797b..35f6ebe7e25 100644
--- a/environments/agent_loop.py
+++ b/environments/agent_loop.py
@@ -138,6 +138,7 @@ class HermesAgentLoop:
         max_turns: int = 30,
         task_id: Optional[str] = None,
         temperature: float = 1.0,
+        top_p: Optional[float] = None,
         max_tokens: Optional[int] = None,
         extra_body: Optional[Dict[str, Any]] = None,
         budget_config: Optional["BudgetConfig"] = None,
@@ -153,6 +154,7 @@ class HermesAgentLoop:
             max_turns: Maximum number of LLM calls before stopping
             task_id: Unique ID for terminal/browser session isolation
             temperature: Sampling temperature for generation
+            top_p: Nucleus sampling top_p (None = omit, use provider default)
            max_tokens: Max tokens per generation (None for server default)
             extra_body: Extra parameters passed to the OpenAI client's create()
                 call. Used for OpenRouter provider preferences, transforms, etc.
@@ -168,6 +170,7 @@ class HermesAgentLoop:
         self.max_turns = max_turns
         self.task_id = task_id or str(uuid.uuid4())
         self.temperature = temperature
+        self.top_p = top_p
         self.max_tokens = max_tokens
         self.extra_body = extra_body
         self.budget_config = budget_config or DEFAULT_BUDGET
@@ -211,6 +214,9 @@ class HermesAgentLoop:
             "temperature": self.temperature,
         }
 
+        if self.top_p is not None:
+            chat_kwargs["top_p"] = self.top_p
+
         # Only pass tools if we have them
         if self.tool_schemas:
             chat_kwargs["tools"] = self.tool_schemas
diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml
index 8f0ddaa2368..2a3647f908b 100644
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@@ -28,13 +28,13 @@ env:
     When to stop: Once you believe your solution is complete and you have verified it works (e.g. the program runs correctly, the output looks right, the file is in place), respond with a plain text message summarizing what you did. Do NOT make any more tool calls after that.
 
   enabled_toolsets: ["terminal", "file"]
-  max_agent_turns: 60
+  max_agent_turns: 100
   max_token_length: 32000
-  agent_temperature: 0.8
+  agent_temperature: 1.0
   terminal_backend: "modal"
   terminal_timeout: 300  # 5 min per command (builds, pip install)
   tool_pool_size: 128  # thread pool for 89 parallel tasks
-  dataset_name: "sidbin/terminal-bench-2-verified-flattened"
+  dataset_name: "NousResearch/terminal-bench-2-verified-flattened"
   test_timeout: 600
   task_timeout: 900  # 15 min wall-clock per task, auto-FAIL if exceeded
   tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
@@ -46,10 +46,14 @@ env:
   # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
   # are created simultaneously inside thread pool workers via asyncio.run().
   max_concurrent_tasks: 8
+  extra_body:
+    provider:
+      order: ["DeepInfra"]
+      allow_fallbacks: false
 
 openai:
   base_url: "https://openrouter.ai/api/v1"
-  model_name: "qwen/qwen3.5-122b-a10b:nitro"
+  model_name: "nvidia/nemotron-3-super-120b-a12b"
   server_type: "openai"
   health_check: false
   timeout: 300  # 5 min per API call (default 1200s causes 20min stalls)
diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 85b8ed31977..748d706e249 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -52,18 +52,17 @@ _repo_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_repo_root) not in sys.path:
     sys.path.insert(0, str(_repo_root))
 
-from pydantic import Field
-
 from atroposlib.envs.base import EvalHandlingEnum
 from atroposlib.envs.server_handling.server_manager import APIServerConfig
+from pydantic import Field
 
 from environments.agent_loop import AgentResult, HermesAgentLoop
 from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
 from environments.tool_context import ToolContext
 from tools.terminal_tool import (
-    register_task_env_overrides,
-    clear_task_env_overrides,
     cleanup_vm,
+    clear_task_env_overrides,
+    register_task_env_overrides,
 )
 
 logger = logging.getLogger(__name__)
@@ -73,6 +72,7 @@ logger = logging.getLogger(__name__)
 # =============================================================================
 # Configuration
 # =============================================================================
+
 class TerminalBench2EvalConfig(HermesAgentEnvConfig):
     """
     Configuration for the Terminal-Bench 2.0 evaluation environment.
@@ -138,11 +138,24 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
 # Tasks that cannot run properly on Modal and are excluded from scoring.
 MODAL_INCOMPATIBLE_TASKS = {
-    "qemu-startup",      # Needs KVM/hardware virtualization
-    "qemu-alpine-ssh",   # Needs KVM/hardware virtualization
-    "crack-7z-hash",     # Password brute-force -- too slow for cloud sandbox timeouts
+    "qemu-startup",  # Needs KVM/hardware virtualization
+    "qemu-alpine-ssh",  # Needs KVM/hardware virtualization
+    "crack-7z-hash",  # Password brute-force -- too slow for cloud sandbox timeouts
 }
 
+# Injected as a user message when the model responds with plain text instead of
+# calling a tool or including a <task_status> tag.
+_FORMAT_NUDGE_MESSAGE = (
+    "Your response must be one of the following:\n"
+    "1. A tool call (e.g. terminal, read_file, write_file) to continue working on the task.\n"
+    "2. <task_status>DONE</task_status> — if you have fully completed the task.\n"
+    "3. <task_status>UNFINISHED</task_status> — if you are unable to complete the task.\n\n"
+    "Plain text responses are not accepted. Please continue working or report your final status."
+)
+
+# Maximum number of format nudges before giving up and moving on to scoring.
+_MAX_FORMAT_NUDGES = 3
+
 
 # =============================================================================
 # Tar extraction helper
@@ -203,7 +216,6 @@ def _safe_extract_tar(tar: tarfile.TarFile, target_dir: Path) -> None:
         except OSError:
             pass
 
-
 def _extract_base64_tar(b64_data: str, target_dir: Path):
     """Extract a base64-encoded tar.gz archive into target_dir."""
     if not b64_data:
@@ -218,6 +230,7 @@ def _extract_base64_tar(b64_data: str, target_dir: Path):
 # =============================================================================
 # Main Environment
 # =============================================================================
+
 class TerminalBench2EvalEnv(HermesAgentBaseEnv):
     """
     Terminal-Bench 2.0 evaluation environment (eval-only, no training).
@@ -262,23 +275,18 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             enabled_toolsets=["terminal", "file"],
             disabled_toolsets=None,
             distribution=None,
-
             # Agent settings -- TB2 tasks are complex, need many turns
             max_agent_turns=60,
             max_token_length=16000,
             agent_temperature=0.6,
             system_prompt=None,
-
             # Modal backend for per-task cloud-isolated sandboxes
             terminal_backend="modal",
-            terminal_timeout=300, # 5 min per command (builds, pip install, etc.)
-
+            terminal_timeout=300,  # 5 min per command (builds, pip install, etc.)
             # Test execution timeout (TB2 test scripts can install deps like pytest)
             test_timeout=180,
-
             # 89 tasks run in parallel, each needs a thread for tool calls
             tool_pool_size=128,
-
             # --- Eval-only Atropos settings ---
             # These settings make the env work as an eval-only environment:
             # - STOP_TRAIN: pauses training during eval (standard for eval envs)
@@ -288,7 +296,6 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             group_size=1,
             steps_per_eval=1,
             total_steps=1,
-
             tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
             use_wandb=True,
             wandb_name="terminal-bench-2",
@@ -336,7 +343,11 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
 
         # Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
         # plus any user-specified skip_tasks
-        skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
+        skip = (
+            set(MODAL_INCOMPATIBLE_TASKS)
+            if self.config.terminal_backend == "modal"
+            else set()
+        )
         if self.config.skip_tasks:
             skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
         if skip:
@@ -344,7 +355,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             tasks = [t for t in tasks if t["task_name"] not in skip]
             skipped = before - len(tasks)
             if skipped > 0:
-                print(f"  Skipped {skipped} incompatible tasks: {sorted(skip & {t['task_name'] for t in ds})}")
+                print(
+                    f"  Skipped {skipped} incompatible tasks: {sorted(skip & {t['task_name'] for t in ds})}"
+                )
 
         self.all_eval_items = tasks
         self.iter = 0
@@ -371,24 +384,30 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         # immediately on completion so data is preserved even on Ctrl+C.
         # Timestamped filename so each run produces a unique file.
         import datetime
+
         log_dir = os.path.join(os.path.dirname(__file__), "logs")
         os.makedirs(log_dir, exist_ok=True)
         run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         model_name = self.server.servers[0].config.model_name
         model_slug = model_name.replace("/", "_").replace(":", "_")
-        self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}_{model_slug}.jsonl")
+        self._streaming_path = os.path.join(
+            log_dir, f"samples_{run_ts}_{model_slug}.jsonl"
+        )
         self._streaming_file = open(self._streaming_path, "w")
         self._streaming_lock = __import__("threading").Lock()
         self._run_meta = {
             "model_name": model_name,
             "temperature": self.config.agent_temperature,
+            "top_p": self.config.agent_top_p,
             "max_agent_turns": self.config.max_agent_turns,
             "task_timeout": self.config.task_timeout,
             "terminal_backend": self.config.terminal_backend,
         }
         print(f"  Streaming results to: {self._streaming_path}")
 
-        print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
+        print(
+            f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories"
+        )
         for cat, indices in sorted(self.category_index.items()):
             print(f"  {cat}: {len(indices)} tasks")
@@ -397,7 +416,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
             return
         with self._streaming_lock:
-            self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
+            self._streaming_file.write(
+                json.dumps(result, ensure_ascii=False, default=str) + "\n"
+            )
             self._streaming_file.flush()
 
     # =========================================================================
@@ -495,7 +516,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         if dockerfile_path.exists():
             logger.info(
                 "Task %s: building from Dockerfile (force_build=%s, docker_image=%s)",
-                task_name, self.config.force_build, bool(docker_image),
+                task_name,
+                self.config.force_build,
+                bool(docker_image),
             )
             return str(dockerfile_path), task_dir
 
@@ -503,12 +526,80 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         if docker_image:
             logger.warning(
                 "Task %s: force_build=True but no environment_tar, "
-                "falling back to docker_image %s", task_name, docker_image,
+                "falling back to docker_image %s",
+                task_name,
+                docker_image,
             )
             return docker_image, None
 
         return "", None
 
+    # =========================================================================
+    # Agent loop with format nudging
+    # =========================================================================
+
+    async def _run_with_nudges(
+        self,
+        server,
+        tools: List[Dict[str, Any]],
+        valid_names: set,
+        messages: List[Dict[str, Any]],
+        task_id: str,
+        task_name: str,
+    ) -> Tuple["AgentResult", int]:
+        """Run the agent loop, nudging if the model returns plain text without a task_status tag."""
+        total_turns_used = 0
+        nudge_count = 0
+        result = None
+
+        while total_turns_used < self.config.max_agent_turns:
+            remaining = self.config.max_agent_turns - total_turns_used
+            agent = HermesAgentLoop(
+                server=server,
+                tool_schemas=tools,
+                valid_tool_names=valid_names,
+                max_turns=remaining,
+                task_id=task_id,
+                temperature=self.config.agent_temperature,
+                top_p=self.config.agent_top_p,
+                max_tokens=self.config.max_token_length,
+                extra_body=self.config.extra_body,
+            )
+            result = await agent.run(messages)
+            total_turns_used += result.turns_used
+
+            if not result.finished_naturally:
+                break
+
+            last_content = next(
+                (
+                    m.get("content", "") or ""
+                    for m in reversed(messages)
+                    if m.get("role") == "assistant"
+                ),
"", + ) + if "" in last_content: + break + + if nudge_count >= _MAX_FORMAT_NUDGES: + logger.warning( + "Task %s: model ignored %d format nudges; stopping.", + task_name, + nudge_count, + ) + break + nudge_count += 1 + logger.info( + "Task %s: nudging model (nudge %d/%d) — no tool calls and no task_status", + task_name, + nudge_count, + _MAX_FORMAT_NUDGES, + ) + messages.append({"role": "user", "content": _FORMAT_NUDGE_MESSAGE}) + + return result, total_turns_used + # ========================================================================= # Per-task evaluation -- agent loop + test verification # ========================================================================= @@ -537,6 +628,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): task_dir = None # Set if we extract a Dockerfile (needs cleanup) from tqdm import tqdm + tqdm.write(f" [START] {task_name} (task_id={task_id[:8]})") task_start = time.time() @@ -544,10 +636,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): # --- 1. Resolve Docker image --- modal_image, task_dir = self._resolve_task_image(eval_item, task_name) if not modal_image: - logger.error("Task %s: no docker_image or environment_tar, skipping", task_name) + logger.error( + "Task %s: no docker_image or environment_tar, skipping", task_name + ) return { - "passed": False, "reward": 0.0, - "task_name": task_name, "category": category, + "passed": False, + "reward": 0.0, + "task_name": task_name, + "category": category, "error": "no_image", } @@ -564,7 +660,8 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): register_task_env_overrides(task_id, overrides) logger.info( "Task %s: registered image override for task_id %s", - task_name, task_id[:8], + task_name, + task_id[:8], ) # --- 3. Resolve tools and build messages --- @@ -572,53 +669,48 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): messages: List[Dict[str, Any]] = [] if self.config.system_prompt: - messages.append({"role": "system", "content": self.config.system_prompt}) + messages.append( + {"role": "system", "content": self.config.system_prompt} + ) messages.append({"role": "user", "content": self.format_prompt(eval_item)}) - # --- 4. Run agent loop --- - # Use ManagedServer (Phase 2) for vLLM/SGLang backends to get - # token-level tracking via /generate. Falls back to direct - # ServerManager (Phase 1) for OpenAI endpoints. + # --- 4. Run agent loop with format enforcement --- + # The model must either call a tool or end with DONE/UNFINISHED. + # If it returns plain text without the tag, inject a nudge user message and + # continue with the remaining turn budget (up to _MAX_FORMAT_NUDGES times). 
             if self._use_managed_server():
                 async with self.server.managed_server(
                     tokenizer=self.tokenizer,
                     preserve_think_blocks=bool(self.config.thinking_mode),
                 ) as managed:
-                    agent = HermesAgentLoop(
+                    result, total_turns_used = await self._run_with_nudges(
                         server=managed,
-                        tool_schemas=tools,
-                        valid_tool_names=valid_names,
-                        max_turns=self.config.max_agent_turns,
+                        tools=tools,
+                        valid_names=valid_names,
+                        messages=messages,
                         task_id=task_id,
-                        temperature=self.config.agent_temperature,
-                        max_tokens=self.config.max_token_length,
-                        extra_body=self.config.extra_body,
-                        budget_config=self.config.build_budget_config(),
+                        task_name=task_name,
                     )
-                    result = await agent.run(messages)
             else:
-                agent = HermesAgentLoop(
+                result, total_turns_used = await self._run_with_nudges(
                     server=self.server,
-                    tool_schemas=tools,
-                    valid_tool_names=valid_names,
-                    max_turns=self.config.max_agent_turns,
+                    tools=tools,
+                    valid_names=valid_names,
+                    messages=messages,
                     task_id=task_id,
-                    temperature=self.config.agent_temperature,
-                    max_tokens=self.config.max_token_length,
-                    extra_body=self.config.extra_body,
-                    budget_config=self.config.build_budget_config(),
+                    task_name=task_name,
                 )
-                result = await agent.run(messages)
 
             # --- 5. Verify -- run test suite in the agent's sandbox ---
             # Skip verification if the agent produced no meaningful output
             only_system_and_user = all(
-                msg.get("role") in ("system", "user") for msg in result.messages
+                msg.get("role") in ("system", "user") for msg in messages
             )
-            if result.turns_used == 0 or only_system_and_user:
+            if total_turns_used == 0 or only_system_and_user:
                 logger.warning(
                     "Task %s: agent produced no output (turns=%d). Reward=0.",
-                    task_name, result.turns_used,
+                    task_name,
+                    total_turns_used,
                 )
                 reward = 0.0
             else:
@@ -630,7 +722,10 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                 loop = asyncio.get_event_loop()
                 reward = await loop.run_in_executor(
                     None,  # default thread pool
-                    self._run_tests, eval_item, ctx, task_name,
+                    self._run_tests,
+                    eval_item,
+                    ctx,
+                    task_name,
                 )
             except Exception as e:
                 logger.error("Task %s: test verification failed: %s", task_name, e)
@@ -641,10 +736,15 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             passed = reward == 1.0
             status = "PASS" if passed else "FAIL"
             elapsed = time.time() - task_start
-            tqdm.write(f"  [{status}] {task_name} (turns={result.turns_used}, {elapsed:.0f}s)")
+            tqdm.write(
+                f"  [{status}] {task_name} (turns={total_turns_used}, {elapsed:.0f}s)"
+            )
             logger.info(
                 "Task %s: reward=%.1f, turns=%d, finished=%s",
-                task_name, reward, result.turns_used, result.finished_naturally,
+                task_name,
+                reward,
+                total_turns_used,
+                result.finished_naturally,
             )
 
             out = {
@@ -653,9 +753,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                 "reward": reward,
                 "task_name": task_name,
                 "category": category,
-                "turns_used": result.turns_used,
+                "turns_used": total_turns_used,
                 "finished_naturally": result.finished_naturally,
-                "messages": result.messages,
+                "messages": messages,
             }
             self._save_result(out)
             return out
@@ -666,8 +766,10 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             tqdm.write(f"  [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
             out = {
                 **self._run_meta,
-                "passed": False, "reward": 0.0,
-                "task_name": task_name, "category": category,
+                "passed": False,
+                "reward": 0.0,
+                "task_name": task_name,
+                "category": category,
                 "error": str(e),
             }
             self._save_result(out)
@@ -740,7 +842,8 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             # Execute the test suite
             logger.info(
                 "Task %s: running test suite (timeout=%ds)",
-                task_name, self.config.test_timeout,
+                task_name,
+                self.config.test_timeout,
             )
             test_result = ctx.terminal(
                 "bash /tests/test.sh",
@@ -773,7 +876,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                     logger.warning(
                         "Task %s: reward.txt content unexpected (%r), "
                         "falling back to exit_code=%d",
-                        task_name, content, exit_code,
+                        task_name,
+                        content,
+                        exit_code,
                     )
                     reward = 1.0 if exit_code == 0 else 0.0
                 else:
@@ -781,14 +886,17 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                     logger.warning(
                         "Task %s: reward.txt not found after download, "
                         "falling back to exit_code=%d",
-                        task_name, exit_code,
+                        task_name,
+                        exit_code,
                     )
                     reward = 1.0 if exit_code == 0 else 0.0
             except Exception as e:
                 logger.warning(
                     "Task %s: failed to download verifier dir: %s, "
                     "falling back to exit_code=%d",
-                    task_name, e, exit_code,
+                    task_name,
+                    e,
+                    exit_code,
                 )
                 reward = 1.0 if exit_code == 0 else 0.0
             finally:
@@ -799,7 +907,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             output_preview = output[-500:] if output else "(no output)"
             logger.info(
                 "Task %s: FAIL (exit_code=%d)\n%s",
-                task_name, exit_code, output_preview,
+                task_name,
+                exit_code,
+                output_preview,
             )
 
         return reward
@@ -824,13 +934,18 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             )
         except asyncio.TimeoutError:
             from tqdm import tqdm
+
             elapsed = self.config.task_timeout
-            tqdm.write(f"  [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
+            tqdm.write(
+                f"  [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)"
+            )
             logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
             out = {
                 **self._run_meta,
-                "passed": False, "reward": 0.0,
-                "task_name": task_name, "category": category,
+                "passed": False,
+                "reward": 0.0,
+                "task_name": task_name,
+                "category": category,
                 "error": f"timeout ({elapsed}s)",
             }
             self._save_result(out)
@@ -864,23 +979,25 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                     self.handleError(record)
 
         handler = _TqdmHandler()
-        handler.setFormatter(logging.Formatter(
-            "%(asctime)s [%(name)s] %(levelname)s: %(message)s",
-            datefmt="%H:%M:%S",
-        ))
+        handler.setFormatter(
+            logging.Formatter(
+                "%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+                datefmt="%H:%M:%S",
+            )
+        )
         root = logging.getLogger()
         root.handlers = [handler]  # Replace any existing handlers
         root.setLevel(logging.INFO)
 
         # Silence noisy third-party loggers that flood the output
-        logging.getLogger("httpx").setLevel(logging.WARNING)       # Every HTTP request
-        logging.getLogger("openai").setLevel(logging.WARNING)      # OpenAI client retries
-        logging.getLogger("rex-deploy").setLevel(logging.WARNING)  # Swerex deployment
+        logging.getLogger("httpx").setLevel(logging.WARNING)  # Every HTTP request
+        logging.getLogger("openai").setLevel(logging.WARNING)  # OpenAI client retries
+        logging.getLogger("rex-deploy").setLevel(logging.WARNING)  # Swerex deployment
         logging.getLogger("rex_image_builder").setLevel(logging.WARNING)  # Image builds
 
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print("Starting Terminal-Bench 2.0 Evaluation")
-        print(f"{'='*60}")
+        print(f"{'=' * 60}")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Total tasks: {len(self.all_eval_items)}")
         print(f"  Max agent turns: {self.config.max_agent_turns}")
@@ -888,9 +1005,11 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         print(f"  Terminal backend: {self.config.terminal_backend}")
         print(f"  Tool thread pool: {self.config.tool_pool_size}")
         print(f"  Terminal timeout: {self.config.terminal_timeout}s/cmd")
-        print(f"  Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
120)") + print( + f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)" + ) print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") # Semaphore to limit concurrent Modal sandbox creations. # Without this, all 86 tasks fire simultaneously, each creating a Modal @@ -932,6 +1051,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): await asyncio.gather(*eval_tasks, return_exceptions=True) # Belt-and-suspenders: clean up any remaining sandboxes from tools.terminal_tool import cleanup_all_environments + cleanup_all_environments() print("All sandboxes cleaned up.") return @@ -977,9 +1097,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): self.eval_metrics = [(k, v) for k, v in eval_metrics.items()] # ---- Print summary ---- - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print("Terminal-Bench 2.0 Evaluation Results") - print(f"{'='*60}") + print(f"{'=' * 60}") print(f"Overall Pass Rate: {overall_pass_rate:.4f} ({passed}/{total})") print(f"Evaluation Time: {end_time - start_time:.1f} seconds") @@ -999,7 +1119,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): extra = f" (error: {error})" if error else "" print(f" [{status}] {r['task_name']} (turns={turns}){extra}") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") # Build sample records for evaluate_log (includes full conversations) samples = [ @@ -1024,6 +1144,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): end_time=end_time, generation_parameters={ "temperature": self.config.agent_temperature, + "top_p": self.config.agent_top_p, "max_tokens": self.config.max_token_length, "max_agent_turns": self.config.max_agent_turns, "terminal_backend": self.config.terminal_backend, @@ -1040,6 +1161,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): # Kill all remaining sandboxes. Timed-out tasks leave orphaned thread # pool workers still executing commands -- cleanup_all stops them. from tools.terminal_tool import cleanup_all_environments + print("\nCleaning up all sandboxes...") cleanup_all_environments() @@ -1047,6 +1169,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): # tasks are killed immediately instead of retrying against dead # sandboxes and spamming the console with TimeoutError warnings. from environments.agent_loop import _tool_executor + _tool_executor.shutdown(wait=False, cancel_futures=True) print("Done.") diff --git a/environments/hermes_base_env.py b/environments/hermes_base_env.py index ededab355f0..83f1ff30306 100644 --- a/environments/hermes_base_env.py +++ b/environments/hermes_base_env.py @@ -115,6 +115,10 @@ class HermesAgentEnvConfig(BaseEnvConfig): default=1.0, description="Sampling temperature for agent generation during rollouts.", ) + agent_top_p: Optional[float] = Field( + default=None, + description="Nucleus sampling top_p for agent generation. 
None = provider default.", + ) # --- Terminal backend --- terminal_backend: str = Field( @@ -529,6 +533,7 @@ class HermesAgentBaseEnv(BaseEnv): max_turns=self.config.max_agent_turns, task_id=task_id, temperature=self.config.agent_temperature, + top_p=self.config.agent_top_p, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, budget_config=self.config.build_budget_config(), @@ -547,6 +552,7 @@ class HermesAgentBaseEnv(BaseEnv): max_turns=self.config.max_agent_turns, task_id=task_id, temperature=self.config.agent_temperature, + top_p=self.config.agent_top_p, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, budget_config=self.config.build_budget_config(), @@ -561,6 +567,7 @@ class HermesAgentBaseEnv(BaseEnv): max_turns=self.config.max_agent_turns, task_id=task_id, temperature=self.config.agent_temperature, + top_p=self.config.agent_top_p, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, budget_config=self.config.build_budget_config(),