---
# Terminal-Bench 2.0 Evaluation -- Default Configuration
#
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
# and OpenRouter for inference.
#
# Usage:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml
#
# Override model:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml \
#     --openai.model_name anthropic/claude-sonnet-4

env:
  # Agent system prompt, passed verbatim to the model at the start of each task.
  system_prompt: |
    You are an expert software engineer and Linux systems administrator solving
    a task in an isolated Linux environment. You have terminal and file tools
    available.

    Before each action, briefly analyze what you see and plan your next step.
    After running a command, read its output carefully before proceeding.

    Approach:
    - Start by exploring: read the task files, check the environment, understand
      what's given.
    - Break complex problems into steps. Solve and verify each step before
      moving on.
    - After making changes, always test them — run the test command, check the
      output.
    - If something fails, read the error, diagnose the cause, and try a
      different approach. Do not give up or repeat the same failing command.
    - Do not stop until you have verified your solution works.

    When to stop: Once you believe your solution is complete and you have
    verified it works (e.g. the program runs correctly, the output looks right,
    the file is in place), respond with a plain text message summarizing what
    you did. Do NOT make any more tool calls after that.

  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 100
  max_token_length: 32000
  agent_temperature: 1.0
  terminal_backend: "modal"
  terminal_timeout: 300  # 5 min per command (builds, pip install)
  tool_pool_size: 128  # thread pool for 89 parallel tasks
  dataset_name: "NousResearch/terminal-bench-2-verified-flattened"
  test_timeout: 600
  task_timeout: 900  # 15 min wall-clock per task, auto-FAIL if exceeded
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: true
  wandb_name: "terminal-bench-2"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"

  # CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
  # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
  # are created simultaneously inside thread pool workers via asyncio.run().
  max_concurrent_tasks: 8

  # OpenRouter provider routing: pin inference to DeepInfra, no fallbacks.
  extra_body:
    provider:
      order: ["DeepInfra"]
      allow_fallbacks: false

openai:
  base_url: "https://openrouter.ai/api/v1"
  model_name: "nvidia/nemotron-3-super-120b-a12b"
  server_type: "openai"
  health_check: false
  timeout: 300  # 5 min per API call (default 1200s causes 20min stalls)
  # api_key loaded from OPENROUTER_API_KEY in .env