diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml index 2a3647f908b..9797eb0918c 100644 --- a/environments/benchmarks/terminalbench_2/default.yaml +++ b/environments/benchmarks/terminalbench_2/default.yaml @@ -14,19 +14,6 @@ # --openai.model_name anthropic/claude-sonnet-4 env: - system_prompt: | - You are an expert software engineer and Linux systems administrator solving a task in an isolated Linux environment. You have terminal and file tools available. - - Before each action, briefly analyze what you see and plan your next step. After running a command, read its output carefully before proceeding. - - Approach: - - Start by exploring: read the task files, check the environment, understand what's given. - - Break complex problems into steps. Solve and verify each step before moving on. - - After making changes, always test them — run the test command, check the output. - - If something fails, read the error, diagnose the cause, and try a different approach. Do not give up or repeat the same failing command. - - Do not stop until you have verified your solution works. - - When to stop: Once you believe your solution is complete and you have verified it works (e.g. the program runs correctly, the output looks right, the file is in place), respond with a plain text message summarizing what you did. Do NOT make any more tool calls after that. enabled_toolsets: ["terminal", "file"] max_agent_turns: 100 max_token_length: 32000 diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py index 748d706e249..6e22ddaaf93 100644 --- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py +++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py @@ -56,6 +56,7 @@ from atroposlib.envs.base import EvalHandlingEnum from atroposlib.envs.server_handling.server_manager import APIServerConfig from pydantic import Field +from agent.prompt_builder import DEFAULT_AGENT_IDENTITY from environments.agent_loop import AgentResult, HermesAgentLoop from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig from environments.tool_context import ToolContext @@ -146,11 +147,14 @@ MODAL_INCOMPATIBLE_TASKS = { # Injected as a user message when the model responds with plain text instead of # calling a tool or including a tag. _FORMAT_NUDGE_MESSAGE = ( - "Your response must be one of the following:\n" - "1. A tool call (e.g. terminal, read_file, write_file) to continue working on the task.\n" - "2. DONE — if you have fully completed the task.\n" - "3. UNFINISHED — if you are unable to complete the task.\n\n" - "Plain text responses are not accepted. Please continue working or report your final status." + "You wrote a plain text response instead of using your tools. " + "Plain text responses do not affect the environment — nothing was executed or saved.\n\n" + "You MUST use your tools (terminal, read_file, write_file) to actually complete the task. " + "Do not describe what you would do — execute it now by making tool calls.\n\n" + "If you have already completed all required work using tools in previous turns, " + "respond with exactly: DONE\n" + "If you have exhausted all approaches and cannot make further progress, " + "respond with exactly: UNFINISHED" ) # Maximum number of format nudges before giving up and moving on to scoring. @@ -279,7 +283,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): max_agent_turns=60, max_token_length=16000, agent_temperature=0.6, - system_prompt=None, + system_prompt=DEFAULT_AGENT_IDENTITY, # Modal backend for per-task cloud-isolated sandboxes terminal_backend="modal", terminal_timeout=300, # 5 min per command (builds, pip install, etc.)