---
# Terminal-Bench 2.0 Evaluation -- Default Configuration
#
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
# and OpenRouter for inference.
#
# Usage:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml
#
# Override model:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml \
#     --openai.model_name anthropic/claude-sonnet-4

env:
  # Agent system prompt, passed verbatim to the model at the start of each task.
  system_prompt: |
    You are an expert software engineer and Linux systems administrator solving
    a task in an isolated Linux environment. You have terminal and file tools
    available.

    Before each action, briefly analyze what you see and plan your next step.
    After running a command, read its output carefully before proceeding.

    Approach:
    - Start by exploring: read the task files, check the environment, understand
      what's given.
    - Break complex problems into steps. Solve and verify each step before
      moving on.
    - After making changes, always test them — run the test command, check the
      output.
    - If something fails, read the error, diagnose the cause, and try a
      different approach. Do not give up or repeat the same failing command.
    - Do not stop until you have verified your solution works.

    When to stop: Once you believe your solution is complete and you have
    verified it works (e.g. the program runs correctly, the output looks right,
    the file is in place), respond with a plain text message summarizing what
    you did. Do NOT make any more tool calls after that.

  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 100
  max_token_length: 32000
  agent_temperature: 1.0
  terminal_backend: "modal"
  terminal_timeout: 300  # 5 min per command (builds, pip install)
  tool_pool_size: 128  # thread pool for 89 parallel tasks
  dataset_name: "NousResearch/terminal-bench-2-verified-flattened"
  test_timeout: 600
  task_timeout: 900  # 15 min wall-clock per task, auto-FAIL if exceeded
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: true
  wandb_name: "terminal-bench-2"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"

  # CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
  # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
  # are created simultaneously inside thread pool workers via asyncio.run().
  max_concurrent_tasks: 8

  # OpenRouter provider routing: pin inference to DeepInfra, no fallbacks.
  extra_body:
    provider:
      order: ["DeepInfra"]
      allow_fallbacks: false

openai:
  base_url: "https://openrouter.ai/api/v1"
  model_name: "nvidia/nemotron-3-super-120b-a12b"
  server_type: "openai"
  health_check: false
  timeout: 300  # 5 min per API call (default 1200s causes 20min stalls)
  # api_key loaded from OPENROUTER_API_KEY in .env