# Mirror of https://github.com/NousResearch/hermes-agent.git
# Synced 2026-04-29 07:21:37 +08:00
# Terminal-Bench 2.0 Evaluation -- Default Configuration
#
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
# and OpenRouter for inference.
#
# Usage:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml
#
# Override model:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml \
#     --openai.model_name anthropic/claude-sonnet-4

|
# Environment / agent-loop settings for the TB2 evaluation run.
env:
  # Agent system prompt; literal block scalar (|) preserves line breaks.
  system_prompt: |
    You are an expert software engineer and Linux systems administrator solving a task in an isolated Linux environment. You have terminal and file tools available.

    Before each action, briefly analyze what you see and plan your next step. After running a command, read its output carefully before proceeding.

    Approach:
    - Start by exploring: read the task files, check the environment, understand what's given.
    - Break complex problems into steps. Solve and verify each step before moving on.
    - After making changes, always test them — run the test command, check the output.
    - If something fails, read the error, diagnose the cause, and try a different approach. Do not give up or repeat the same failing command.
    - Do not stop until you have verified your solution works.
  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 60
  max_token_length: 32000
  agent_temperature: 0.8
  terminal_backend: "modal"
  terminal_timeout: 300  # 5 min per command (builds, pip install)
  tool_pool_size: 128  # thread pool for 89 parallel tasks
  dataset_name: "sidbin/terminal-bench-2-verified-flattened"
  test_timeout: 600
  task_timeout: 1800  # 30 min wall-clock per task, auto-FAIL if exceeded
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: true
  wandb_name: "terminal-bench-2"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
  # CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
  # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
  # are created simultaneously inside thread pool workers via asyncio.run().
  max_concurrent_tasks: 8
|
# Inference endpoint settings (OpenRouter via the OpenAI-compatible API).
openai:
  base_url: "https://openrouter.ai/api/v1"
  model_name: "openai/gpt-oss-120b:nitro"
  server_type: "openai"
  health_check: false
  timeout: 300  # 5 min per API call (default 1200s causes 20min stalls)
  # api_key loaded from OPENROUTER_API_KEY in .env