wip: run tb2 and fix modal instantiation

This commit is contained in:
alt-glitch
2026-04-01 16:55:03 -07:00
parent 3baafea380
commit 3befb9389f
7 changed files with 90 additions and 23 deletions

View File

@@ -19,11 +19,11 @@ env:
max_token_length: 32000
agent_temperature: 0.8
terminal_backend: "modal"
terminal_timeout: 300 # 5 min per command (builds, pip install)
tool_pool_size: 128 # thread pool for 89 parallel tasks
dataset_name: "NousResearch/terminal-bench-2"
terminal_timeout: 300 # 5 min per command (builds, pip install)
tool_pool_size: 128 # thread pool for 89 parallel tasks
dataset_name: "sidbin/terminal-bench-2-verified-flattened"
test_timeout: 600
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
use_wandb: true
wandb_name: "terminal-bench-2"
@@ -36,7 +36,8 @@ env:
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-opus-4.6"
model_name: "openai/gpt-oss-120b:nitro"
server_type: "openai"
health_check: false
timeout: 300 # 5 min per API call (default 1200s causes 20 min stalls)
# api_key loaded from OPENROUTER_API_KEY in .env