diff --git a/environments/benchmarks/tblite/local.yaml b/environments/benchmarks/tblite/local.yaml new file mode 100644 index 00000000000..35d4b896869 --- /dev/null +++ b/environments/benchmarks/tblite/local.yaml @@ -0,0 +1,38 @@ +# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute) +# +# Runs tasks in Docker containers on the local machine. +# Sandboxed like Modal but no cloud costs. Good for dev/testing. +# +# Usage: +# python environments/benchmarks/tblite/tblite_env.py evaluate \ +# --config environments/benchmarks/tblite/local.yaml +# +# # Override concurrency: +# python environments/benchmarks/tblite/tblite_env.py evaluate \ +# --config environments/benchmarks/tblite/local.yaml \ +# --env.eval_concurrency 4 + +env: + enabled_toolsets: ["terminal", "file"] + max_agent_turns: 60 + max_token_length: 32000 + agent_temperature: 0.8 + terminal_backend: "docker" + terminal_timeout: 300 + tool_pool_size: 16 + dataset_name: "NousResearch/openthoughts-tblite" + test_timeout: 600 + task_timeout: 1200 + eval_concurrency: 8 # max 8 tasks at once + tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B" + use_wandb: false + wandb_name: "openthoughts-tblite-local" + ensure_scores_are_not_same: false + data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local" + +openai: + base_url: "https://openrouter.ai/api/v1" + model_name: "anthropic/claude-sonnet-4" + server_type: "openai" + health_check: false + # api_key loaded from OPENROUTER_API_KEY in .env diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py index ccb65b32624..2d4ae201fef 100644 --- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py +++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py @@ -118,6 +118,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig): "Tasks exceeding this are scored as FAIL. Default 30 minutes.", ) + # --- Eval concurrency --- + eval_concurrency: int = Field( + default=0, + description="Maximum number of tasks to evaluate in parallel. " + "0 means unlimited (all tasks run concurrently). " + "Set to 8 for local backends to avoid overwhelming the machine.", + ) + # Tasks that cannot run properly on Modal and are excluded from scoring. MODAL_INCOMPATIBLE_TASKS = { @@ -429,8 +437,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): "error": "no_image", } - # --- 2. Register per-task Modal image override --- - register_task_env_overrides(task_id, {"modal_image": modal_image}) + # --- 2. Register per-task image override --- + # Set both modal_image and docker_image so the task image is used + # regardless of which backend is configured. + register_task_env_overrides(task_id, { + "modal_image": modal_image, + "docker_image": modal_image, + }) logger.info( "Task %s: registered image override for task_id %s", task_name, task_id[:8], @@ -655,13 +668,19 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict: """ - Wrap rollout_and_score_eval with a per-task wall-clock timeout. + Wrap rollout_and_score_eval with a per-task wall-clock timeout + and optional concurrency limit via semaphore. If the task exceeds task_timeout seconds, it's automatically scored as FAIL. This prevents any single task from hanging indefinitely. """ task_name = item.get("task_name", "unknown") category = item.get("category", "unknown") + + # Acquire concurrency semaphore if configured + if self._eval_semaphore: + await self._eval_semaphore.acquire() + try: return await asyncio.wait_for( self.rollout_and_score_eval(item), @@ -679,6 +698,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): } self._save_result(out) return out + finally: + if self._eval_semaphore: + self._eval_semaphore.release() async def evaluate(self, *args, **kwargs) -> None: """ @@ -696,6 +718,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): """ start_time = time.time() + # Set up concurrency limit if configured + if self.config.eval_concurrency > 0: + self._eval_semaphore = asyncio.Semaphore(self.config.eval_concurrency) + print(f" Eval concurrency: {self.config.eval_concurrency} tasks at a time") + else: + self._eval_semaphore = None + # Route all logging through tqdm.write() so the progress bar stays # pinned at the bottom while log lines scroll above it. from tqdm import tqdm