Compare commits

...

4 Commits

Author    SHA1        Message                   Date
hjc-puro  7d9a1e119d  add prokletor formatter   2025-11-23 10:24:58 -05:00
hjc-puro  e91d9e839a  switch to asyncio         2025-11-22 11:25:23 -05:00
hjc-puro  98321be8b0  gemini fake reasoning     2025-11-22 09:47:00 -05:00
hjc-puro  a219e178a1  support gemini models     2025-11-19 21:14:37 -05:00
7 changed files with 632 additions and 647 deletions

View File

@@ -4,25 +4,25 @@ Batch Agent Runner
 This module provides parallel batch processing capabilities for running the agent
 across multiple prompts from a dataset. It includes:
-- Dataset loading and batching
-- Parallel batch processing with multiprocessing
+- Dataset loading
+- Concurrent processing with asyncio (Producer-Consumer pattern)
 - Checkpointing for fault tolerance and resumption
 - Trajectory saving in the proper format (from/value pairs)
-- Tool usage statistics aggregation across all batches
+- Tool usage statistics aggregation across all prompts
 - Cluster failure detection and graceful shutdown (morph, firecrawl, API errors)
 - Configurable failure thresholds with automatic data consolidation
 Usage:
-    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run
+    python batch_runner.py --dataset_file=data.jsonl --run_name=my_run
     # Resume an interrupted run
-    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run --resume
+    python batch_runner.py --dataset_file=data.jsonl --run_name=my_run --resume
     # Use a specific toolset distribution
-    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run --distribution=image_gen
+    python batch_runner.py --dataset_file=data.jsonl --run_name=my_run --distribution=image_gen
    # Configure tool failure thresholds
-    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
+    python batch_runner.py --dataset_file=data.jsonl --run_name=my_run \\
        --max_tool_failures=20 --max_tool_failure_rate=0.3 --min_tool_calls_for_rate=10
 """
@@ -30,10 +30,10 @@ import json
 import logging
 import os
 import time
+import asyncio
 from pathlib import Path
-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional, Tuple, Set
 from datetime import datetime
-from multiprocessing import Pool, Manager, Lock
 import traceback
 import re
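The imports above show the shift from a multiprocessing Pool to asyncio. The Producer-Consumer pattern the module docstring names works by enqueuing all prompts once and letting a fixed number of consumer tasks drain the queue until each receives a None sentinel. A minimal, self-contained sketch of that pattern, with illustrative names that are not the project's code:

import asyncio

async def consumer(queue: asyncio.Queue, results: asyncio.Queue):
    # Pull work items until a None sentinel arrives
    while True:
        item = await queue.get()
        if item is None:
            queue.task_done()
            break
        results.put_nowait(item * 2)  # stand-in for real processing
        queue.task_done()

async def main():
    queue, results = asyncio.Queue(), asyncio.Queue()
    for i in range(10):              # producer enqueues all work up front
        queue.put_nowait(i)
    workers = [asyncio.create_task(consumer(queue, results)) for _ in range(4)]
    for _ in workers:                # one sentinel per worker triggers shutdown
        queue.put_nowait(None)
    await asyncio.gather(*workers)
    print(results.qsize(), "items processed")

asyncio.run(main())

The sentinel-per-worker shutdown is the same mechanism the batch runner below uses in its finally block.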
@@ -49,9 +49,6 @@ from toolset_distributions import (
 from safe_print import safe_print

-# Global configuration for worker processes
-_WORKER_CONFIG = {}

 # Canonical names for the terminal tool (old & new implementations)
 _TERMINAL_TOOL_NAMES = {"terminal", "terminal_tool", "simple_terminal_tool"}
@@ -295,10 +292,9 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
     return tool_stats

-def _process_single_prompt(
+async def _process_single_prompt(
     prompt_index: int,
     prompt_data: Dict[str, Any],
-    batch_num: int,
     config: Dict[str, Any]
 ) -> Dict[str, Any]:
     """
@@ -307,7 +303,6 @@ def _process_single_prompt(
     Args:
         prompt_index (int): Index of prompt in dataset
         prompt_data (Dict): Prompt data containing 'prompt' field
-        batch_num (int): Batch number
         config (Dict): Configuration dict with agent parameters

     Returns:
@@ -332,11 +327,13 @@ def _process_single_prompt(
         save_trajectories=False,  # We handle saving ourselves
         verbose_logging=config.get("verbose", False),
         ephemeral_system_prompt=config.get("ephemeral_system_prompt"),
-        log_prefix_chars=config.get("log_prefix_chars", 100)
+        log_prefix_chars=config.get("log_prefix_chars", 100),
+        prokletor_client=config.get("prokletor_client"),
+        prokletor_formatter=config.get("prokletor_formatter")
     )

     # Run the agent with task_id to ensure each task gets its own isolated VM
-    result = agent.run_conversation(prompt, task_id=f"task_{prompt_index}")
+    result = await agent.run_conversation(prompt, task_id=f"task_{prompt_index}")

     # Extract tool usage statistics
     tool_stats = _extract_tool_stats(result["messages"])
@@ -365,7 +362,6 @@ def _process_single_prompt(
         "api_calls": result["api_calls"],
         "toolsets_used": selected_toolsets,
         "metadata": {
-            "batch_num": batch_num,
             "timestamp": datetime.now().isoformat(),
             "model": config["model"]
         }
@@ -389,132 +385,38 @@ def _process_single_prompt(
         "tool_stats": {},
         "toolsets_used": [],
         "metadata": {
-            "batch_num": batch_num,
             "timestamp": datetime.now().isoformat()
         }
     }
-def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
-    """
-    Worker function to process a single batch of prompts.
-
-    Args:
-        args (Tuple): (batch_num, batch_data, output_dir, completed_prompts, config)
-
-    Returns:
-        Dict: Batch results with statistics
-    """
-    batch_num, batch_data, output_dir, completed_prompts_set, config = args
-
-    output_dir = Path(output_dir)
-    print(f"\n🔄 Batch {batch_num}: Starting ({len(batch_data)} prompts)")
-
-    # Output file for this batch
-    batch_output_file = output_dir / f"batch_{batch_num}.jsonl"
-
-    # Filter out already completed prompts
-    prompts_to_process = [
-        (idx, data) for idx, data in batch_data
-        if idx not in completed_prompts_set
-    ]
-
-    if not prompts_to_process:
-        print(f"✅ Batch {batch_num}: Already completed (skipping)")
-        return {
-            "batch_num": batch_num,
-            "processed": 0,
-            "skipped": len(batch_data),
-            "tool_stats": {},
-            "completed_prompts": []
-        }
-
-    print(f"    Processing {len(prompts_to_process)} prompts (skipping {len(batch_data) - len(prompts_to_process)} already completed)")
-
-    # Initialize aggregated stats for this batch
-    batch_tool_stats = {}
-    batch_profiling_stats = []  # Collect profiling stats from each prompt
-    completed_in_batch = []
-    all_tool_errors = []  # Track all tool errors in this batch
-    exception_errors = []  # Track top-level exceptions
-
-    # Process each prompt sequentially in this batch
-    for prompt_index, prompt_data in prompts_to_process:
-        # Process the prompt
-        result = _process_single_prompt(
-            prompt_index,
-            prompt_data,
-            batch_num,
-            config
-        )
-
-        # Track tool errors from the conversation
-        if result.get("tool_errors"):
-            for tool_error in result["tool_errors"]:
-                all_tool_errors.append({
-                    "prompt_index": prompt_index,
-                    "tool_name": tool_error["tool_name"],
-                    "error_message": tool_error["error_message"],
-                    "full_content": tool_error.get("full_content", ""),
-                    "error_type": tool_error.get("error_type", "Other")
-                })
-
-        # Track top-level exceptions (not tool errors)
-        if not result["success"]:
-            exception_errors.append({
-                "prompt_index": prompt_index,
-                "error": result.get("error", "Unknown error"),
-                "traceback": result.get("traceback", "")
-            })
-            safe_print(f"[bold red]❌ Exception in prompt {prompt_index}:[/bold red] {result.get('error', '')[:100]}")
-
-        # Save trajectory if successful
-        if result["success"] and result["trajectory"]:
-            trajectory_entry = {
-                "prompt_index": prompt_index,
-                "conversations": result["trajectory"],
-                "metadata": result["metadata"],
-                "completed": result["completed"],
-                "api_calls": result["api_calls"],
-                "toolsets_used": result["toolsets_used"]
-            }
-
-            # Append to batch output file
-            with open(batch_output_file, 'a', encoding='utf-8') as f:
-                f.write(json.dumps(trajectory_entry, ensure_ascii=False) + "\n")
-
-        # Aggregate tool statistics
-        for tool_name, stats in result.get("tool_stats", {}).items():
-            if tool_name not in batch_tool_stats:
-                batch_tool_stats[tool_name] = {
-                    "count": 0,
-                    "success": 0,
-                    "failure": 0
-                }
-            batch_tool_stats[tool_name]["count"] += stats["count"]
-            batch_tool_stats[tool_name]["success"] += stats["success"]
-            batch_tool_stats[tool_name]["failure"] += stats["failure"]
-
-        # Collect profiling statistics
-        if result.get("profiling_stats"):
-            batch_profiling_stats.append(result["profiling_stats"])
-
-        completed_in_batch.append(prompt_index)
-        print(f"    ✅ Prompt {prompt_index} completed")
-
-    print(f"✅ Batch {batch_num}: Completed ({len(prompts_to_process)} prompts processed)")
-
-    return {
-        "batch_num": batch_num,
-        "processed": len(prompts_to_process),
-        "skipped": len(batch_data) - len(prompts_to_process),
-        "tool_stats": batch_tool_stats,
-        "profiling_stats": batch_profiling_stats,
-        "completed_prompts": completed_in_batch,
-        "tool_errors": all_tool_errors,
-        "exception_errors": exception_errors
-    }
+async def worker(
+    work_queue: asyncio.Queue,
+    result_queue: asyncio.Queue,
+    config: Dict[str, Any]
+):
+    """
+    Consumer worker that processes prompts from the work queue.
+    """
+    while True:
+        try:
+            task = await work_queue.get()
+            if task is None:
+                # Sentinel to stop worker
+                work_queue.task_done()
+                break
+
+            prompt_index, prompt_data = task
+
+            result = await _process_single_prompt(prompt_index, prompt_data, config)
+
+            await result_queue.put(result)
+            work_queue.task_done()
+
+        except Exception as e:
+            print(f"Error in worker: {e}")
+            if 'task' in locals() and task is not None:
+                work_queue.task_done()

 class BatchRunner:
@@ -525,7 +427,6 @@ class BatchRunner:
     def __init__(
         self,
         dataset_file: str,
-        batch_size: int,
         run_name: str,
         distribution: str = "default",
         max_iterations: int = 10,
@@ -536,34 +437,36 @@ class BatchRunner:
         verbose: bool = False,
         ephemeral_system_prompt: str = None,
         log_prefix_chars: int = 100,
-        max_tool_failures: int = 10,
+        max_tool_failures: float = float("inf"),
         max_tool_failure_rate: float = 0.5,
         keep_recent_errors: int = 5,
         min_tool_calls_for_rate: int = 10,
+        prokletor_client: str = None,
+        prokletor_formatter: str = None,
     ):
         """
         Initialize the batch runner.

         Args:
             dataset_file (str): Path to the dataset JSONL file with 'prompt' field
-            batch_size (int): Number of prompts per batch
             run_name (str): Name for this run (used for checkpointing and output)
             distribution (str): Toolset distribution to use (default: "default")
             max_iterations (int): Max iterations per agent run
             base_url (str): Base URL for model API
             api_key (str): API key for model
             model (str): Model name to use
-            num_workers (int): Number of parallel workers
+            num_workers (int): Number of parallel workers (default: 4)
             verbose (bool): Enable verbose logging
             ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
             log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 20)
-            max_tool_failures (int): Maximum number of tool failures before stopping (default: 10)
+            max_tool_failures (float): Maximum number of tool failures before stopping (default: inf for unlimited)
             max_tool_failure_rate (float): Maximum tool failure rate (0.0-1.0) before stopping (default: 0.5)
             keep_recent_errors (int): Number of recent errors to keep per tool (default: 5)
             min_tool_calls_for_rate (int): Minimum number of tool calls before checking failure rate (default: 10)
+            prokletor_client (str): Name of the prokletor client to use
+            prokletor_formatter (str): Name of the prokletor formatter to use
         """
         self.dataset_file = Path(dataset_file)
-        self.batch_size = batch_size
         self.run_name = run_name
         self.distribution = distribution
         self.max_iterations = max_iterations
@@ -578,6 +481,8 @@ class BatchRunner:
         self.max_tool_failure_rate = max_tool_failure_rate
         self.keep_recent_errors = keep_recent_errors
         self.min_tool_calls_for_rate = min_tool_calls_for_rate
+        self.prokletor_client = prokletor_client
+        self.prokletor_formatter = prokletor_formatter

         # Validate distribution
         if not validate_distribution(distribution):
@@ -596,16 +501,14 @@ class BatchRunner:
         # Errors file
         self.errors_file = self.output_dir / "errors.json"

+        # Trajectories file
+        self.trajectories_file = self.output_dir / "trajectories.jsonl"

         # Load dataset
         self.dataset = self._load_dataset()

-        # Create batches
-        self.batches = self._create_batches()

         safe_print("[bold cyan]📊 Batch Runner Initialized[/bold cyan]")
         safe_print(f"    Dataset: {self.dataset_file} ({len(self.dataset)} prompts)")
-        safe_print(f"    Batch size: {self.batch_size}")
-        safe_print(f"    Total batches: {len(self.batches)}")
         safe_print(f"    Run name: {self.run_name}")
         safe_print(f"    Distribution: {self.distribution}")
         safe_print(f"    Output directory: {self.output_dir}")
@@ -651,20 +554,6 @@ class BatchRunner:
         return dataset

-    def _create_batches(self) -> List[List[Tuple[int, Dict[str, Any]]]]:
-        """
-        Split dataset into batches with indices.
-
-        Returns:
-            List of batches, where each batch is a list of (index, entry) tuples
-        """
-        batches = []
-        for i in range(0, len(self.dataset), self.batch_size):
-            batch = [(idx, entry) for idx, entry in enumerate(self.dataset[i:i + self.batch_size], start=i)]
-            batches.append(batch)
-        return batches

     def _load_checkpoint(self) -> Dict[str, Any]:
         """
         Load checkpoint data if it exists.
@@ -676,7 +565,6 @@ class BatchRunner:
             return {
                 "run_name": self.run_name,
                 "completed_prompts": [],
-                "batch_stats": {},
                 "last_updated": None
             }
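For reference, the checkpoint this block builds is a small JSON file; with batch_stats gone it reduces to roughly the following shape (the field values here are made up, only the keys come from the code above):

{
  "run_name": "my_run",
  "completed_prompts": [0, 1, 2, 5],
  "last_updated": "2025-11-22T11:25:23"
}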
@@ -688,61 +576,34 @@ class BatchRunner:
             return {
                 "run_name": self.run_name,
                 "completed_prompts": [],
-                "batch_stats": {},
                 "last_updated": None
             }

-    def _save_checkpoint(self, checkpoint_data: Dict[str, Any], lock: Optional[Lock] = None):
+    def _save_checkpoint(self, checkpoint_data: Dict[str, Any]):
         """
         Save checkpoint data.

         Args:
             checkpoint_data (Dict): Checkpoint data to save
-            lock (Lock): Optional lock for thread-safe access
         """
         checkpoint_data["last_updated"] = datetime.now().isoformat()
+        with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
+            json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)

-        if lock:
-            with lock:
-                with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
-                    json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
-        else:
-            with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
-                json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
-
-    def _consolidate_data(self, num_batches: int, tool_stats: Dict[str, Dict[str, int]],
-                          start_time: float, tool_errors_by_tool: Dict[str, List[Dict]],
-                          exception_errors: List[Dict], early_exit: bool = False, exit_reason: str = None,
-                          profiling_stats_list: List[Dict] = None):
+    def _save_final_stats(
+        self,
+        num_processed: int,
+        tool_stats: Dict[str, Dict[str, int]],
+        start_time: float,
+        tool_errors_by_tool: Dict[str, List[Dict]],
+        exception_errors: List[Dict],
+        early_exit: bool = False,
+        exit_reason: str = None,
+        profiling_stats_list: List[Dict] = None
+    ):
         """
-        Consolidate batch data into trajectories.jsonl and save statistics.
-
-        Args:
-            num_batches (int): Number of batches processed
-            tool_stats (Dict): Aggregated tool statistics
-            start_time (float): Start time of the run
-            tool_errors_by_tool (Dict): Tool errors grouped by tool name with k most recent
-            exception_errors (List): Top-level exceptions
-            early_exit (bool): Whether this is an early exit
-            exit_reason (str): Reason for early exit
-            profiling_stats_list (List[Dict]): List of profiling statistics from each conversation
+        Save final statistics and errors.
         """
-        # Combine all batch files into a single trajectories.jsonl file
-        combined_file = self.output_dir / "trajectories.jsonl"
-        safe_print(f"\n[cyan]📦 Combining batch files into {combined_file.name}...[/cyan]")
-
-        entries_written = 0
-        with open(combined_file, 'w', encoding='utf-8') as outfile:
-            for batch_num in range(num_batches):
-                batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
-                if batch_file.exists():
-                    with open(batch_file, 'r', encoding='utf-8') as infile:
-                        for line in infile:
-                            outfile.write(line)
-                            entries_written += 1
-        safe_print(f"[green]✅ Combined {num_batches} batch files into trajectories.jsonl ({entries_written} entries)[/green]")
-
         # Calculate success rates for tool stats
         for tool_name in tool_stats:
             stats = tool_stats[tool_name]
@@ -794,17 +655,21 @@ class BatchRunner:
         # Aggregate profiling statistics if available
         aggregated_profiling_stats = None
         if profiling_stats_list:
-            from profiling import aggregate_profiling_stats
-            aggregated_profiling_stats = aggregate_profiling_stats(profiling_stats_list)
+            try:
+                from profiling import aggregate_profiling_stats, print_aggregated_statistics
+                aggregated_profiling_stats = aggregate_profiling_stats(profiling_stats_list)
+                # Display aggregated profiling statistics
+                print_aggregated_statistics(aggregated_profiling_stats, detailed=True)
+            except ImportError:
+                pass

-        # Save final statistics (without detailed errors)
+        # Save final statistics
         final_stats = {
             "run_name": self.run_name,
             "distribution": self.distribution,
             "total_prompts": len(self.dataset),
-            "total_batches": len(self.batches),
-            "batches_processed": num_batches,
-            "batch_size": self.batch_size,
+            "processed": num_processed,
             "model": self.model,
             "completed_at": datetime.now().isoformat(),
             "duration_seconds": round(time.time() - start_time, 2),
@@ -817,18 +682,9 @@ class BatchRunner:
         with open(self.stats_file, 'w', encoding='utf-8') as f:
             json.dump(final_stats, f, indent=2, ensure_ascii=False)

-        # Display aggregated profiling statistics
-        if aggregated_profiling_stats:
-            from profiling import print_aggregated_statistics
-            print_aggregated_statistics(aggregated_profiling_stats, detailed=True)
-
-    def run(self, resume: bool = False):
+    async def _run_async(self, resume: bool = False):
         """
-        Run the batch processing pipeline.
-
-        Args:
-            resume (bool): Whether to resume from checkpoint
+        Async implementation of the batch runner pipeline.
         """
         print("\n" + "=" * 70)
         print("🚀 Starting Batch Processing")
@@ -838,15 +694,32 @@ class BatchRunner:
         checkpoint_data = self._load_checkpoint() if resume else {
             "run_name": self.run_name,
             "completed_prompts": [],
-            "batch_stats": {},
             "last_updated": None
         }

         if resume and checkpoint_data.get("completed_prompts"):
             print(f"📂 Resuming from checkpoint ({len(checkpoint_data['completed_prompts'])} prompts already completed)")

-        # Prepare configuration for workers
-        config = {
+        completed_prompts_set = set(checkpoint_data.get("completed_prompts", []))
+
+        # Prepare queues
+        work_queue = asyncio.Queue()
+        result_queue = asyncio.Queue()
+
+        # Enqueue prompts to process
+        prompts_to_process = []
+        for idx, entry in enumerate(self.dataset):
+            if idx not in completed_prompts_set:
+                prompts_to_process.append((idx, entry))
+                work_queue.put_nowait((idx, entry))
+
+        total_to_process = len(prompts_to_process)
+        if total_to_process == 0:
+            print("✅ All prompts already completed.")
+            return
+
+        # Worker configuration
+        worker_config = {
             "distribution": self.distribution,
             "model": self.model,
             "max_iterations": self.max_iterations,
@@ -854,123 +727,124 @@
             "api_key": self.api_key,
             "verbose": self.verbose,
             "ephemeral_system_prompt": self.ephemeral_system_prompt,
-            "log_prefix_chars": self.log_prefix_chars
+            "log_prefix_chars": self.log_prefix_chars,
+            "prokletor_client": self.prokletor_client,
+            "prokletor_formatter": self.prokletor_formatter
         }

-        # Get completed prompts set
-        completed_prompts_set = set(checkpoint_data.get("completed_prompts", []))
+        # Start workers
+        workers = []
+        for _ in range(min(self.num_workers, total_to_process)):
+            w = asyncio.create_task(worker(work_queue, result_queue, worker_config))
+            workers.append(w)
+
+        print(f"    Processing {total_to_process} prompts with {len(workers)} workers...")

-        # Aggregate statistics across all batches
+        # Aggregate statistics
         total_tool_stats = {}
-        all_profiling_stats = []  # Collect all profiling stats for aggregation
-        tool_errors_by_tool = {}  # {tool_name: [list of k most recent errors]}
+        all_profiling_stats = []
+        tool_errors_by_tool = {}
         all_exception_errors = []
-        all_completed_prompts = list(completed_prompts_set)
-        total_processed = len(completed_prompts_set)
         total_tool_errors = 0
         early_exit = False
         exit_reason = None
+        processed_count = 0
         start_time = time.time()

-        # Process batches in parallel
-        with Pool(processes=self.num_workers) as pool:
-            # Create tasks for each batch
-            tasks = [
-                (
-                    batch_num,
-                    batch_data,
-                    str(self.output_dir),  # Convert Path to string for pickling
-                    completed_prompts_set,
-                    config
-                )
-                for batch_num, batch_data in enumerate(self.batches)
-            ]
-
-            # Process batches in parallel and check tool failure threshold as results come in
-            # imap_unordered allows parallel processing while getting results as they complete
-            batch_num = 0
-            try:
-                for result in pool.imap_unordered(_process_batch_worker, tasks):
-                    # Update statistics
-                    all_completed_prompts.extend(result.get("completed_prompts", []))
-                    total_processed += result.get("processed", 0)
-
-                    # Aggregate tool stats
-                    for tool_name, stats in result.get("tool_stats", {}).items():
-                        if tool_name not in total_tool_stats:
-                            total_tool_stats[tool_name] = {
-                                "count": 0,
-                                "success": 0,
-                                "failure": 0
-                            }
-                        total_tool_stats[tool_name]["count"] += stats["count"]
-                        total_tool_stats[tool_name]["success"] += stats["success"]
-                        total_tool_stats[tool_name]["failure"] += stats["failure"]
-
-                    # Collect profiling stats from this batch
-                    if result.get("profiling_stats"):
-                        all_profiling_stats.extend(result["profiling_stats"])
-
-                    # Aggregate tool errors (keep k most recent per tool)
-                    for tool_error in result.get("tool_errors", []):
-                        tool_name = tool_error["tool_name"]
-                        if tool_name not in tool_errors_by_tool:
-                            tool_errors_by_tool[tool_name] = []
-
-                        # Add error and keep only k most recent
-                        tool_errors_by_tool[tool_name].append(tool_error)
-                        if len(tool_errors_by_tool[tool_name]) > self.keep_recent_errors:
-                            tool_errors_by_tool[tool_name] = tool_errors_by_tool[tool_name][-self.keep_recent_errors:]
-                        total_tool_errors += 1
-
-                    # Track exception errors
-                    all_exception_errors.extend(result.get("exception_errors", []))
-
-                    # Check tool failure thresholds
-                    # Calculate total tool calls (not prompts)
-                    total_tool_calls = sum(stats["count"] for stats in total_tool_stats.values())
-
-                    # Check absolute count threshold
-                    if total_tool_errors >= self.max_tool_failures:
-                        early_exit = True
-                        exit_reason = f"Exceeded maximum tool failures ({total_tool_errors}/{self.max_tool_failures})"
-                        safe_print(f"\n[bold red]🛑 STOPPING: {exit_reason}[/bold red]")
-                        pool.terminate()  # Stop all workers immediately
-                        break
-
-                    # Check rate threshold (only if we have enough tool calls to trust the rate)
-                    if total_tool_calls >= self.min_tool_calls_for_rate:
-                        tool_failure_rate = total_tool_errors / total_tool_calls
-                        if tool_failure_rate >= self.max_tool_failure_rate:
-                            early_exit = True
-                            exit_reason = f"Exceeded tool failure rate ({tool_failure_rate:.2%} >= {self.max_tool_failure_rate:.2%}, {total_tool_errors}/{total_tool_calls} tool calls)"
-                            safe_print(f"\n[bold red]🛑 STOPPING: {exit_reason}[/bold red]")
-                            pool.terminate()  # Stop all workers immediately
-                            break
-
-                    # Update checkpoint after each batch completes
-                    checkpoint_data["completed_prompts"] = all_completed_prompts
-                    self._save_checkpoint(checkpoint_data)
-                    batch_num += 1
-            except KeyboardInterrupt:
-                safe_print("\n[bold yellow]⚠️ Interrupted by user, stopping workers...[/bold yellow]")
-                pool.terminate()
-                early_exit = True
-                exit_reason = "Interrupted by user"
-
-        # Save final checkpoint
-        checkpoint_data["completed_prompts"] = all_completed_prompts
-        self._save_checkpoint(checkpoint_data)
-
-        # Consolidate data and save statistics
-        num_batches_processed = batch_num + 1 if early_exit else len(self.batches)
-        self._consolidate_data(
-            num_batches_processed,
+        # Process results as they arrive
+        try:
+            while processed_count < total_to_process:
+                result = await result_queue.get()
+                processed_count += 1
+
+                prompt_index = result["prompt_index"]
+
+                # Track exceptions
+                if not result["success"]:
+                    safe_print(f"[bold red]❌ Exception in prompt {prompt_index}:[/bold red] {result.get('error', '')[:100]}")
+                    all_exception_errors.append({
+                        "prompt_index": prompt_index,
+                        "error": result.get("error", "Unknown error"),
+                        "traceback": result.get("traceback", "")
+                    })
+                else:
+                    print(f"    ✅ Prompt {prompt_index} completed")
+
+                # Save trajectory immediately
+                if result.get("trajectory"):
+                    trajectory_entry = {
+                        "prompt_index": prompt_index,
+                        "conversations": result["trajectory"],
+                        "metadata": result["metadata"],
+                        "completed": result["completed"],
+                        "api_calls": result["api_calls"],
+                        "toolsets_used": result["toolsets_used"]
+                    }
+                    with open(self.trajectories_file, 'a', encoding='utf-8') as f:
+                        f.write(json.dumps(trajectory_entry, ensure_ascii=False) + "\n")
+
+                # Aggregate tool stats
+                for tool_name, stats in result.get("tool_stats", {}).items():
+                    if tool_name not in total_tool_stats:
+                        total_tool_stats[tool_name] = {"count": 0, "success": 0, "failure": 0}
+                    total_tool_stats[tool_name]["count"] += stats["count"]
+                    total_tool_stats[tool_name]["success"] += stats["success"]
+                    total_tool_stats[tool_name]["failure"] += stats["failure"]
+
+                # Collect profiling stats
+                if result.get("profiling_stats"):
+                    all_profiling_stats.append(result["profiling_stats"])
+
+                # Aggregate tool errors
+                for tool_error in result.get("tool_errors", []):
+                    tool_name = tool_error["tool_name"]
+                    if tool_name not in tool_errors_by_tool:
+                        tool_errors_by_tool[tool_name] = []
+                    tool_errors_by_tool[tool_name].append(tool_error)
+                    # Keep only k most recent
+                    if len(tool_errors_by_tool[tool_name]) > self.keep_recent_errors:
+                        tool_errors_by_tool[tool_name] = tool_errors_by_tool[tool_name][-self.keep_recent_errors:]
+                    total_tool_errors += 1
+
+                # Update checkpoint
+                completed_prompts_set.add(prompt_index)
+                checkpoint_data["completed_prompts"] = list(completed_prompts_set)
+                self._save_checkpoint(checkpoint_data)
+
+                # Check failure thresholds
+                total_tool_calls = sum(stats["count"] for stats in total_tool_stats.values())
+                if total_tool_errors >= self.max_tool_failures:
+                    early_exit = True
+                    exit_reason = f"Exceeded maximum tool failures ({total_tool_errors}/{self.max_tool_failures})"
+                    break
+                if total_tool_calls >= self.min_tool_calls_for_rate:
+                    tool_failure_rate = total_tool_errors / total_tool_calls
+                    if tool_failure_rate >= self.max_tool_failure_rate:
+                        early_exit = True
+                        exit_reason = f"Exceeded tool failure rate ({tool_failure_rate:.2%})"
+                        break
+        except asyncio.CancelledError:
+            early_exit = True
+            exit_reason = "Run cancelled"
+        finally:
+            # Stop all workers
+            for _ in range(len(workers)):
+                work_queue.put_nowait(None)
+            await asyncio.gather(*workers, return_exceptions=True)
+
+        if early_exit:
+            safe_print(f"\n[bold red]🛑 STOPPING: {exit_reason}[/bold red]")
+
+        # Save final statistics
+        self._save_final_stats(
+            processed_count,
             total_tool_stats,
             start_time,
             tool_errors_by_tool,
@@ -980,164 +854,28 @@ class BatchRunner:
             all_profiling_stats
         )

-        # Print summary
+        # Summary output
         safe_print("\n" + "=" * 70)
-        if early_exit:
-            safe_print("[bold yellow]⚠️ BATCH PROCESSING STOPPED EARLY[/bold yellow]")
-            safe_print(f"[yellow]Reason: {exit_reason}[/yellow]")
-        else:
-            safe_print("[bold green]📊 BATCH PROCESSING COMPLETE[/bold green]")
-        safe_print("=" * 70)
-        safe_print(f"✅ Total prompts processed: {total_processed}")
-        safe_print(f"✅ Batches completed: {num_batches_processed}/{len(self.batches)}")
+        safe_print(f"✅ Total prompts processed: {processed_count}/{total_to_process}")
         safe_print(f"⏱️ Total duration: {round(time.time() - start_time, 2)}s")

-        # Tool error summary
         if tool_errors_by_tool:
-            total_errors = sum(len(errors) for errors in tool_errors_by_tool.values())
-            safe_print(f"\n[bold red]🚨 Tool Errors: {total_tool_errors} total ({len(tool_errors_by_tool)} tools)[/bold red]")
-            safe_print("[red]-[/red]" * 70)
-
-            # Sort tools by error count
-            sorted_tools = sorted(
-                tool_errors_by_tool.items(),
-                key=lambda x: len(x[1]),
-                reverse=True
-            )
-
-            for tool_name, errors in sorted_tools:
-                # Count unique error messages
-                unique_errors = {}
-                for error in errors:
-                    error_msg = error["error_message"][:100]  # Truncate for grouping
-                    if error_msg not in unique_errors:
-                        unique_errors[error_msg] = []
-                    unique_errors[error_msg].append(error)
-
-                safe_print(f"\n  [red]{tool_name}:[/red] {len(errors)} errors ({len(unique_errors)} unique)")
-
-                # Show up to 3 most recent unique error types
-                for idx, (error_msg, instances) in enumerate(list(unique_errors.items())[:3]):
-                    error_preview = error_msg if len(error_msg) <= 100 else error_msg[:97] + "..."
-                    safe_print(f"    [{idx+1}] [dim]{error_preview}[/dim] (x{len(instances)})")
-
-                    # Show one example with prompt index and full content prefix
-                    example = instances[-1]  # Most recent
-                    safe_print(f"        [dim]Prompt {example['prompt_index']}[/dim]")
-
-                    # Show full content prefix (first 200 chars)
-                    full_content = example.get('full_content', '')
-                    if full_content and full_content != error_preview:
-                        content_preview = full_content[:200]
-                        if len(full_content) > 200:
-                            content_preview += "..."
-                        # Show with prefix indicator
-                        safe_print(f"        [dim]Content: {content_preview}[/dim]")
-
-                if len(unique_errors) > 3:
-                    safe_print(f"    [dim]... and {len(unique_errors) - 3} more error types[/dim]")
-
-            tool_failure_rate = total_tool_errors / total_processed if total_processed > 0 else 0
-            safe_print(f"\n  [red]Tool failure rate: {tool_failure_rate:.2%}[/red]")
-
-        # Exception errors
-        if all_exception_errors:
-            safe_print(f"\n[bold red]💥 Top-level Exceptions: {len(all_exception_errors)}[/bold red]")
-            safe_print("[red]-[/red]" * 70)
-            for error in all_exception_errors[:self.keep_recent_errors]:
-                error_msg = error["error"]
-                error_preview = error_msg[:150]
-                if len(error_msg) > 150:
-                    error_preview += "..."
-                safe_print(f"  [red]Prompt {error['prompt_index']}:[/red] [dim]{error_preview}[/dim]")
-
-                # Show traceback prefix if available
-                traceback_text = error.get("traceback", "")
-                if traceback_text:
-                    # Show last 3 lines of traceback for context
-                    tb_lines = traceback_text.strip().split('\n')
-                    relevant_lines = tb_lines[-3:] if len(tb_lines) > 3 else tb_lines
-                    for line in relevant_lines:
-                        safe_print(f"    [dim]{line}[/dim]")
-
-        safe_print(f"\n[cyan]📈 Tool Usage Statistics:[/cyan]")
-        safe_print("-" * 70)
-        if total_tool_stats:
-            # Sort by count descending
-            sorted_tools = sorted(
-                total_tool_stats.items(),
-                key=lambda x: x[1]["count"],
-                reverse=True
-            )
-            safe_print(f"{'Tool Name':<25} {'Count':<10} {'Success':<10} {'Failure':<10} {'Success Rate':<12}")
-            safe_print("-" * 70)
-            for tool_name, stats in sorted_tools:
-                safe_print(
-                    f"{tool_name:<25} "
-                    f"{stats['count']:<10} "
-                    f"{stats['success']:<10} "
-                    f"{stats['failure']:<10} "
-                    f"{stats.get('success_rate', 0):.1f}%"
-                )
-        else:
-            safe_print("No tool calls were made during this run.")
-
-        # Display failure type breakdown for tools with failures
-        if tool_errors_by_tool:
-            safe_print(f"\n[cyan]📊 Failure Type Breakdown:[/cyan]")
-            safe_print("-" * 70)
-
-            # Sort tools by total error count
-            sorted_tools = sorted(
-                tool_errors_by_tool.items(),
-                key=lambda x: len(x[1]),
-                reverse=True
-            )
-
-            for tool_name, errors in sorted_tools:
-                # Count failure types for this tool
-                failure_types = {}
-                for error in errors:
-                    error_type = error.get("error_type", "Other")
-                    if error_type not in failure_types:
-                        failure_types[error_type] = 0
-                    failure_types[error_type] += 1
-
-                # Display tool name and total failures
-                total_failures = len(errors)
-                safe_print(f"\n[yellow]{tool_name}[/yellow] ({total_failures} failures):")
-
-                # Sort failure types by count
-                sorted_types = sorted(
-                    failure_types.items(),
-                    key=lambda x: x[1],
-                    reverse=True
-                )
-
-                # Display each failure type with count and percentage
-                for failure_type, count in sorted_types:
-                    percentage = (count / total_failures) * 100
-                    safe_print(f"{failure_type:<20} {count:>4} ({percentage:>5.1f}%)")
+            safe_print(f"\n[bold red]🚨 Tool Errors: {total_tool_errors} total[/bold red]")
+            # Simplified error printing here, full detail is in json
+            for tool_name, errors in tool_errors_by_tool.items():
+                safe_print(f"    {tool_name}: {len(errors)} errors")

         safe_print(f"\n[cyan]💾 Results saved to:[/cyan] {self.output_dir}")
-        safe_print(f"    - Trajectories: trajectories.jsonl (combined)")
-        safe_print(f"    - Individual batches: batch_*.jsonl (for debugging)")
-        safe_print(f"    - Statistics: {self.stats_file.name}")
-        safe_print(f"    - Errors: {self.errors_file.name}")
-        safe_print(f"    - Checkpoint: {self.checkpoint_file.name}")
-
-        if early_exit:
-            safe_print(f"\n[bold yellow]    Run was stopped early due to tool failures.[/bold yellow]")
-            safe_print(f"[yellow]    Check {self.errors_file.name} for detailed error information including tracebacks.[/yellow]")
-            safe_print(f"[yellow]    You can resume this run later with --resume flag.[/yellow]")
+
+    def run(self, resume: bool = False):
+        """
+        Run the batch processing pipeline (sync wrapper).
+        """
+        asyncio.run(self._run_async(resume))

 def main(
     dataset_file: str = None,
-    batch_size: int = None,
     run_name: str = None,
     distribution: str = "default",
     model: str = "claude-opus-4-20250514",
@@ -1150,17 +888,18 @@ def main(
     list_distributions: bool = False,
     ephemeral_system_prompt: str = None,
     log_prefix_chars: int = 100,
-    max_tool_failures: int = 10,
+    max_tool_failures: float = float("inf"),
     max_tool_failure_rate: float = 0.5,
     keep_recent_errors: int = 5,
     min_tool_calls_for_rate: int = 10,
+    prokletor_client: str = None,
+    prokletor_formatter: str = None,
 ):
     """
     Run batch processing of agent prompts from a dataset.

     Args:
         dataset_file (str): Path to JSONL file with 'prompt' field in each entry
-        batch_size (int): Number of prompts per batch
         run_name (str): Name for this run (used for output and checkpointing)
         distribution (str): Toolset distribution to use (default: "default")
         model (str): Model name to use (default: "claude-opus-4-20250514")
@@ -1173,31 +912,22 @@ def main(
         list_distributions (bool): List available toolset distributions and exit
         ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
         log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 20)
-        max_tool_failures (int): Maximum number of tool failures before stopping (default: 10)
+        max_tool_failures (float): Maximum number of tool failures before stopping (default: inf for unlimited)
         max_tool_failure_rate (float): Maximum tool failure rate (0.0-1.0) before stopping (default: 0.5)
         keep_recent_errors (int): Number of recent errors to keep per tool for reporting (default: 5)
         min_tool_calls_for_rate (int): Minimum number of tool calls before checking failure rate (default: 10)
+        prokletor_client (str): Name of the prokletor client to use
+        prokletor_formatter (str): Name of the prokletor formatter to use

     Examples:
         # Basic usage
-        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run
+        python batch_runner.py --dataset_file=data.jsonl --run_name=my_run

         # Resume interrupted run
-        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run --resume
+        python batch_runner.py --dataset_file=data.jsonl --run_name=my_run --resume

         # Use specific distribution
-        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=image_test --distribution=image_gen
+        python batch_runner.py --dataset_file=data.jsonl --run_name=image_test --distribution=image_gen
-
-        # With ephemeral system prompt (not saved to dataset)
-        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
-            --ephemeral_system_prompt="You are a helpful assistant focused on image generation."
-
-        # With custom tool failure thresholds
-        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
-            --max_tool_failures=20 --max_tool_failure_rate=0.3 --min_tool_calls_for_rate=10 --keep_recent_errors=10
-
-        # List available distributions
-        python batch_runner.py --list_distributions
     """
     # Handle list distributions
     if list_distributions:
@@ -1209,10 +939,6 @@ def main(
         all_dists = get_all_dists()
         for dist_name in sorted(all_dists.keys()):
             print_distribution_info(dist_name)
-        print("\n💡 Usage:")
-        print("  python batch_runner.py --dataset_file=data.jsonl --batch_size=10 \\")
-        print("      --run_name=my_run --distribution=<name>")
         return

     # Validate required arguments
@@ -1220,10 +946,6 @@ def main(
print("❌ Error: --dataset_file is required") print("❌ Error: --dataset_file is required")
return return
if not batch_size or batch_size < 1:
print("❌ Error: --batch_size must be a positive integer")
return
if not run_name: if not run_name:
print("❌ Error: --run_name is required") print("❌ Error: --run_name is required")
return return
@@ -1232,7 +954,6 @@ def main(
     try:
         runner = BatchRunner(
             dataset_file=dataset_file,
-            batch_size=batch_size,
             run_name=run_name,
             distribution=distribution,
             max_iterations=max_turns,
@@ -1246,7 +967,9 @@ def main(
             max_tool_failures=max_tool_failures,
             max_tool_failure_rate=max_tool_failure_rate,
             keep_recent_errors=keep_recent_errors,
-            min_tool_calls_for_rate=min_tool_calls_for_rate
+            min_tool_calls_for_rate=min_tool_calls_for_rate,
+            prokletor_client=prokletor_client,
+            prokletor_formatter=prokletor_formatter
         )
         runner.run(resume=resume)
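Each line the runner appends to trajectories.jsonl carries the keys built in the trajectory_entry dict above, with conversations holding the from/value pairs mentioned in the module docstring. A sketch of one such record, with made-up values, purely for orientation:

import json

# Hypothetical shape of one trajectories.jsonl line; keys mirror trajectory_entry,
# values are illustrative only
entry = {
    "prompt_index": 42,
    "conversations": [
        {"from": "system", "value": "...tool definitions..."},
        {"from": "human", "value": "...the prompt..."},
        {"from": "gpt", "value": "...the model's reply..."},
    ],
    "metadata": {"timestamp": "2025-11-23T10:24:58", "model": "gemini-3-pro-preview"},
    "completed": True,
    "api_calls": 7,
    "toolsets_used": ["science"],
}
print(json.dumps(entry, ensure_ascii=False))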

gemini_nothinking.sh  (new file, +12 lines)
View File

@@ -0,0 +1,12 @@
python batch_runner.py \
--dataset_file="source-data/agent_tasks_eval.jsonl" \
--batch_size=1 \
--run_name="agenttasks_eval_gemini-4.5-3-nothinking" \
--distribution="science" \
--model="gemini-3-pro-preview" \
--base_url="https://generativelanguage.googleapis.com/v1beta/openai/" \
--api_key="${GEMINI_API_KEY}" \
--num_workers=10 \
--max_turns=60 \
--verbose \
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. If you require API keys please check which ones already exist in your environment variables in a way that does not read them."

View File

@@ -23,7 +23,7 @@ Usage:
     web_tools = get_tool_definitions(enabled_toolsets=['web_tools'])

     # Handle function calls from model
-    result = handle_function_call("web_search", {"query": "Python"})
+    result = await handle_function_call("web_search", {"query": "Python"})
 """

 import json
@@ -439,7 +439,7 @@ def get_tool_definitions(
     return filtered_tools

-def handle_web_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
+async def handle_web_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
     """
     Handle function calls for web tools.
@@ -454,25 +454,25 @@ def handle_web_function_call(function_name: str, function_args: Dict[str, Any])
query = function_args.get("query", "") query = function_args.get("query", "")
# Always use fixed limit of 5 # Always use fixed limit of 5
limit = 5 limit = 5
return web_search_tool(query, limit) return await web_search_tool(query, limit)
elif function_name == "web_extract": elif function_name == "web_extract":
urls = function_args.get("urls", []) urls = function_args.get("urls", [])
# Limit URLs to prevent abuse # Limit URLs to prevent abuse
urls = urls[:5] if isinstance(urls, list) else [] urls = urls[:5] if isinstance(urls, list) else []
# Run async function in event loop # Run async function
return asyncio.run(web_extract_tool(urls, "markdown")) return await web_extract_tool(urls, "markdown")
elif function_name == "web_crawl": elif function_name == "web_crawl":
url = function_args.get("url", "") url = function_args.get("url", "")
instructions = function_args.get("instructions") instructions = function_args.get("instructions")
# Run async function in event loop # Run async function
return asyncio.run(web_crawl_tool(url, instructions, "basic")) return await web_crawl_tool(url, instructions, "basic")
else: else:
return json.dumps({"error": f"Unknown web function: {function_name}"}, ensure_ascii=False) return json.dumps({"error": f"Unknown web function: {function_name}"}, ensure_ascii=False)
def handle_terminal_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str: async def handle_terminal_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str:
""" """
Handle function calls for terminal tools. Handle function calls for terminal tools.
@@ -489,13 +489,20 @@ def handle_terminal_function_call(function_name: str, function_args: Dict[str, A
         background = function_args.get("background", False)
         timeout = function_args.get("timeout")

-        return simple_terminal_tool(command=command, background=background, timeout=timeout, task_id=task_id)
+        # Run sync terminal tool in a thread to avoid blocking
+        return await asyncio.to_thread(
+            simple_terminal_tool,
+            command=command,
+            background=background,
+            timeout=timeout,
+            task_id=task_id
+        )

     else:
         return json.dumps({"error": f"Unknown terminal function: {function_name}"}, ensure_ascii=False)

-def handle_vision_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
+async def handle_vision_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
     """
     Handle function calls for vision tools.
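The asyncio.to_thread call above is the standard way to keep a synchronous, blocking helper off the event loop while other coroutines keep running. A minimal illustration of the idea, with placeholder names rather than the project's tools:

import asyncio, time

def blocking_call(seconds: float) -> str:
    # Stand-in for a synchronous, blocking tool such as a terminal command
    time.sleep(seconds)
    return f"done after {seconds}s"

async def main():
    # to_thread runs the sync function in a worker thread so the loop stays responsive
    results = await asyncio.gather(
        asyncio.to_thread(blocking_call, 1.0),
        asyncio.to_thread(blocking_call, 1.0),
    )
    print(results)  # both calls finish in roughly 1s instead of 2s

asyncio.run(main())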
@@ -512,14 +519,14 @@ def handle_vision_function_call(function_name: str, function_args: Dict[str, Any
full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}" full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
# Run async function in event loop # Run async function
return asyncio.run(vision_analyze_tool(image_url, full_prompt, "gemini-2.5-flash")) return await vision_analyze_tool(image_url, full_prompt, "gemini-2.5-flash")
else: else:
return json.dumps({"error": f"Unknown vision function: {function_name}"}, ensure_ascii=False) return json.dumps({"error": f"Unknown vision function: {function_name}"}, ensure_ascii=False)
def handle_moa_function_call(function_name: str, function_args: Dict[str, Any]) -> str: async def handle_moa_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
""" """
Handle function calls for Mixture-of-Agents tools. Handle function calls for Mixture-of-Agents tools.
@@ -536,14 +543,14 @@ def handle_moa_function_call(function_name: str, function_args: Dict[str, Any])
         if not user_prompt:
             return json.dumps({"error": "user_prompt is required for MoA processing"}, ensure_ascii=False)

-        # Run async function in event loop
-        return asyncio.run(mixture_of_agents_tool(user_prompt=user_prompt))
+        # Run async function
+        return await mixture_of_agents_tool(user_prompt=user_prompt)

     else:
         return json.dumps({"error": f"Unknown MoA function: {function_name}"}, ensure_ascii=False)

-def handle_image_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
+async def handle_image_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
     """
     Handle function calls for image generation tools.
@@ -572,21 +579,8 @@ def handle_image_function_call(function_name: str, function_args: Dict[str, Any]
             allow_nsfw_images = True
             seed = None

-        # Run async function in event loop with proper handling for multiprocessing
-        try:
-            # Try to get existing event loop
-            loop = asyncio.get_event_loop()
-            if loop.is_closed():
-                # If closed, create a new one
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-        except RuntimeError:
-            # No event loop in current thread, create one
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-
-        # Run the coroutine in the event loop
-        result = loop.run_until_complete(image_generate_tool(
+        # Run async function
+        return await image_generate_tool(
             prompt=prompt,
             image_size=image_size,
             num_inference_steps=num_inference_steps,
@@ -597,15 +591,13 @@ def handle_image_function_call(function_name: str, function_args: Dict[str, Any]
             acceleration=acceleration,
             allow_nsfw_images=allow_nsfw_images,
             seed=seed
-        ))
-        return result
+        )

     else:
         return json.dumps({"error": f"Unknown image generation function: {function_name}"}, ensure_ascii=False)

-def handle_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str:
+async def handle_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str:
     """
     Main function call dispatcher that routes calls to appropriate toolsets.
@@ -627,23 +619,23 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task
     try:
         # Route web tools
         if function_name in ["web_search", "web_extract", "web_crawl"]:
-            return handle_web_function_call(function_name, function_args)
+            return await handle_web_function_call(function_name, function_args)

         # Route terminal tools
         elif function_name in ["terminal"]:
-            return handle_terminal_function_call(function_name, function_args, task_id)
+            return await handle_terminal_function_call(function_name, function_args, task_id)

         # Route vision tools
         elif function_name in ["vision_analyze"]:
-            return handle_vision_function_call(function_name, function_args)
+            return await handle_vision_function_call(function_name, function_args)

         # Route MoA tools
         elif function_name in ["mixture_of_agents"]:
-            return handle_moa_function_call(function_name, function_args)
+            return await handle_moa_function_call(function_name, function_args)

         # Route image generation tools
         elif function_name in ["image_generate"]:
-            return handle_image_function_call(function_name, function_args)
+            return await handle_image_function_call(function_name, function_args)

         else:
             error_msg = f"Unknown function: {function_name}"

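Because every handler above is now a coroutine, handle_function_call itself has to be awaited from async code. A minimal usage sketch; the importing module name is an assumption, not something the diff confirms:

import asyncio
from tools import handle_function_call  # module name assumed for illustration

async def demo():
    # The dispatcher must be awaited now that all handlers are async
    result = await handle_function_call("web_search", {"query": "Python"})
    print(result)  # handlers return JSON strings

asyncio.run(demo())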
View File

@@ -24,11 +24,23 @@ import json
 import logging
 import os
 import time
+import asyncio
+import sys
 from typing import List, Dict, Any, Optional
-from openai import OpenAI
+from openai import AsyncOpenAI
 import fire
 from datetime import datetime
 from pathlib import Path
+from rich import print
+
+from prokletor.formatters.hermes import HermesToolFormatterWithReasoning
+from prokletor.formatters.hermes import HermesToolFormatterWithReasoning
+from prokletor.clients.hermes import HermesToolClientWithReasoning, HermesToolClient
+from prokletor.clients.claude import AsyncClaudeClient
+try:
+    from anthropic import AsyncAnthropic
+except ImportError:
+    AsyncAnthropic = None

 # Load environment variables from .env file
 from dotenv import load_dotenv
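The import change from OpenAI to AsyncOpenAI means chat completions are awaited rather than called synchronously. A minimal sketch of that call pattern; the model name and prompt are placeholders, not values from this repository:

import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    resp = await client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Say hello"}],
    )
    print(resp.choices[0].message.content)

asyncio.run(main())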
@@ -70,6 +82,8 @@ class AIAgent:
         verbose_logging: bool = False,
         ephemeral_system_prompt: str = None,
         log_prefix_chars: int = 100,
+        prokletor_client: str = None,
+        prokletor_formatter: str = None,
     ):
         """
         Initialize the AI Agent.
@@ -86,6 +100,8 @@ class AIAgent:
             verbose_logging (bool): Enable verbose logging for debugging (default: False)
             ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
             log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 20)
+            prokletor_client (str): Name of the prokletor client to use (e.g., "AsyncClaudeClient", "HermesToolClient")
+            prokletor_formatter (str): Name of the prokletor formatter to use (optional)
         """
         self.model = model
         self.max_iterations = max_iterations
@@ -94,6 +110,8 @@ class AIAgent:
         self.verbose_logging = verbose_logging
         self.ephemeral_system_prompt = ephemeral_system_prompt
         self.log_prefix_chars = log_prefix_chars
+        self.prokletor_client_name = prokletor_client
+        self.prokletor_formatter_name = prokletor_formatter

         # Store toolset filtering options
         self.enabled_toolsets = enabled_toolsets
@@ -122,7 +140,7 @@ class AIAgent:
         logging.getLogger('openai').setLevel(logging.WARNING)
         logging.getLogger('httpx').setLevel(logging.WARNING)

-        # Initialize OpenAI client
+        # Initialize Client
         client_kwargs = {}
         if base_url:
             client_kwargs["base_url"] = base_url
@@ -132,12 +150,45 @@ class AIAgent:
client_kwargs["api_key"] = os.getenv("ANTHROPIC_API_KEY", "dummy-key") client_kwargs["api_key"] = os.getenv("ANTHROPIC_API_KEY", "dummy-key")
try: try:
self.client = OpenAI(**client_kwargs) if prokletor_client == "AsyncClaudeClient":
if AsyncAnthropic is None:
raise ImportError("anthropic package is required for AsyncClaudeClient")
# AsyncAnthropic kwargs
anthropic_kwargs = {k: v for k, v in client_kwargs.items() if k in ["api_key", "base_url", "timeout", "max_retries", "default_headers"]}
anthropic_client = AsyncAnthropic(**anthropic_kwargs)
self.client = AsyncClaudeClient(anthropic_client)
print(f"🧠 Wrapped Anthropic client with AsyncClaudeClient")
elif prokletor_client == "HermesToolClient":
oai_client = AsyncOpenAI(**client_kwargs)
self.client = HermesToolClient(oai_client)
print(f"🧠 Wrapped OpenAI client with HermesToolClient")
elif prokletor_client == "HermesToolClientWithReasoning":
oai_client = AsyncOpenAI(**client_kwargs)
self.client = HermesToolClientWithReasoning(oai_client)
print(f"🧠 Wrapped OpenAI client with HermesToolClientWithReasoning")
elif prokletor_client:
# Fallback for unrecognized client names: warn and default to HermesToolClientWithReasoning
print(f"⚠️ Unknown prokletor_client '{prokletor_client}'. Defaulting to HermesToolClientWithReasoning.")
oai_client = AsyncOpenAI(**client_kwargs)
self.client = HermesToolClientWithReasoning(oai_client)
else:
# Default behavior
oai_client = AsyncOpenAI(**client_kwargs)
self.client = oai_client
print(f"🧠 Using raw OpenAI client (no prokletor wrapper)")
print(f"🤖 AI Agent initialized with model: {self.model}") print(f"🤖 AI Agent initialized with model: {self.model}")
if base_url: if base_url:
print(f"🔗 Using custom base URL: {base_url}") print(f"🔗 Using custom base URL: {base_url}")
except Exception as e: except Exception as e:
raise RuntimeError(f"Failed to initialize OpenAI client: {e}") raise RuntimeError(f"Failed to initialize client: {e}")
# Get available tools with filtering # Get available tools with filtering
self.tools = get_tool_definitions( self.tools = get_tool_definitions(
@@ -210,22 +261,54 @@ class AIAgent:
Returns: Returns:
List[Dict]: Messages in trajectory format List[Dict]: Messages in trajectory format
""" """
# Use the client wrapper's format method if available to get the exact Hermes format
# This ensures batch runner also gets the correct formatting
if hasattr(self, 'client') and hasattr(self.client, 'format'):
formatted_messages = self.client.format(messages, self.tools, render_final=True)
trajectory = []
for msg in formatted_messages:
role = msg["role"]
content = msg["content"]
# Map roles to trajectory format (human, gpt, system, tool)
if role == "user":
trajectory_role = "human"
elif role == "assistant":
trajectory_role = "gpt"
elif role == "system":
trajectory_role = "system"
elif role == "tool":
trajectory_role = "tool"
else:
trajectory_role = role
trajectory.append({
"from": trajectory_role,
"value": content
})
return trajectory
trajectory = [] trajectory = []
# Add system message with tool definitions # Add system message with tool definitions
system_msg = ( # Use the client's formatter if available to ensure consistency (e.g. reasoning prompt)
"You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. " if hasattr(self, 'client') and hasattr(self.client, 'formatter'):
"You may call one or more functions to assist with the user query. If available tools are not relevant in assisting " system_msg = self.client.formatter.format_system_message(self.tools if self.tools else [])
"with user query, just respond in natural conversational language. Don't make assumptions about what values to plug " else:
"into functions. After calling & executing the functions, you will be provided with function results within " system_msg = (
"<tool_response> </tool_response> XML tags. Here are the available tools:\n" "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n" "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
"For each function call return a JSON object, with the following pydantic model json schema for each:\n" "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
"{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, " "into functions. After calling & executing the functions, you will be provided with function results within "
"'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n" "<tool_response> </tool_response> XML tags. Here are the available tools:\n"
"Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n" f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
"Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>" "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
) "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
"'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
"Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
"Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
)
trajectory.append({ trajectory.append({
"from": "system", "from": "system",
@@ -348,7 +431,7 @@ class AIAgent:
except Exception as e: except Exception as e:
print(f"⚠️ Failed to save trajectory: {e}") print(f"⚠️ Failed to save trajectory: {e}")
def run_conversation( async def run_conversation(
self, self,
user_message: str, user_message: str,
system_message: str = None, system_message: str = None,
@@ -401,10 +484,14 @@ class AIAgent:
if self.verbose_logging: if self.verbose_logging:
logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}") logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}")
logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}") logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}")
# Log the last message to check whether thought_signature is present
logging.debug(f"Last message content: {json.dumps(messages[-1] if messages else {}, indent=2)}")
api_start_time = time.time() api_start_time = time.time()
retry_count = 0 retry_count = 0
max_retries = 6 # Increased to allow longer backoff periods max_retries = 6 # Increased to allow longer backoff periods
response = None
last_api_error = None
while retry_count <= max_retries: while retry_count <= max_retries:
try: try:
@@ -416,12 +503,23 @@ class AIAgent:
api_messages = [{"role": "system", "content": active_system_prompt}] + api_messages api_messages = [{"role": "system", "content": active_system_prompt}] + api_messages
# Make API call with tools # Make API call with tools
response = self.client.chat.completions.create( api_kwargs = {
model=self.model, "model": self.model,
messages=api_messages, "messages": api_messages,
tools=self.tools if self.tools else None, "tools": self.tools if self.tools else None,
timeout=300.0 # 5 minute timeout for long-running agent tasks "timeout": 300.0, # 5 minute timeout for long-running agent tasks
) }
# Enable thinking by default for AsyncClaudeClient if using a supported model
if self.prokletor_client_name == "AsyncClaudeClient" and self.model.startswith("claude"):
api_kwargs["thinking"] = {
"type": "enabled",
"budget_tokens": 8000
}
# Ensure max_tokens is set higher than budget_tokens
api_kwargs["max_tokens"] = 16000
response = await self.client.chat.completions.create(**api_kwargs)
api_duration = time.time() - api_start_time api_duration = time.time() - api_start_time
print(f"⏱️ OpenAI-compatible API call completed in {api_duration:.2f}s") print(f"⏱️ OpenAI-compatible API call completed in {api_duration:.2f}s")
@@ -435,6 +533,15 @@ class AIAgent:
break # Success, exit retry loop break # Success, exit retry loop
except Exception as api_error: except Exception as api_error:
last_api_error = api_error
error_message = str(api_error)
token_limit_error = "input token count exceeds the maximum number of tokens" in error_message.lower()
if token_limit_error:
print("❌ OpenAI-compatible API call failed: input token limit exceeded. Not retrying this request.")
logging.error("Non-retryable token limit error from API: %s", api_error)
break
retry_count += 1 retry_count += 1
if retry_count > max_retries: if retry_count > max_retries:
raise api_error raise api_error
@@ -443,7 +550,10 @@ class AIAgent:
print(f"⚠️ OpenAI-compatible API call failed (attempt {retry_count}/{max_retries}): {str(api_error)[:100]}") print(f"⚠️ OpenAI-compatible API call failed (attempt {retry_count}/{max_retries}): {str(api_error)[:100]}")
print(f"⏳ Retrying in {wait_time}s...") print(f"⏳ Retrying in {wait_time}s...")
logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}") logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}")
time.sleep(wait_time) await asyncio.sleep(wait_time)
if response is None:
raise last_api_error if last_api_error else RuntimeError("OpenAI-compatible API call failed without a response")
try: try:
assistant_message = response.choices[0].message assistant_message = response.choices[0].message
@@ -459,25 +569,62 @@ class AIAgent:
if self.verbose_logging: if self.verbose_logging:
for tc in assistant_message.tool_calls: for tc in assistant_message.tool_calls:
logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
# Debug: Check what attributes are available on tool_call
logging.debug(f"Tool call attributes: {dir(tc)}")
# Try to dump the model to see all fields
if hasattr(tc, 'model_dump'):
logging.debug(f"Tool call data: {tc.model_dump()}")
# Add assistant message with tool calls to conversation # Add assistant message with tool calls to conversation
# Extract thought_signature if present (required for Gemini models)
tool_calls_data = []
for tool_call in assistant_message.tool_calls:
tool_call_dict = {
"id": tool_call.id,
"type": tool_call.type,
"function": {
"name": tool_call.function.name,
"arguments": tool_call.function.arguments
}
}
# Try multiple ways to access thought_signature (Gemini-specific)
# Gemini uses extra_content.google.thought_signature structure
thought_sig = None
# Method 1: Check extra_content attribute
if hasattr(tool_call, 'extra_content'):
extra = tool_call.extra_content
if isinstance(extra, dict) and 'google' in extra:
thought_sig = extra['google'].get('thought_signature')
# Method 2: Check model_dump() if available (Pydantic v2)
if thought_sig is None and hasattr(tool_call, 'model_dump'):
dumped = tool_call.model_dump()
if 'extra_content' in dumped and isinstance(dumped['extra_content'], dict):
google_data = dumped['extra_content'].get('google', {})
thought_sig = google_data.get('thought_signature')
if thought_sig is not None:
tool_call_dict["extra_content"] = {
"google": {
"thought_signature": thought_sig
}
}
if self.verbose_logging:
logging.debug(f"Captured thought_signature for tool call {tool_call.id}")
elif self.verbose_logging:
logging.debug(f"No thought_signature found for tool call {tool_call.id}")
tool_calls_data.append(tool_call_dict)
messages.append({ messages.append({
"role": "assistant", "role": "assistant",
"content": assistant_message.content, "content": assistant_message.content,
"tool_calls": [ "tool_calls": tool_calls_data
{
"id": tool_call.id,
"type": tool_call.type,
"function": {
"name": tool_call.function.name,
"arguments": tool_call.function.arguments
}
}
for tool_call in assistant_message.tool_calls
]
}) })
# Execute each tool call # Execute tool calls concurrently
tool_tasks = []
for i, tool_call in enumerate(assistant_message.tool_calls, 1): for i, tool_call in enumerate(assistant_message.tool_calls, 1):
function_name = tool_call.function.name function_name = tool_call.function.name
@@ -492,35 +639,55 @@ class AIAgent:
args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}") print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
# Create coroutine for tool execution
task = handle_function_call(function_name, function_args, effective_task_id)
tool_tasks.append(task)
if tool_tasks:
tool_start_time = time.time() tool_start_time = time.time()
# Execute the tool with task_id to isolate VMs between concurrent tasks # Execute all tools concurrently
function_result = handle_function_call(function_name, function_args, effective_task_id) # We use return_exceptions=True to ensure one failure doesn't stop others
# Order of results corresponds to order of tasks
results = await asyncio.gather(*tool_tasks, return_exceptions=True)
tool_duration = time.time() - tool_start_time tool_duration = time.time() - tool_start_time
result_preview = function_result[:200] if len(function_result) > 200 else function_result
# Record tool timing in profiler # Process results
get_profiler().record_tool_timing(function_name, tool_duration) for i, (result, tool_call) in enumerate(zip(results, assistant_message.tool_calls), 1):
function_name = tool_call.function.name
if self.verbose_logging: # Handle exceptions from asyncio.gather
logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s") if isinstance(result, Exception):
logging.debug(f"Tool result preview: {result_preview}...") function_result = json.dumps({"error": str(result)}, ensure_ascii=False)
print(f"❌ Tool {i} ({function_name}) failed: {result}")
else:
function_result = result
# Add tool result to conversation result_preview = function_result[:200] if len(function_result) > 200 else function_result
messages.append({
"role": "tool",
"content": function_result,
"tool_call_id": tool_call.id
})
# Preview tool response # Record tool timing in profiler (approximate since they ran in parallel)
response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result get_profiler().record_tool_timing(function_name, tool_duration)
print(f" ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")
# Delay between tool calls if self.verbose_logging:
if self.tool_delay > 0 and i < len(assistant_message.tool_calls): logging.debug(f"Tool {function_name} completed in parallel batch")
time.sleep(self.tool_delay) logging.debug(f"Tool result preview: {result_preview}...")
# Add tool result to conversation
# Note: thought_signature should NOT be in tool responses, only in assistant messages
messages.append({
"role": "tool",
"content": function_result,
"tool_call_id": tool_call.id
})
# Preview tool response
response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
print(f" ✅ Tool {i} completed - {response_preview}")
# Optional delay after batch execution
if self.tool_delay > 0:
await asyncio.sleep(self.tool_delay)
# Continue loop for next response # Continue loop for next response
continue continue
@@ -566,11 +733,79 @@ class AIAgent:
completed = final_response is not None and api_call_count < self.max_iterations completed = final_response is not None and api_call_count < self.max_iterations
# Save trajectory if enabled # Save trajectory if enabled
self._save_trajectory(messages, user_message, completed) # When saving trajectory, we want to show what the prompt would look like with proper tool roles
# This is helpful for training data or debugging
if self.save_trajectories:
# Use the client wrapper's format method if available to get the exact Hermes format
if hasattr(self, 'client') and hasattr(self.client, 'format'):
formatted_messages = self.client.format(messages, self.tools, render_final=True)
# client.format() returns messages whose content is already rendered in the Hermes format
# (system prompt with tool definitions, tool calls/results wrapped in XML), so we save that
# output directly instead of passing raw OAI messages through _convert_to_trajectory_format,
# which would double-format them. This records exactly what the model sees at inference time.
# Create a custom trajectory entry directly from the formatted messages
trajectory_content = []
for msg in formatted_messages:
role = msg["role"]
content = msg["content"]
# Map roles to trajectory format (human, gpt, system, tool)
if role == "user":
trajectory_role = "human"
elif role == "assistant":
trajectory_role = "gpt"
elif role == "system":
trajectory_role = "system"
elif role == "tool":
trajectory_role = "tool"
else:
trajectory_role = role
trajectory_content.append({
"from": trajectory_role,
"value": content
})
# Save this specific formatted trajectory
filename = "trajectory_samples.jsonl" if completed else "failed_trajectories.jsonl"
entry = {
"conversations": trajectory_content,
"timestamp": datetime.now().isoformat(),
"model": self.model,
"completed": completed
}
try:
with open(filename, "a", encoding="utf-8") as f:
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
print(f"💾 Trajectory saved to {filename} (using Hermes format)")
except Exception as e:
print(f"⚠️ Failed to save trajectory: {e}")
else:
# Fallback to original saving method
self._save_trajectory(messages, user_message, completed)
# Clean up VM for this task after conversation completes # Clean up VM for this task after conversation completes
try: try:
cleanup_vm(effective_task_id) await asyncio.to_thread(cleanup_vm, effective_task_id)
except Exception as e: except Exception as e:
if self.verbose_logging: if self.verbose_logging:
logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}") logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}")
@@ -586,7 +821,7 @@ class AIAgent:
"profiling_stats": profiling_stats "profiling_stats": profiling_stats
} }
def chat(self, message: str) -> str: async def chat(self, message: str) -> str:
""" """
Simple chat interface that returns just the final response. Simple chat interface that returns just the final response.
@@ -596,7 +831,7 @@ class AIAgent:
Returns: Returns:
str: Final assistant response str: Final assistant response
""" """
result = self.run_conversation(message) result = await self.run_conversation(message)
return result["final_response"] return result["final_response"]
@@ -612,7 +847,9 @@ def main(
save_trajectories: bool = False, save_trajectories: bool = False,
verbose: bool = False, verbose: bool = False,
log_prefix_chars: int = 20, log_prefix_chars: int = 20,
show_profiling: bool = True show_profiling: bool = True,
prokletor_client: str = None,
prokletor_formatter: str = None,
): ):
""" """
Main function for running the agent directly. Main function for running the agent directly.
@@ -632,6 +869,8 @@ def main(
verbose (bool): Enable verbose logging for debugging. Defaults to False. verbose (bool): Enable verbose logging for debugging. Defaults to False.
log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20. log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20.
show_profiling (bool): Display profiling statistics after conversation. Defaults to True. show_profiling (bool): Display profiling statistics after conversation. Defaults to True.
prokletor_client (str): Name of the prokletor client to use (e.g., "AsyncClaudeClient")
prokletor_formatter (str): Name of the prokletor formatter to use
Toolset Examples: Toolset Examples:
- "research": Web search, extract, crawl + vision tools - "research": Web search, extract, crawl + vision tools
@@ -750,7 +989,9 @@ def main(
disabled_toolsets=disabled_toolsets_list, disabled_toolsets=disabled_toolsets_list,
save_trajectories=save_trajectories, save_trajectories=save_trajectories,
verbose_logging=verbose, verbose_logging=verbose,
log_prefix_chars=log_prefix_chars log_prefix_chars=log_prefix_chars,
prokletor_client=prokletor_client,
prokletor_formatter=prokletor_formatter
) )
except RuntimeError as e: except RuntimeError as e:
print(f"❌ Failed to initialize agent: {e}") print(f"❌ Failed to initialize agent: {e}")
@@ -769,7 +1010,7 @@ def main(
print("\n" + "=" * 50) print("\n" + "=" * 50)
# Run conversation # Run conversation
result = agent.run_conversation(user_query) result = asyncio.run(agent.run_conversation(user_query))
print("\n" + "=" * 50) print("\n" + "=" * 50)
print("📋 CONVERSATION SUMMARY") print("📋 CONVERSATION SUMMARY")

View File

@@ -78,6 +78,7 @@ AGGREGATOR_TEMPERATURE = 0.4 # Focused synthesis for consistency
# Failure handling configuration # Failure handling configuration
MIN_SUCCESSFUL_REFERENCES = 1 # Minimum successful reference models needed to proceed MIN_SUCCESSFUL_REFERENCES = 1 # Minimum successful reference models needed to proceed
UNAVAILABLE_TOOL_RESPONSE = "This tool is not available"
# System prompt for the aggregator model (from the research paper) # System prompt for the aggregator model (from the research paper)
AGGREGATOR_SYSTEM_PROMPT = """You have been provided with a set of responses from various open-source models to the latest user query. Your task is to synthesize these responses into a single, high-quality response. It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased or incorrect. Your response should not simply replicate the given answers but should offer a refined, accurate, and comprehensive reply to the instruction. Ensure your response is well-structured, coherent, and adheres to the highest standards of accuracy and reliability. AGGREGATOR_SYSTEM_PROMPT = """You have been provided with a set of responses from various open-source models to the latest user query. Your task is to synthesize these responses into a single, high-quality response. It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased or incorrect. Your response should not simply replicate the given answers but should offer a refined, accurate, and comprehensive reply to the instruction. Ensure your response is well-structured, coherent, and adheres to the highest standards of accuracy and reliability.
@@ -364,14 +365,29 @@ async def mixture_of_agents_tool(
if failed_models: if failed_models:
print(f"⚠️ Failed models: {', '.join(failed_models)}") print(f"⚠️ Failed models: {', '.join(failed_models)}")
# Check if we have enough successful responses to proceed
if successful_count < MIN_SUCCESSFUL_REFERENCES:
raise ValueError(f"Insufficient successful reference models ({successful_count}/{len(ref_models)}). Need at least {MIN_SUCCESSFUL_REFERENCES} successful responses.")
debug_call_data["reference_responses_count"] = successful_count debug_call_data["reference_responses_count"] = successful_count
debug_call_data["failed_models_count"] = failed_count debug_call_data["failed_models_count"] = failed_count
debug_call_data["failed_models"] = failed_models debug_call_data["failed_models"] = failed_models
# Check if we have enough successful responses to proceed
if successful_count < MIN_SUCCESSFUL_REFERENCES:
print("🚫 MoA tool unavailable: insufficient successful reference models after retries")
result = {
"success": False,
"response": UNAVAILABLE_TOOL_RESPONSE,
"models_used": {
"reference_models": ref_models,
"aggregator_model": agg_model
}
}
debug_call_data["error"] = UNAVAILABLE_TOOL_RESPONSE
debug_call_data["models_used"] = result["models_used"]
processing_time = (datetime.datetime.now() - start_time).total_seconds()
debug_call_data["processing_time_seconds"] = processing_time
_log_debug_call("mixture_of_agents_tool", debug_call_data)
_save_debug_log()
return json.dumps(result, indent=2, ensure_ascii=False)
# Layer 2: Aggregate responses using the aggregator model # Layer 2: Aggregate responses using the aggregator model
print("🧠 Layer 2: Synthesizing final response...") print("🧠 Layer 2: Synthesizing final response...")
aggregator_system_prompt = _construct_aggregator_prompt( aggregator_system_prompt = _construct_aggregator_prompt(

View File

@@ -189,8 +189,13 @@ def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None)
ssh_context_manager = instance.ssh() ssh_context_manager = instance.ssh()
ssh_context = ssh_context_manager.__enter__() ssh_context = ssh_context_manager.__enter__()
# Execute the command # Execute the command. Using a PTY ensures stdout/stderr ordering matches
result = ssh_context.run(command, get_pty=False, timeout=timeout or 120) # what a human would see in a terminal session.
result = ssh_context.run(
command,
get_pty=True,
timeout=timeout or 120,
)
# Close the SSH connection # Close the SSH connection
if ssh_context_manager: if ssh_context_manager:
@@ -213,22 +218,12 @@ def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None)
except: except:
pass pass
# Check if it's a timeout
error_str = str(e).lower()
if "timeout" in error_str:
return {
"stdout": "",
"stderr": f"Command timed out after {timeout or 120} seconds",
"returncode": 124
}
return { return {
"stdout": "", "stdout": "",
"stderr": f"SSH execution failed: {str(e)}", "stderr": f"SSH execution failed: {str(e)}",
"returncode": -1 "returncode": -1
} }
def simple_terminal_tool( def simple_terminal_tool(
command: str, command: str,
background: bool = False, background: bool = False,
@@ -315,15 +310,21 @@ def simple_terminal_tool(
result = _execute_ssh_command(instance, exec_command, timeout=10) result = _execute_ssh_command(instance, exec_command, timeout=10)
# For background tasks, return immediately with info # For background tasks, return immediately with info
stderr_text = (result["stderr"] or "").strip()
if result["returncode"] == 0: if result["returncode"] == 0:
return json.dumps({ return json.dumps({
"output": "Background task started successfully", "output": "Background task started successfully",
"stderr": stderr_text,
"exit_code": 0, "exit_code": 0,
"error": None "error": None
}, ensure_ascii=False) }, ensure_ascii=False)
else: else:
output_text = result["stdout"] or ""
if result["stderr"] and not output_text:
output_text = result["stderr"]
return json.dumps({ return json.dumps({
"output": result["stdout"], "output": output_text,
"stderr": stderr_text,
"exit_code": result["returncode"], "exit_code": result["returncode"],
"error": result["stderr"] "error": result["stderr"]
}, ensure_ascii=False) }, ensure_ascii=False)
@@ -331,13 +332,13 @@ def simple_terminal_tool(
# Run foreground command # Run foreground command
result = _execute_ssh_command(instance, command, timeout=timeout) result = _execute_ssh_command(instance, command, timeout=timeout)
# Combine stdout and stderr for output output = result["stdout"] or ""
output = result["stdout"]
if result["stderr"] and result["returncode"] != 0: if result["stderr"] and result["returncode"] != 0:
output = f"{output}\n{result['stderr']}" if output else result["stderr"] output = f"{output}\n{result['stderr']}" if output else result["stderr"]
stderr_text = (result["stderr"] or "").strip()
return json.dumps({ return json.dumps({
"output": output.strip(), "output": output.strip(),
"stderr": stderr_text,
"exit_code": result["returncode"], "exit_code": result["returncode"],
"error": result["stderr"] if result["returncode"] != 0 else None "error": result["stderr"] if result["returncode"] != 0 else None
}, ensure_ascii=False) }, ensure_ascii=False)
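For consumers of this tool, a rough sketch of reading the reworked payload; the command and values shown are illustrative only, not output from a real run:

import json

raw = simple_terminal_tool("ls -la /tmp")
payload = json.loads(raw)
# The payload now carries a dedicated "stderr" field alongside "output", "exit_code", and "error",
# e.g. {"output": "...", "stderr": "", "exit_code": 0, "error": None}
print(payload["exit_code"], payload["stderr"])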

View File

@@ -48,11 +48,11 @@ import uuid
import datetime import datetime
from pathlib import Path from pathlib import Path
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from firecrawl import Firecrawl from firecrawl import AsyncFirecrawl
from openai import AsyncOpenAI from openai import AsyncOpenAI
# Initialize Firecrawl client once at module level # Initialize Firecrawl client once at module level
firecrawl_client = Firecrawl(api_key=os.getenv("FIRECRAWL_API_KEY")) firecrawl_client = AsyncFirecrawl(api_key=os.getenv("FIRECRAWL_API_KEY"))
# Initialize Nous Research API client for LLM processing (async) # Initialize Nous Research API client for LLM processing (async)
nous_client = AsyncOpenAI( nous_client = AsyncOpenAI(
@@ -261,7 +261,7 @@ def clean_base64_images(text: str) -> str:
return cleaned_text return cleaned_text
def web_search_tool(query: str, limit: int = 5) -> str: async def web_search_tool(query: str, limit: int = 5) -> str:
""" """
Search the web for information using available search API backend. Search the web for information using available search API backend.
@@ -312,7 +312,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
# Use Firecrawl's v2 search functionality WITHOUT scraping # Use Firecrawl's v2 search functionality WITHOUT scraping
# We only want search result metadata, not scraped content # We only want search result metadata, not scraped content
# Docs: https://docs.firecrawl.dev/features/search # Docs: https://docs.firecrawl.dev/features/search
response = firecrawl_client.search( response = await firecrawl_client.search(
query=query, query=query,
limit=limit limit=limit
) )
@@ -446,7 +446,7 @@ async def web_extract_tool(
for url in urls: for url in urls:
try: try:
print(f" 📄 Scraping: {url}") print(f" 📄 Scraping: {url}")
scrape_result = firecrawl_client.scrape( scrape_result = await firecrawl_client.scrape(
url=url, url=url,
formats=formats formats=formats
) )
@@ -703,7 +703,7 @@ async def web_crawl_tool(
# Use the crawl method which waits for completion automatically # Use the crawl method which waits for completion automatically
try: try:
crawl_result = firecrawl_client.crawl( crawl_result = await firecrawl_client.crawl(
url=url, url=url,
**crawl_params **crawl_params
) )