add profiling

add linewise profiling
don't log exit code !=0 as terminal failure
2026-05-04 17:57:28 +08:00 · 2025-11-18 07:12:05 -05:00 · 2025-11-17 23:21:36 -05:00 · 2025-11-17 18:39:16 -05:00 · 2025-11-17 18:35:36 -05:00 · 2025-11-17 01:14:31 -05:00
10 changed files with 1563 additions and 218 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,12 @@ data/
 .pytest_cache/
 tmp/
 temp_vision_images/
 hermes-*/*
 examples/
 tests/quick_test_dataset.jsonl
 tests/sample_dataset.jsonl
 run_datagen_kimik2-thinking.sh
 run_datagen_megascience_glm4-6.sh
 run_datagen_sonnet.sh
 source-data/*
 run_datagen_megascience_glm4-6.sh
--- a/batch_runner.py
+++ b/batch_runner.py
@@ -9,6 +9,8 @@ across multiple prompts from a dataset. It includes:
 - Checkpointing for fault tolerance and resumption
 - Trajectory saving in the proper format (from/value pairs)
 - Tool usage statistics aggregation across all batches
 - Cluster failure detection and graceful shutdown (morph, firecrawl, API errors)
 - Configurable failure thresholds with automatic data consolidation
 Usage:
    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run
@@ -18,6 +20,10 @@ Usage:
    # Use a specific toolset distribution
    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run --distribution=image_gen
    # Configure tool failure thresholds
    python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
                           --max_tool_failures=20 --max_tool_failure_rate=0.3 --min_tool_calls_for_rate=10
 """
 import json
@@ -29,6 +35,7 @@ from typing import List, Dict, Any, Optional, Tuple
 from datetime import datetime
 from multiprocessing import Pool, Manager, Lock
 import traceback
 import re
 import fire
@@ -39,11 +46,166 @@ from toolset_distributions import (
    sample_toolsets_from_distribution,
    validate_distribution
 )
 from safe_print import safe_print
 # Global configuration for worker processes
 _WORKER_CONFIG = {}
 # Canonical names for the terminal tool (old & new implementations)
 _TERMINAL_TOOL_NAMES = {"terminal", "terminal_tool", "simple_terminal_tool"}
 def _is_terminal_tool_name(tool_name: Optional[str]) -> bool:
    """Return True if the given tool name corresponds to a terminal tool."""
    return bool(tool_name) and tool_name.lower() in _TERMINAL_TOOL_NAMES
 def _terminal_tool_failed(content_json: Dict[str, Any]) -> bool:
    """
    Determine whether the terminal tool itself failed (not the user command).
    Terminal failures are indicated by explicit status flags or negative exit codes.
    Regular command failures (non-zero positive exit codes, stderr, timeouts) are not counted.
    """
    if not isinstance(content_json, dict):
        return False
    status = str(content_json.get("status", "")).lower()
    if status in {"error", "disabled"}:
        return True
    exit_code = content_json.get("exit_code")
    if isinstance(exit_code, int) and exit_code < 0:
        return True
    return False
 def _categorize_error_type(error_message: str) -> str:
    """
    Categorize an error message into a failure type.
    Args:
        error_message (str): The error message to categorize
    Returns:
        str: Category of the error
    """
    error_lower = error_message.lower()
    # Common error patterns
    if "timeout" in error_lower or "timed out" in error_lower:
        return "Timeout"
    elif "connection" in error_lower or "connect" in error_lower:
        return "Connection Error"
    elif "rate limit" in error_lower or "ratelimit" in error_lower or "429" in error_lower:
        return "Rate Limit"
    elif "authentication" in error_lower or "auth" in error_lower or "unauthorized" in error_lower or "401" in error_lower:
        return "Authentication"
    elif "not found" in error_lower or "404" in error_lower:
        return "Not Found"
    elif "permission" in error_lower or "forbidden" in error_lower or "403" in error_lower:
        return "Permission Denied"
    elif "invalid" in error_lower or "malformed" in error_lower or "bad request" in error_lower or "400" in error_lower:
        return "Invalid Input"
    elif "out of memory" in error_lower or "oom" in error_lower:
        return "Out of Memory"
    elif "network" in error_lower:
        return "Network Error"
    elif "server error" in error_lower or "500" in error_lower or "502" in error_lower or "503" in error_lower:
        return "Server Error"
    elif "vm" in error_lower and ("fail" in error_lower or "error" in error_lower):
        return "VM Error"
    else:
        return "Other"
 def _extract_tool_errors_from_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Extract tool errors from message history with tool names.
    Args:
        messages (List[Dict]): Message history
    Returns:
        List[Dict]: List of tool errors with tool name, error message, error type, and context
    """
    tool_errors = []
    tool_calls_map = {}  # Map tool_call_id to tool name
    for msg in messages:
        # Track tool calls from assistant messages
        if msg["role"] == "assistant" and "tool_calls" in msg and msg["tool_calls"]:
            for tool_call in msg["tool_calls"]:
                tool_name = tool_call["function"]["name"]
                tool_call_id = tool_call["id"]
                tool_calls_map[tool_call_id] = tool_name
        # Check tool responses for errors
        elif msg["role"] == "tool":
            tool_call_id = msg.get("tool_call_id", "")
            content = msg.get("content", "")
            # Determine if tool call had an error
            has_error = False
            error_msg = None
            try:
                content_json = json.loads(content) if isinstance(content, str) else content
                if isinstance(content_json, dict):
                    # Get tool name for special handling
                    tool_name = tool_calls_map.get(tool_call_id, "unknown")
                    # Special handling for terminal tool outputs
                    if _is_terminal_tool_name(tool_name):
                        if _terminal_tool_failed(content_json):
                            has_error = True
                            # Prefer explicit error text, fall back to status or generic message
                            error_msg = str(
                                content_json.get("error")
                                or content_json.get("status")
                                or "Terminal tool failure"
                            )
                    else:
                        # For other tools, check if error field exists AND has a non-null value
                        if "error" in content_json and content_json["error"] is not None:
                            has_error = True
                            error_msg = str(content_json["error"])
                        # Check nested content structure (some tools wrap responses)
                        if "content" in content_json and isinstance(content_json["content"], dict):
                            inner_content = content_json["content"]
                            if inner_content.get("error") is not None:
                                has_error = True
                                error_msg = inner_content.get("error")
                        # Check for "success": false pattern
                        if content_json.get("success") is False:
                            has_error = True
                            if not error_msg:
                                error_msg = str(content_json.get("message", content_json.get("error", "Unknown error")))
            except:
                # If not JSON, check if content explicitly states an error
                if content.strip().lower().startswith("error:"):
                    has_error = True
                    error_msg = content.strip()
            # Record error if found
            if has_error and tool_call_id in tool_calls_map:
                tool_name = tool_calls_map[tool_call_id]
                error_message = error_msg or "Unknown error"
                tool_errors.append({
                    "tool_name": tool_name,
                    "error_message": error_message,
                    "error_type": _categorize_error_type(error_message),
                    "full_content": content[:500]  # Keep first 500 chars of full response
                })
    return tool_errors
 def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, int]]:
    """
@@ -90,23 +252,28 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
                content_json = json.loads(content) if isinstance(content, str) else content
                if isinstance(content_json, dict):
-                    # Check if error field exists AND has a non-null value
+                    # Get tool name for special handling
-                    if "error" in content_json and content_json["error"] is not None:
+                    tool_name = tool_calls_map.get(tool_call_id, "unknown")
                        is_success = False
-                    # Special handling for terminal tool responses
+                    # Special handling for terminal tool: only count as failure when the tool itself fails
-                    # Terminal wraps its response in a "content" field
+                    if _is_terminal_tool_name(tool_name):
-                    if "content" in content_json and isinstance(content_json["content"], dict):
+                        if _terminal_tool_failed(content_json):
-                        inner_content = content_json["content"]
+                            is_success = False
-                        # Check for actual error (non-null error field or non-zero exit code)
+                    else:
-                        has_error = (inner_content.get("error") is not None or 
+                        # For other tools, check if error field exists AND has a non-null value
-                                   inner_content.get("exit_code", 0) != 0)
+                        if "error" in content_json and content_json["error"] is not None:
                        if has_error:
                            is_success = False
-                    # Check for "success": false pattern used by some tools
+                        # Check nested content structure (some tools wrap responses)
-                    if content_json.get("success") is False:
+                        if "content" in content_json and isinstance(content_json["content"], dict):
-                        is_success = False
+                            inner_content = content_json["content"]
                            # Check for actual error (non-null error field)
                            if inner_content.get("error") is not None:
                                is_success = False
                        # Check for "success": false pattern used by some tools
                        if content_json.get("success") is False:
                            is_success = False
            except:
                # If not JSON, check if content is empty or explicitly states an error
@@ -164,7 +331,8 @@ def _process_single_prompt(
            enabled_toolsets=selected_toolsets,
            save_trajectories=False,  # We handle saving ourselves
            verbose_logging=config.get("verbose", False),
-            ephemeral_system_prompt=config.get("ephemeral_system_prompt")
+            ephemeral_system_prompt=config.get("ephemeral_system_prompt"),
            log_prefix_chars=config.get("log_prefix_chars", 100)
        )
        # Run the agent with task_id to ensure each task gets its own isolated VM
@@ -173,6 +341,9 @@ def _process_single_prompt(
        # Extract tool usage statistics
        tool_stats = _extract_tool_stats(result["messages"])
        # Extract tool errors from conversation
        tool_errors = _extract_tool_errors_from_messages(result["messages"])
        # Convert to trajectory format (using existing method)
        trajectory = agent._convert_to_trajectory_format(
            result["messages"],
@@ -180,11 +351,16 @@ def _process_single_prompt(
            result["completed"]
        )
        # Get profiling stats from the result
        profiling_stats = result.get("profiling_stats", {"tools": {}, "api_calls": {}})
        return {
            "success": True,
            "prompt_index": prompt_index,
            "trajectory": trajectory,
            "tool_stats": tool_stats,
            "tool_errors": tool_errors,
            "profiling_stats": profiling_stats,
            "completed": result["completed"],
            "api_calls": result["api_calls"],
            "toolsets_used": selected_toolsets,
@@ -196,14 +372,19 @@ def _process_single_prompt(
        }
    except Exception as e:
-        print(f"❌ Error processing prompt {prompt_index}: {e}")
+        error_msg = str(e)
        tb = traceback.format_exc()
        safe_print(f"[bold red]❌ Error processing prompt {prompt_index}:[/bold red] {error_msg}")
        if config.get("verbose"):
-            traceback.print_exc()
+            safe_print(tb)
        return {
            "success": False,
            "prompt_index": prompt_index,
-            "error": str(e),
+            "error": error_msg,
            "traceback": tb,
            "tool_errors": [],
            "profiling_stats": {"tools": {}, "api_calls": {}},
            "trajectory": None,
            "tool_stats": {},
            "toolsets_used": [],
@@ -252,7 +433,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
    # Initialize aggregated stats for this batch
    batch_tool_stats = {}
    batch_profiling_stats = []  # Collect profiling stats from each prompt
    completed_in_batch = []
    all_tool_errors = []  # Track all tool errors in this batch
    exception_errors = []  # Track top-level exceptions
    # Process each prompt sequentially in this batch
    for prompt_index, prompt_data in prompts_to_process:
@@ -264,6 +448,26 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
            config
        )
        # Track tool errors from the conversation
        if result.get("tool_errors"):
            for tool_error in result["tool_errors"]:
                all_tool_errors.append({
                    "prompt_index": prompt_index,
                    "tool_name": tool_error["tool_name"],
                    "error_message": tool_error["error_message"],
                    "full_content": tool_error.get("full_content", ""),
                    "error_type": tool_error.get("error_type", "Other")
                })
        # Track top-level exceptions (not tool errors)
        if not result["success"]:
            exception_errors.append({
                "prompt_index": prompt_index,
                "error": result.get("error", "Unknown error"),
                "traceback": result.get("traceback", "")
            })
            safe_print(f"[bold red]❌ Exception in prompt {prompt_index}:[/bold red] {result.get('error', '')[:100]}")
        # Save trajectory if successful
        if result["success"] and result["trajectory"]:
            trajectory_entry = {
@@ -292,6 +496,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
            batch_tool_stats[tool_name]["success"] += stats["success"]
            batch_tool_stats[tool_name]["failure"] += stats["failure"]
        # Collect profiling statistics
        if result.get("profiling_stats"):
            batch_profiling_stats.append(result["profiling_stats"])
        completed_in_batch.append(prompt_index)
        print(f"   ✅ Prompt {prompt_index} completed")
@@ -302,7 +510,10 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
        "processed": len(prompts_to_process),
        "skipped": len(batch_data) - len(prompts_to_process),
        "tool_stats": batch_tool_stats,
-        "completed_prompts": completed_in_batch
+        "profiling_stats": batch_profiling_stats,
        "completed_prompts": completed_in_batch,
        "tool_errors": all_tool_errors,
        "exception_errors": exception_errors
    }
@@ -323,7 +534,12 @@ class BatchRunner:
        model: str = "claude-opus-4-20250514",
        num_workers: int = 4,
        verbose: bool = False,
-        ephemeral_system_prompt: str = None
+        ephemeral_system_prompt: str = None,
        log_prefix_chars: int = 100,
        max_tool_failures: int = 10,
        max_tool_failure_rate: float = 0.5,
        keep_recent_errors: int = 5,
        min_tool_calls_for_rate: int = 10,
    ):
        """
        Initialize the batch runner.
@@ -340,6 +556,11 @@ class BatchRunner:
            num_workers (int): Number of parallel workers
            verbose (bool): Enable verbose logging
            ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
            log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 20)
            max_tool_failures (int): Maximum number of tool failures before stopping (default: 10)
            max_tool_failure_rate (float): Maximum tool failure rate (0.0-1.0) before stopping (default: 0.5)
            keep_recent_errors (int): Number of recent errors to keep per tool (default: 5)
            min_tool_calls_for_rate (int): Minimum number of tool calls before checking failure rate (default: 10)
        """
        self.dataset_file = Path(dataset_file)
        self.batch_size = batch_size
@@ -352,6 +573,11 @@ class BatchRunner:
        self.num_workers = num_workers
        self.verbose = verbose
        self.ephemeral_system_prompt = ephemeral_system_prompt
        self.log_prefix_chars = log_prefix_chars
        self.max_tool_failures = max_tool_failures
        self.max_tool_failure_rate = max_tool_failure_rate
        self.keep_recent_errors = keep_recent_errors
        self.min_tool_calls_for_rate = min_tool_calls_for_rate
        # Validate distribution
        if not validate_distribution(distribution):
@@ -367,23 +593,31 @@ class BatchRunner:
        # Statistics file
        self.stats_file = self.output_dir / "statistics.json"
        # Errors file
        self.errors_file = self.output_dir / "errors.json"
        # Load dataset
        self.dataset = self._load_dataset()
        # Create batches
        self.batches = self._create_batches()
-        print(f"📊 Batch Runner Initialized")
+        safe_print("[bold cyan]📊 Batch Runner Initialized[/bold cyan]")
-        print(f"   Dataset: {self.dataset_file} ({len(self.dataset)} prompts)")
+        safe_print(f"   Dataset: {self.dataset_file} ({len(self.dataset)} prompts)")
-        print(f"   Batch size: {self.batch_size}")
+        safe_print(f"   Batch size: {self.batch_size}")
-        print(f"   Total batches: {len(self.batches)}")
+        safe_print(f"   Total batches: {len(self.batches)}")
-        print(f"   Run name: {self.run_name}")
+        safe_print(f"   Run name: {self.run_name}")
-        print(f"   Distribution: {self.distribution}")
+        safe_print(f"   Distribution: {self.distribution}")
-        print(f"   Output directory: {self.output_dir}")
+        safe_print(f"   Output directory: {self.output_dir}")
-        print(f"   Workers: {self.num_workers}")
+        safe_print(f"   Workers: {self.num_workers}")
        safe_print(f"   [yellow]Tool failure limits:[/yellow]")
        safe_print(f"      Max failures: {self.max_tool_failures}")
        safe_print(f"      Max failure rate: {self.max_tool_failure_rate:.1%}")
        safe_print(f"      Min tool calls for rate check: {self.min_tool_calls_for_rate}")
        safe_print(f"      Keep recent errors: {self.keep_recent_errors}")
        if self.ephemeral_system_prompt:
            prompt_preview = self.ephemeral_system_prompt[:60] + "..." if len(self.ephemeral_system_prompt) > 60 else self.ephemeral_system_prompt
-            print(f"   🔒 Ephemeral system prompt: '{prompt_preview}'")
+            safe_print(f"   🔒 Ephemeral system prompt: '{prompt_preview}'")
    def _load_dataset(self) -> List[Dict[str, Any]]:
        """
@@ -476,6 +710,118 @@ class BatchRunner:
            with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
                json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
    def _consolidate_data(self, num_batches: int, tool_stats: Dict[str, Dict[str, int]],
                          start_time: float, tool_errors_by_tool: Dict[str, List[Dict]],
                          exception_errors: List[Dict], early_exit: bool = False, exit_reason: str = None,
                          profiling_stats_list: List[Dict] = None):
        """
        Consolidate batch data into trajectories.jsonl and save statistics.
        Args:
            num_batches (int): Number of batches processed
            tool_stats (Dict): Aggregated tool statistics
            start_time (float): Start time of the run
            tool_errors_by_tool (Dict): Tool errors grouped by tool name with k most recent
            exception_errors (List): Top-level exceptions
            early_exit (bool): Whether this is an early exit
            exit_reason (str): Reason for early exit
            profiling_stats_list (List[Dict]): List of profiling statistics from each conversation
        """
        # Combine all batch files into a single trajectories.jsonl file
        combined_file = self.output_dir / "trajectories.jsonl"
        safe_print(f"\n[cyan]📦 Combining batch files into {combined_file.name}...[/cyan]")
        entries_written = 0
        with open(combined_file, 'w', encoding='utf-8') as outfile:
            for batch_num in range(num_batches):
                batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
                if batch_file.exists():
                    with open(batch_file, 'r', encoding='utf-8') as infile:
                        for line in infile:
                            outfile.write(line)
                            entries_written += 1
        safe_print(f"[green]✅ Combined {num_batches} batch files into trajectories.jsonl ({entries_written} entries)[/green]")
        # Calculate success rates for tool stats
        for tool_name in tool_stats:
            stats = tool_stats[tool_name]
            total_calls = stats["success"] + stats["failure"]
            if total_calls > 0:
                stats["success_rate"] = round(stats["success"] / total_calls * 100, 2)
                stats["failure_rate"] = round(stats["failure"] / total_calls * 100, 2)
            else:
                stats["success_rate"] = 0.0
                stats["failure_rate"] = 0.0
        # Build failure type breakdown for each tool
        failure_type_breakdown = {}
        for tool_name, errors in tool_errors_by_tool.items():
            failure_types = {}
            for error in errors:
                error_type = error.get("error_type", "Other")
                if error_type not in failure_types:
                    failure_types[error_type] = 0
                failure_types[error_type] += 1
            # Calculate percentages
            total_failures = len(errors)
            failure_type_breakdown[tool_name] = {
                "total_failures": total_failures,
                "types": {
                    error_type: {
                        "count": count,
                        "percentage": round((count / total_failures) * 100, 2)
                    }
                    for error_type, count in failure_types.items()
                }
            }
        # Save error information to separate file
        error_data = {
            "run_name": self.run_name,
            "completed_at": datetime.now().isoformat(),
            "total_tool_errors": sum(len(errors) for errors in tool_errors_by_tool.values()),
            "total_exception_errors": len(exception_errors),
            "tool_errors": tool_errors_by_tool,
            "failure_type_breakdown": failure_type_breakdown,
            "exception_errors": exception_errors[:self.keep_recent_errors]  # Keep k most recent
        }
        with open(self.errors_file, 'w', encoding='utf-8') as f:
            json.dump(error_data, f, indent=2, ensure_ascii=False)
        # Aggregate profiling statistics if available
        aggregated_profiling_stats = None
        if profiling_stats_list:
            from profiling import aggregate_profiling_stats
            aggregated_profiling_stats = aggregate_profiling_stats(profiling_stats_list)
        # Save final statistics (without detailed errors)
        final_stats = {
            "run_name": self.run_name,
            "distribution": self.distribution,
            "total_prompts": len(self.dataset),
            "total_batches": len(self.batches),
            "batches_processed": num_batches,
            "batch_size": self.batch_size,
            "model": self.model,
            "completed_at": datetime.now().isoformat(),
            "duration_seconds": round(time.time() - start_time, 2),
            "early_exit": early_exit,
            "exit_reason": exit_reason,
            "tool_statistics": tool_stats,
            "profiling_statistics": aggregated_profiling_stats
        }
        with open(self.stats_file, 'w', encoding='utf-8') as f:
            json.dump(final_stats, f, indent=2, ensure_ascii=False)
        # Display aggregated profiling statistics
        if aggregated_profiling_stats:
            from profiling import print_aggregated_statistics
            print_aggregated_statistics(aggregated_profiling_stats, detailed=True)
    def run(self, resume: bool = False):
        """
@@ -507,7 +853,8 @@ class BatchRunner:
            "base_url": self.base_url,
            "api_key": self.api_key,
            "verbose": self.verbose,
-            "ephemeral_system_prompt": self.ephemeral_system_prompt
+            "ephemeral_system_prompt": self.ephemeral_system_prompt,
            "log_prefix_chars": self.log_prefix_chars
        }
        # Get completed prompts set
@@ -515,6 +862,14 @@ class BatchRunner:
        # Aggregate statistics across all batches
        total_tool_stats = {}
        all_profiling_stats = []  # Collect all profiling stats for aggregation
        tool_errors_by_tool = {}  # {tool_name: [list of k most recent errors]}
        all_exception_errors = []
        all_completed_prompts = list(completed_prompts_set)
        total_processed = len(completed_prompts_set)
        total_tool_errors = 0
        early_exit = False
        exit_reason = None
        start_time = time.time()
@@ -532,82 +887,182 @@ class BatchRunner:
                for batch_num, batch_data in enumerate(self.batches)
            ]
-            # Use map to process batches in parallel
+            # Process batches in parallel and check tool failure threshold as results come in
-            results = pool.map(_process_batch_worker, tasks)
+            # imap_unordered allows parallel processing while getting results as they complete
            batch_num = 0
            try:
                for result in pool.imap_unordered(_process_batch_worker, tasks):
                    # Update statistics
                    all_completed_prompts.extend(result.get("completed_prompts", []))
                    total_processed += result.get("processed", 0)
-        # Aggregate all batch statistics and update checkpoint
+                    # Aggregate tool stats
-        all_completed_prompts = list(completed_prompts_set)
+                    for tool_name, stats in result.get("tool_stats", {}).items():
-        for batch_result in results:
+                        if tool_name not in total_tool_stats:
-            # Add newly completed prompts
+                            total_tool_stats[tool_name] = {
-            all_completed_prompts.extend(batch_result.get("completed_prompts", []))
+                                "count": 0,
                                "success": 0,
                                "failure": 0
                            }
-            # Aggregate tool stats
+                        total_tool_stats[tool_name]["count"] += stats["count"]
-            for tool_name, stats in batch_result.get("tool_stats", {}).items():
+                        total_tool_stats[tool_name]["success"] += stats["success"]
-                if tool_name not in total_tool_stats:
+                        total_tool_stats[tool_name]["failure"] += stats["failure"]
                    total_tool_stats[tool_name] = {
                        "count": 0,
                        "success": 0,
                        "failure": 0
                    }
-                total_tool_stats[tool_name]["count"] += stats["count"]
+                    # Collect profiling stats from this batch
-                total_tool_stats[tool_name]["success"] += stats["success"]
+                    if result.get("profiling_stats"):
-                total_tool_stats[tool_name]["failure"] += stats["failure"]
+                        all_profiling_stats.extend(result["profiling_stats"])
                    # Aggregate tool errors (keep k most recent per tool)
                    for tool_error in result.get("tool_errors", []):
                        tool_name = tool_error["tool_name"]
                        if tool_name not in tool_errors_by_tool:
                            tool_errors_by_tool[tool_name] = []
                        # Add error and keep only k most recent
                        tool_errors_by_tool[tool_name].append(tool_error)
                        if len(tool_errors_by_tool[tool_name]) > self.keep_recent_errors:
                            tool_errors_by_tool[tool_name] = tool_errors_by_tool[tool_name][-self.keep_recent_errors:]
                        total_tool_errors += 1
                    # Track exception errors
                    all_exception_errors.extend(result.get("exception_errors", []))
                    # Check tool failure thresholds
                    # Calculate total tool calls (not prompts)
                    total_tool_calls = sum(stats["count"] for stats in total_tool_stats.values())
                    # Check absolute count threshold
                    if total_tool_errors >= self.max_tool_failures:
                        early_exit = True
                        exit_reason = f"Exceeded maximum tool failures ({total_tool_errors}/{self.max_tool_failures})"
                        safe_print(f"\n[bold red]🛑 STOPPING: {exit_reason}[/bold red]")
                        pool.terminate()  # Stop all workers immediately
                        break
                    # Check rate threshold (only if we have enough tool calls to trust the rate)
                    if total_tool_calls >= self.min_tool_calls_for_rate:
                        tool_failure_rate = total_tool_errors / total_tool_calls
                        if tool_failure_rate >= self.max_tool_failure_rate:
                            early_exit = True
                            exit_reason = f"Exceeded tool failure rate ({tool_failure_rate:.2%} >= {self.max_tool_failure_rate:.2%}, {total_tool_errors}/{total_tool_calls} tool calls)"
                            safe_print(f"\n[bold red]🛑 STOPPING: {exit_reason}[/bold red]")
                            pool.terminate()  # Stop all workers immediately
                            break
                    # Update checkpoint after each batch completes
                    checkpoint_data["completed_prompts"] = all_completed_prompts
                    self._save_checkpoint(checkpoint_data)
                    batch_num += 1
            except KeyboardInterrupt:
                safe_print("\n[bold yellow]⚠️  Interrupted by user, stopping workers...[/bold yellow]")
                pool.terminate()
                early_exit = True
                exit_reason = "Interrupted by user"
        # Save final checkpoint
        checkpoint_data["completed_prompts"] = all_completed_prompts
        self._save_checkpoint(checkpoint_data)
-        # Calculate success rates
+        # Consolidate data and save statistics
-        for tool_name in total_tool_stats:
+        num_batches_processed = batch_num + 1 if early_exit else len(self.batches)
-            stats = total_tool_stats[tool_name]
+        self._consolidate_data(
-            total_calls = stats["success"] + stats["failure"]
+            num_batches_processed,
-            if total_calls > 0:
+            total_tool_stats,
-                stats["success_rate"] = round(stats["success"] / total_calls * 100, 2)
+            start_time,
-                stats["failure_rate"] = round(stats["failure"] / total_calls * 100, 2)
+            tool_errors_by_tool,
-            else:
+            all_exception_errors,
-                stats["success_rate"] = 0.0
+            early_exit,
-                stats["failure_rate"] = 0.0
+            exit_reason,
-        
+            all_profiling_stats
-        # Combine all batch files into a single trajectories.jsonl file
+        )
        combined_file = self.output_dir / "trajectories.jsonl"
        print(f"\n📦 Combining batch files into {combined_file.name}...")
        with open(combined_file, 'w', encoding='utf-8') as outfile:
            for batch_num in range(len(self.batches)):
                batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
                if batch_file.exists():
                    with open(batch_file, 'r', encoding='utf-8') as infile:
                        for line in infile:
                            outfile.write(line)
        print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl")
        # Save final statistics
        final_stats = {
            "run_name": self.run_name,
            "distribution": self.distribution,
            "total_prompts": len(self.dataset),
            "total_batches": len(self.batches),
            "batch_size": self.batch_size,
            "model": self.model,
            "completed_at": datetime.now().isoformat(),
            "duration_seconds": round(time.time() - start_time, 2),
            "tool_statistics": total_tool_stats
        }
        with open(self.stats_file, 'w', encoding='utf-8') as f:
            json.dump(final_stats, f, indent=2, ensure_ascii=False)
        # Print summary
-        print("\n" + "=" * 70)
+        safe_print("\n" + "=" * 70)
-        print("📊 BATCH PROCESSING COMPLETE")
+        if early_exit:
-        print("=" * 70)
+            safe_print("[bold yellow]⚠️  BATCH PROCESSING STOPPED EARLY[/bold yellow]")
-        print(f"✅ Total prompts processed: {len(self.dataset)}")
+            safe_print(f"[yellow]Reason: {exit_reason}[/yellow]")
-        print(f"✅ Total batches: {len(self.batches)}")
+        else:
-        print(f"⏱️  Total duration: {round(time.time() - start_time, 2)}s")
+            safe_print("[bold green]📊 BATCH PROCESSING COMPLETE[/bold green]")
-        print(f"\n📈 Tool Usage Statistics:")
+        safe_print("=" * 70)
-        print("-" * 70)
+
        safe_print(f"✅ Total prompts processed: {total_processed}")
        safe_print(f"✅ Batches completed: {num_batches_processed}/{len(self.batches)}")
        safe_print(f"⏱️  Total duration: {round(time.time() - start_time, 2)}s")
        # Tool error summary
        if tool_errors_by_tool:
            total_errors = sum(len(errors) for errors in tool_errors_by_tool.values())
            safe_print(f"\n[bold red]🚨 Tool Errors: {total_tool_errors} total ({len(tool_errors_by_tool)} tools)[/bold red]")
            safe_print("[red]-[/red]" * 70)
            # Sort tools by error count
            sorted_tools = sorted(
                tool_errors_by_tool.items(),
                key=lambda x: len(x[1]),
                reverse=True
            )
            for tool_name, errors in sorted_tools:
                # Count unique error messages
                unique_errors = {}
                for error in errors:
                    error_msg = error["error_message"][:100]  # Truncate for grouping
                    if error_msg not in unique_errors:
                        unique_errors[error_msg] = []
                    unique_errors[error_msg].append(error)
                safe_print(f"\n  [red]{tool_name}:[/red] {len(errors)} errors ({len(unique_errors)} unique)")
                # Show up to 3 most recent unique error types
                for idx, (error_msg, instances) in enumerate(list(unique_errors.items())[:3]):
                    error_preview = error_msg if len(error_msg) <= 100 else error_msg[:97] + "..."
                    safe_print(f"    [{idx+1}] [dim]{error_preview}[/dim] (x{len(instances)})")
                    # Show one example with prompt index and full content prefix
                    example = instances[-1]  # Most recent
                    safe_print(f"        [dim]Prompt {example['prompt_index']}[/dim]")
                    # Show full content prefix (first 200 chars)
                    full_content = example.get('full_content', '')
                    if full_content and full_content != error_preview:
                        content_preview = full_content[:200]
                        if len(full_content) > 200:
                            content_preview += "..."
                        # Show with prefix indicator
                        safe_print(f"        [dim]Content: {content_preview}[/dim]")
                if len(unique_errors) > 3:
                    safe_print(f"    [dim]... and {len(unique_errors) - 3} more error types[/dim]")
            tool_failure_rate = total_tool_errors / total_processed if total_processed > 0 else 0
            safe_print(f"\n  [red]Tool failure rate: {tool_failure_rate:.2%}[/red]")
        # Exception errors
        if all_exception_errors:
            safe_print(f"\n[bold red]💥 Top-level Exceptions: {len(all_exception_errors)}[/bold red]")
            safe_print("[red]-[/red]" * 70)
            for error in all_exception_errors[:self.keep_recent_errors]:
                error_msg = error["error"]
                error_preview = error_msg[:150]
                if len(error_msg) > 150:
                    error_preview += "..."
                safe_print(f"  [red]Prompt {error['prompt_index']}:[/red] [dim]{error_preview}[/dim]")
                # Show traceback prefix if available
                traceback_text = error.get("traceback", "")
                if traceback_text:
                    # Show last 3 lines of traceback for context
                    tb_lines = traceback_text.strip().split('\n')
                    relevant_lines = tb_lines[-3:] if len(tb_lines) > 3 else tb_lines
                    for line in relevant_lines:
                        safe_print(f"    [dim]{line}[/dim]")
        safe_print(f"\n[cyan]📈 Tool Usage Statistics:[/cyan]")
        safe_print("-" * 70)
        if total_tool_stats:
            # Sort by count descending
@@ -617,24 +1072,67 @@ class BatchRunner:
                reverse=True
            )
-            print(f"{'Tool Name':<25} {'Count':<10} {'Success':<10} {'Failure':<10} {'Success Rate':<12}")
+            safe_print(f"{'Tool Name':<25} {'Count':<10} {'Success':<10} {'Failure':<10} {'Success Rate':<12}")
-            print("-" * 70)
+            safe_print("-" * 70)
            for tool_name, stats in sorted_tools:
-                print(
+                safe_print(
                    f"{tool_name:<25} "
                    f"{stats['count']:<10} "
                    f"{stats['success']:<10} "
                    f"{stats['failure']:<10} "
-                    f"{stats['success_rate']:.1f}%"
+                    f"{stats.get('success_rate', 0):.1f}%"
                )
        else:
-            print("No tool calls were made during this run.")
+            safe_print("No tool calls were made during this run.")
-        print(f"\n💾 Results saved to: {self.output_dir}")
+        # Display failure type breakdown for tools with failures
-        print(f"   - Trajectories: trajectories.jsonl (combined)")
+        if tool_errors_by_tool:
-        print(f"   - Individual batches: batch_*.jsonl (for debugging)")
+            safe_print(f"\n[cyan]📊 Failure Type Breakdown:[/cyan]")
-        print(f"   - Statistics: {self.stats_file.name}")
+            safe_print("-" * 70)
-        print(f"   - Checkpoint: {self.checkpoint_file.name}")
+
            # Sort tools by total error count
            sorted_tools = sorted(
                tool_errors_by_tool.items(),
                key=lambda x: len(x[1]),
                reverse=True
            )
            for tool_name, errors in sorted_tools:
                # Count failure types for this tool
                failure_types = {}
                for error in errors:
                    error_type = error.get("error_type", "Other")
                    if error_type not in failure_types:
                        failure_types[error_type] = 0
                    failure_types[error_type] += 1
                # Display tool name and total failures
                total_failures = len(errors)
                safe_print(f"\n[yellow]{tool_name}[/yellow] ({total_failures} failures):")
                # Sort failure types by count
                sorted_types = sorted(
                    failure_types.items(),
                    key=lambda x: x[1],
                    reverse=True
                )
                # Display each failure type with count and percentage
                for failure_type, count in sorted_types:
                    percentage = (count / total_failures) * 100
                    safe_print(f"  • {failure_type:<20} {count:>4} ({percentage:>5.1f}%)")
        safe_print(f"\n[cyan]💾 Results saved to:[/cyan] {self.output_dir}")
        safe_print(f"   - Trajectories: trajectories.jsonl (combined)")
        safe_print(f"   - Individual batches: batch_*.jsonl (for debugging)")
        safe_print(f"   - Statistics: {self.stats_file.name}")
        safe_print(f"   - Errors: {self.errors_file.name}")
        safe_print(f"   - Checkpoint: {self.checkpoint_file.name}")
        if early_exit:
            safe_print(f"\n[bold yellow]ℹ️  Run was stopped early due to tool failures.[/bold yellow]")
            safe_print(f"[yellow]   Check {self.errors_file.name} for detailed error information including tracebacks.[/yellow]")
            safe_print(f"[yellow]   You can resume this run later with --resume flag.[/yellow]")
 def main(
@@ -650,7 +1148,12 @@ def main(
    resume: bool = False,
    verbose: bool = False,
    list_distributions: bool = False,
-    ephemeral_system_prompt: str = None
+    ephemeral_system_prompt: str = None,
    log_prefix_chars: int = 100,
    max_tool_failures: int = 10,
    max_tool_failure_rate: float = 0.5,
    keep_recent_errors: int = 5,
    min_tool_calls_for_rate: int = 10,
 ):
    """
    Run batch processing of agent prompts from a dataset.
@@ -669,6 +1172,11 @@ def main(
        verbose (bool): Enable verbose logging (default: False)
        list_distributions (bool): List available toolset distributions and exit
        ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
        log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 20)
        max_tool_failures (int): Maximum number of tool failures before stopping (default: 10)
        max_tool_failure_rate (float): Maximum tool failure rate (0.0-1.0) before stopping (default: 0.5)
        keep_recent_errors (int): Number of recent errors to keep per tool for reporting (default: 5)
        min_tool_calls_for_rate (int): Minimum number of tool calls before checking failure rate (default: 10)
    Examples:
        # Basic usage
@@ -684,6 +1192,10 @@ def main(
        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
                               --ephemeral_system_prompt="You are a helpful assistant focused on image generation."
        # With custom tool failure thresholds
        python batch_runner.py --dataset_file=data.jsonl --batch_size=10 --run_name=my_run \\
                               --max_tool_failures=20 --max_tool_failure_rate=0.3 --min_tool_calls_for_rate=10 --keep_recent_errors=10
        # List available distributions
        python batch_runner.py --list_distributions
    """
@@ -729,7 +1241,12 @@ def main(
            model=model,
            num_workers=num_workers,
            verbose=verbose,
-            ephemeral_system_prompt=ephemeral_system_prompt
+            ephemeral_system_prompt=ephemeral_system_prompt,
            log_prefix_chars=log_prefix_chars,
            max_tool_failures=max_tool_failures,
            max_tool_failure_rate=max_tool_failure_rate,
            keep_recent_errors=keep_recent_errors,
            min_tool_calls_for_rate=min_tool_calls_for_rate
        )
        runner.run(resume=resume)
@@ -743,4 +1260,3 @@ def main(
 if __name__ == "__main__":
    fire.Fire(main)
--- a/model_tools.py
+++ b/model_tools.py
@@ -31,7 +31,9 @@ import asyncio
 from typing import Dict, Any, List, Optional
 from tools.web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key
-from tools.terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION
+from tools.simple_terminal_tool import simple_terminal_tool, check_requirements as check_simple_terminal_requirements, SIMPLE_TERMINAL_TOOL_DESCRIPTION
 # Keep old terminal tool for backwards compatibility if needed
 # from tools.terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION
 from tools.vision_tools import vision_analyze_tool, check_vision_requirements
 from tools.mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements
 from tools.image_generation_tool import image_generate_tool, check_image_generation_requirements
@@ -120,7 +122,7 @@ def get_terminal_tool_definitions() -> List[Dict[str, Any]]:
            "type": "function",
            "function": {
                "name": "terminal",
-                "description": TERMINAL_TOOL_DESCRIPTION,
+                "description": SIMPLE_TERMINAL_TOOL_DESCRIPTION,
                "parameters": {
                    "type": "object",
                    "properties": {
@@ -128,28 +130,18 @@ def get_terminal_tool_definitions() -> List[Dict[str, Any]]:
                            "type": "string",
                            "description": "The command to execute on the VM"
                        },
                        "input_keys": {
                            "type": "string",
                            "description": "Keystrokes to send to the most recent interactive session (e.g., 'hello\\n' for typing hello + Enter). If no active session exists, this will be ignored."
                        },
                        "background": {
                            "type": "boolean",
                            "description": "Whether to run the command in the background (default: false)",
                            "default": False
                        },
                        "idle_threshold": {
                            "type": "number",
                            "description": "Seconds to wait for output before considering session idle (default: 5.0)",
                            "default": 5.0,
                            "minimum": 0.1
                        },
                        "timeout": {
                            "type": "integer",
                            "description": "Command timeout in seconds (optional)",
                            "minimum": 1
                        }
                    },
-                    "required": []
+                    "required": ["command"]
                }
            }
        }
@@ -264,7 +256,7 @@ def get_all_tool_names() -> List[str]:
        tool_names.extend(["web_search", "web_extract", "web_crawl"])
    # Terminal tools
-    if check_hecate_requirements():
+    if check_simple_terminal_requirements():
        tool_names.extend(["terminal"])
    # Vision tools
@@ -347,7 +339,7 @@ def get_tool_definitions(
        for tool in get_web_tool_definitions():
            all_available_tools_map[tool["function"]["name"]] = tool
-    if check_hecate_requirements():
+    if check_simple_terminal_requirements():
        for tool in get_terminal_tool_definitions():
            all_available_tools_map[tool["function"]["name"]] = tool
@@ -494,12 +486,10 @@ def handle_terminal_function_call(function_name: str, function_args: Dict[str, A
    """
    if function_name == "terminal":
        command = function_args.get("command")
        input_keys = function_args.get("input_keys")
        background = function_args.get("background", False)
        idle_threshold = function_args.get("idle_threshold", 5.0)
        timeout = function_args.get("timeout")
-        return terminal_tool(command, input_keys, None, background, idle_threshold, timeout, task_id)
+        return simple_terminal_tool(command=command, background=background, timeout=timeout, task_id=task_id)
    else:
        return json.dumps({"error": f"Unknown terminal function: {function_name}"}, ensure_ascii=False)
@@ -681,10 +671,10 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
            "requirements": ["FIRECRAWL_API_KEY environment variable"]
        },
        "terminal_tools": {
-            "available": check_hecate_requirements(),
+            "available": check_simple_terminal_requirements(),
-            "tools": ["terminal_tool"],
+            "tools": ["simple_terminal_tool"],
-            "description": "Execute commands with optional interactive session support on Linux VMs",
+            "description": "Execute commands on secure Linux VMs without session persistence",
-            "requirements": ["MORPH_API_KEY environment variable", "hecate package"]
+            "requirements": ["MORPH_API_KEY environment variable"]
        },
        "vision_tools": {
            "available": check_vision_requirements(),
@@ -717,7 +707,7 @@ def check_toolset_requirements() -> Dict[str, bool]:
    """
    return {
        "web_tools": check_firecrawl_api_key(),
-        "terminal_tools": check_hecate_requirements(),
+        "terminal_tools": check_simple_terminal_requirements(),
        "vision_tools": check_vision_requirements(),
        "moa_tools": check_moa_requirements(),
        "image_tools": check_image_generation_requirements()
--- a/profiling.py
+++ b/profiling.py
@@ -0,0 +1,381 @@
 """
 Profiling module for tracking timing statistics of tools and LLM API calls.
 This module provides a centralized way to track timing information for various
 operations in the agent system, including:
 - Individual tool executions
 - OpenAI API calls
 - Aggregate statistics (min, max, median, mean, total)
 """
 import time
 from typing import Dict, List, Optional
 from dataclasses import dataclass, field
 from collections import defaultdict
 import statistics
@dataclass
 class ProfilingStats:
    """Statistics for a particular operation type."""
    call_count: int = 0
    total_time: float = 0.0
    min_time: float = float('inf')
    max_time: float = 0.0
    times: List[float] = field(default_factory=list)
    def add_timing(self, duration: float):
        """Add a timing measurement."""
        self.call_count += 1
        self.total_time += duration
        self.min_time = min(self.min_time, duration)
        self.max_time = max(self.max_time, duration)
        self.times.append(duration)
    @property
    def mean_time(self) -> float:
        """Calculate mean time."""
        return self.total_time / self.call_count if self.call_count > 0 else 0.0
    @property
    def median_time(self) -> float:
        """Calculate median time."""
        return statistics.median(self.times) if self.times else 0.0
    def to_dict(self) -> Dict:
        """Convert to dictionary for serialization."""
        return {
            "call_count": self.call_count,
            "total_time": self.total_time,
            "min_time": self.min_time if self.min_time != float('inf') else 0.0,
            "max_time": self.max_time,
            "mean_time": self.mean_time,
            "median_time": self.median_time
        }
 class Profiler:
    """
    Global profiler for tracking timing statistics across tools and API calls.
    Usage:
        profiler = Profiler()
        # Time a tool execution
        with profiler.time_tool("web_search"):
            # ... tool execution code ...
            pass
        # Time an API call
        with profiler.time_api_call():
            # ... API call code ...
            pass
        # Get statistics
        stats = profiler.get_statistics()
    """
    def __init__(self):
        """Initialize the profiler."""
        self.tool_stats: Dict[str, ProfilingStats] = defaultdict(ProfilingStats)
        self.api_stats: ProfilingStats = ProfilingStats()
        self._enabled = True
    def enable(self):
        """Enable profiling."""
        self._enabled = True
    def disable(self):
        """Disable profiling."""
        self._enabled = False
    def reset(self):
        """Reset all profiling data."""
        self.tool_stats.clear()
        self.api_stats = ProfilingStats()
    def record_tool_timing(self, tool_name: str, duration: float):
        """Record timing for a tool execution."""
        if self._enabled:
            self.tool_stats[tool_name].add_timing(duration)
    def record_api_timing(self, duration: float):
        """Record timing for an API call."""
        if self._enabled:
            self.api_stats.add_timing(duration)
    def get_statistics(self) -> Dict:
        """
        Get all profiling statistics.
        Returns:
            Dictionary containing tool and API statistics
        """
        return {
            "tools": {
                tool_name: stats.to_dict()
                for tool_name, stats in sorted(self.tool_stats.items())
            },
            "api_calls": self.api_stats.to_dict()
        }
    def print_statistics(self, detailed: bool = True):
        """
        Print profiling statistics in a readable format.
        Args:
            detailed: If True, show per-tool breakdown. If False, show summary only.
        """
        print("\n" + "="*80)
        print("📊 PROFILING STATISTICS")
        print("="*80)
        # API Call Statistics
        print("\n🔷 OpenAI API Calls:")
        if self.api_stats.call_count > 0:
            api_dict = self.api_stats.to_dict()
            print(f"  Total Calls:  {api_dict['call_count']}")
            print(f"  Total Time:   {api_dict['total_time']:.2f}s")
            print(f"  Min Time:     {api_dict['min_time']:.2f}s")
            print(f"  Max Time:     {api_dict['max_time']:.2f}s")
            print(f"  Mean Time:    {api_dict['mean_time']:.2f}s")
            print(f"  Median Time:  {api_dict['median_time']:.2f}s")
        else:
            print("  No API calls recorded")
        # Tool Statistics
        print("\n🔧 Tool Executions:")
        if self.tool_stats:
            if detailed:
                for tool_name in sorted(self.tool_stats.keys()):
                    stats_dict = self.tool_stats[tool_name].to_dict()
                    print(f"\n  📌 {tool_name}:")
                    print(f"     Total Calls:  {stats_dict['call_count']}")
                    print(f"     Total Time:   {stats_dict['total_time']:.2f}s")
                    print(f"     Min Time:     {stats_dict['min_time']:.2f}s")
                    print(f"     Max Time:     {stats_dict['max_time']:.2f}s")
                    print(f"     Mean Time:    {stats_dict['mean_time']:.2f}s")
                    print(f"     Median Time:  {stats_dict['median_time']:.2f}s")
            # Summary
            total_tool_calls = sum(s.call_count for s in self.tool_stats.values())
            total_tool_time = sum(s.total_time for s in self.tool_stats.values())
            print(f"\n  📊 Summary:")
            print(f"     Total Tool Calls:  {total_tool_calls}")
            print(f"     Total Tool Time:   {total_tool_time:.2f}s")
            print(f"     Unique Tools Used: {len(self.tool_stats)}")
        else:
            print("  No tool executions recorded")
        # Overall Summary
        total_api_time = self.api_stats.total_time
        total_tool_time = sum(s.total_time for s in self.tool_stats.values())
        print(f"\n📈 Overall Summary:")
        print(f"  Total API Time:   {total_api_time:.2f}s")
        print(f"  Total Tool Time:  {total_tool_time:.2f}s")
        print(f"  Total Time:       {total_api_time + total_tool_time:.2f}s")
        print("="*80 + "\n")
    def export_to_json(self) -> str:
        """Export statistics as JSON string."""
        import json
        return json.dumps(self.get_statistics(), indent=2)
    def export_to_file(self, filepath: str):
        """
        Export statistics to a JSON file.
        Args:
            filepath: Path to output file
        """
        import json
        with open(filepath, 'w') as f:
            json.dump(self.get_statistics(), f, indent=2)
        print(f"📁 Profiling statistics exported to: {filepath}")
 # Global profiler instance
 _global_profiler: Optional[Profiler] = None
 def get_profiler() -> Profiler:
    """Get or create the global profiler instance."""
    global _global_profiler
    if _global_profiler is None:
        _global_profiler = Profiler()
    return _global_profiler
 def reset_profiler():
    """Reset the global profiler."""
    global _global_profiler
    if _global_profiler is not None:
        _global_profiler.reset()
 class TimingContext:
    """Context manager for timing operations."""
    def __init__(self, profiler: Profiler, operation_type: str, operation_name: Optional[str] = None):
        """
        Initialize timing context.
        Args:
            profiler: Profiler instance to record timing
            operation_type: 'tool' or 'api'
            operation_name: Name of the operation (required for tools)
        """
        self.profiler = profiler
        self.operation_type = operation_type
        self.operation_name = operation_name
        self.start_time = None
    def __enter__(self):
        """Start timing."""
        self.start_time = time.time()
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop timing and record."""
        duration = time.time() - self.start_time
        if self.operation_type == 'tool':
            self.profiler.record_tool_timing(self.operation_name, duration)
        elif self.operation_type == 'api':
            self.profiler.record_api_timing(duration)
        return False  # Don't suppress exceptions
 def aggregate_profiling_stats(stats_list: List[Dict]) -> Dict:
    """
    Aggregate multiple profiling statistics dictionaries into one.
    This is useful for batch processing where each worker process has its own
    profiler instance that needs to be combined.
    Args:
        stats_list: List of statistics dictionaries from get_statistics()
    Returns:
        Dict: Aggregated statistics with combined tool and API call data
    """
    aggregated = {
        "tools": defaultdict(lambda: {"times": []}),
        "api_calls": {"times": []}
    }
    # Aggregate tool statistics
    for stats in stats_list:
        # Aggregate tool timings
        for tool_name, tool_stats in stats.get("tools", {}).items():
            # Reconstruct individual timings from aggregated stats
            # Since we have mean_time and call_count, we approximate
            aggregated["tools"][tool_name]["times"].extend(
                [tool_stats.get("mean_time", 0.0)] * tool_stats.get("call_count", 0)
            )
        # Aggregate API call timings
        api_stats = stats.get("api_calls", {})
        if api_stats.get("call_count", 0) > 0:
            aggregated["api_calls"]["times"].extend(
                [api_stats.get("mean_time", 0.0)] * api_stats.get("call_count", 0)
            )
    # Calculate final statistics for tools
    final_stats = {"tools": {}, "api_calls": {}}
    for tool_name, data in aggregated["tools"].items():
        times = data["times"]
        if times:
            final_stats["tools"][tool_name] = {
                "call_count": len(times),
                "total_time": sum(times),
                "min_time": min(times),
                "max_time": max(times),
                "mean_time": statistics.mean(times),
                "median_time": statistics.median(times)
            }
    # Calculate final statistics for API calls
    api_times = aggregated["api_calls"]["times"]
    if api_times:
        final_stats["api_calls"] = {
            "call_count": len(api_times),
            "total_time": sum(api_times),
            "min_time": min(api_times),
            "max_time": max(api_times),
            "mean_time": statistics.mean(api_times),
            "median_time": statistics.median(api_times)
        }
    else:
        final_stats["api_calls"] = {
            "call_count": 0,
            "total_time": 0.0,
            "min_time": 0.0,
            "max_time": 0.0,
            "mean_time": 0.0,
            "median_time": 0.0
        }
    return final_stats
 def print_aggregated_statistics(stats: Dict, detailed: bool = True):
    """
    Print aggregated profiling statistics in a readable format.
    Args:
        stats: Aggregated statistics dictionary from aggregate_profiling_stats()
        detailed: If True, show per-tool breakdown. If False, show summary only.
    """
    print("\n" + "="*80)
    print("📊 AGGREGATED PROFILING STATISTICS")
    print("="*80)
    # API Call Statistics
    print("\n🔷 OpenAI API Calls:")
    api_stats = stats.get("api_calls", {})
    if api_stats.get("call_count", 0) > 0:
        print(f"  Total Calls:  {api_stats['call_count']}")
        print(f"  Total Time:   {api_stats['total_time']:.2f}s")
        print(f"  Min Time:     {api_stats['min_time']:.2f}s")
        print(f"  Max Time:     {api_stats['max_time']:.2f}s")
        print(f"  Mean Time:    {api_stats['mean_time']:.2f}s")
        print(f"  Median Time:  {api_stats['median_time']:.2f}s")
    else:
        print("  No API calls recorded")
    # Tool Statistics
    print("\n🔧 Tool Executions:")
    tool_stats = stats.get("tools", {})
    if tool_stats:
        if detailed:
            for tool_name in sorted(tool_stats.keys()):
                stats_dict = tool_stats[tool_name]
                print(f"\n  📌 {tool_name}:")
                print(f"     Total Calls:  {stats_dict['call_count']}")
                print(f"     Total Time:   {stats_dict['total_time']:.2f}s")
                print(f"     Min Time:     {stats_dict['min_time']:.2f}s")
                print(f"     Max Time:     {stats_dict['max_time']:.2f}s")
                print(f"     Mean Time:    {stats_dict['mean_time']:.2f}s")
                print(f"     Median Time:  {stats_dict['median_time']:.2f}s")
        # Summary
        total_tool_calls = sum(s["call_count"] for s in tool_stats.values())
        total_tool_time = sum(s["total_time"] for s in tool_stats.values())
        print(f"\n  📊 Summary:")
        print(f"     Total Tool Calls:  {total_tool_calls}")
        print(f"     Total Tool Time:   {total_tool_time:.2f}s")
        print(f"     Unique Tools Used: {len(tool_stats)}")
    else:
        print("  No tool executions recorded")
    # Overall Summary
    total_api_time = api_stats.get("total_time", 0.0)
    total_tool_time = sum(s["total_time"] for s in tool_stats.values())
    print(f"\n📈 Overall Summary:")
    print(f"  Total API Time:   {total_api_time:.2f}s")
    print(f"  Total Tool Time:  {total_tool_time:.2f}s")
    print(f"  Total Time:       {total_api_time + total_tool_time:.2f}s")
    print("="*80 + "\n")
--- a/run_agent.py
+++ b/run_agent.py
@@ -45,6 +45,9 @@ else:
 from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
 from tools.terminal_tool import cleanup_vm
 # Import profiling
 from profiling import get_profiler
 class AIAgent:
    """
@@ -65,7 +68,8 @@ class AIAgent:
        disabled_toolsets: List[str] = None,
        save_trajectories: bool = False,
        verbose_logging: bool = False,
-        ephemeral_system_prompt: str = None
+        ephemeral_system_prompt: str = None,
        log_prefix_chars: int = 100,
    ):
        """
        Initialize the AI Agent.
@@ -81,6 +85,7 @@ class AIAgent:
            save_trajectories (bool): Whether to save conversation trajectories to JSONL files (default: False)
            verbose_logging (bool): Enable verbose logging for debugging (default: False)
            ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
            log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 20)
        """
        self.model = model
        self.max_iterations = max_iterations
@@ -88,6 +93,7 @@ class AIAgent:
        self.save_trajectories = save_trajectories
        self.verbose_logging = verbose_logging
        self.ephemeral_system_prompt = ephemeral_system_prompt
        self.log_prefix_chars = log_prefix_chars
        # Store toolset filtering options
        self.enabled_toolsets = enabled_toolsets
@@ -361,6 +367,10 @@ class AIAgent:
        Returns:
            Dict: Complete conversation result with final response and message history
        """
        # Reset profiler for this conversation to get fresh stats
        from profiling import reset_profiler as reset_prof
        reset_prof()
        # Generate unique task_id if not provided to isolate VMs between concurrent tasks
        import uuid
        effective_task_id = task_id or str(uuid.uuid4())
@@ -385,7 +395,7 @@ class AIAgent:
        while api_call_count < self.max_iterations:
            api_call_count += 1
-            print(f"\n🔄 Making API call #{api_call_count}...")
+            print(f"\n🔄 Making OpenAI-compatible API call #{api_call_count}...")
            # Log request details if verbose
            if self.verbose_logging:
@@ -394,7 +404,7 @@ class AIAgent:
            api_start_time = time.time()
            retry_count = 0
-            max_retries = 3
+            max_retries = 6  # Increased to allow longer backoff periods
            while retry_count <= max_retries:
                try:
@@ -410,11 +420,14 @@ class AIAgent:
                        model=self.model,
                        messages=api_messages,
                        tools=self.tools if self.tools else None,
-                        timeout=60.0  # Add explicit timeout
+                        timeout=300.0  # 5 minute timeout for long-running agent tasks
                    )
                    api_duration = time.time() - api_start_time
-                    print(f"⏱️  API call completed in {api_duration:.2f}s")
+                    print(f"⏱️  OpenAI-compatible API call completed in {api_duration:.2f}s")
                    # Record API timing in profiler
                    get_profiler().record_api_timing(api_duration)
                    if self.verbose_logging:
                        logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
@@ -426,8 +439,8 @@ class AIAgent:
                    if retry_count > max_retries:
                        raise api_error
-                    wait_time = min(2 ** retry_count, 10)  # Exponential backoff, max 10s
+                    wait_time = min(2 ** retry_count, 60)  # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
-                    print(f"⚠️  API call failed (attempt {retry_count}/{max_retries}): {str(api_error)[:100]}")
+                    print(f"⚠️  OpenAI-compatible API call failed (attempt {retry_count}/{max_retries}): {str(api_error)[:100]}")
                    print(f"⏳ Retrying in {wait_time}s...")
                    logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}")
                    time.sleep(wait_time)
@@ -474,7 +487,10 @@ class AIAgent:
                            print(f"❌ Invalid JSON in tool call arguments: {e}")
                            function_args = {}
-                        print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
+                        # Preview tool call arguments
                        args_str = json.dumps(function_args, ensure_ascii=False)
                        args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
                        print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
                        tool_start_time = time.time()
@@ -484,6 +500,9 @@ class AIAgent:
                        tool_duration = time.time() - tool_start_time
                        result_preview = function_result[:200] if len(function_result) > 200 else function_result
                        # Record tool timing in profiler
                        get_profiler().record_tool_timing(function_name, tool_duration)
                        if self.verbose_logging:
                            logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
                            logging.debug(f"Tool result preview: {result_preview}...")
@@ -495,7 +514,9 @@ class AIAgent:
                            "tool_call_id": tool_call.id
                        })
-                        print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
+                        # Preview tool response
                        response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
                        print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")
                        # Delay between tool calls
                        if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
@@ -514,11 +535,11 @@ class AIAgent:
                        "content": final_response
                    })
-                    print(f"🎉 Conversation completed after {api_call_count} API call(s)")
+                    print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
                    break
            except Exception as e:
-                error_msg = f"Error during API call #{api_call_count}: {str(e)}"
+                error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}"
                print(f"❌ {error_msg}")
                if self.verbose_logging:
@@ -554,11 +575,15 @@ class AIAgent:
            if self.verbose_logging:
                logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}")
        # Get profiling statistics for this conversation
        profiling_stats = get_profiler().get_statistics()
        return {
            "final_response": final_response,
            "messages": messages,
            "api_calls": api_call_count,
-            "completed": completed
+            "completed": completed,
            "profiling_stats": profiling_stats
        }
    def chat(self, message: str) -> str:
@@ -585,7 +610,9 @@ def main(
    disabled_toolsets: str = None,
    list_tools: bool = False,
    save_trajectories: bool = False,
-    verbose: bool = False
+    verbose: bool = False,
    log_prefix_chars: int = 20,
    show_profiling: bool = True
 ):
    """
    Main function for running the agent directly.
@@ -603,6 +630,8 @@ def main(
        list_tools (bool): Just list available tools and exit
        save_trajectories (bool): Save conversation trajectories to JSONL files. Defaults to False.
        verbose (bool): Enable verbose logging for debugging. Defaults to False.
        log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20.
        show_profiling (bool): Display profiling statistics after conversation. Defaults to True.
    Toolset Examples:
        - "research": Web search, extract, crawl + vision tools
@@ -720,7 +749,8 @@ def main(
            enabled_toolsets=enabled_toolsets_list,
            disabled_toolsets=disabled_toolsets_list,
            save_trajectories=save_trajectories,
-            verbose_logging=verbose
+            verbose_logging=verbose,
            log_prefix_chars=log_prefix_chars
        )
    except RuntimeError as e:
        print(f"❌ Failed to initialize agent: {e}")
@@ -753,6 +783,10 @@ def main(
        print("-" * 30)
        print(result['final_response'])
    # Display profiling statistics if enabled
    if show_profiling:
        get_profiler().print_statistics(detailed=True)
    print("\n👋 Agent execution completed!")
--- a/safe_print.py
+++ b/safe_print.py
@@ -0,0 +1,20 @@
 #!/usr/bin/env python3
 """Simple safe print that tries rich, falls back to regular print."""
 try:
    from rich import print as rich_print
    RICH_AVAILABLE = True
 except ImportError:
    RICH_AVAILABLE = False
 def safe_print(*args, **kwargs):
    """Try rich.print, fall back to regular print if it fails."""
    if RICH_AVAILABLE:
        try:
            rich_print(*args, **kwargs)
            return
        except Exception:
            pass
    # Fallback to regular print
    print(*args, **kwargs)
--- a/tools/mixture_of_agents_tool.py
+++ b/tools/mixture_of_agents_tool.py
@@ -165,7 +165,7 @@ async def _run_reference_model_safe(
    user_prompt: str,
    temperature: float = REFERENCE_TEMPERATURE,
    max_tokens: int = 32000,
-    max_retries: int = 3
+    max_retries: int = 6
 ) -> tuple[str, str, bool]:
    """
    Run a single reference model with retry logic and graceful failure handling.
@@ -212,8 +212,8 @@ async def _run_reference_model_safe(
                print(f"⚠️  {model} unknown error (attempt {attempt + 1}): {error_str}")
            if attempt < max_retries - 1:
-                # Exponential backoff for rate limiting
+                # Exponential backoff for rate limiting: 2s, 4s, 8s, 16s, 32s, 60s
-                sleep_time = 2 ** attempt
+                sleep_time = min(2 ** (attempt + 1), 60)
                print(f"   Retrying in {sleep_time}s...")
                await asyncio.sleep(sleep_time)
            else:
--- a/tools/simple_terminal_tool.py
+++ b/tools/simple_terminal_tool.py
@@ -0,0 +1,395 @@
 #!/usr/bin/env python3
 """
 Simple Terminal Tool Module
 A simplified terminal tool that executes commands on MorphCloud VMs without tmux.
 No session persistence, no interactive app support - just simple command execution.
 Features:
 - Direct SSH command execution
 - Background task support
 - VM lifecycle management with TTL
 - Automatic cleanup after inactivity
 Usage:
    from simple_terminal_tool import simple_terminal_tool
    # Execute a simple command
    result = simple_terminal_tool("ls -la")
    # Execute in background
    result = simple_terminal_tool("python server.py", background=True)
 """
 import json
 import os
 import time
 import threading
 import atexit
 from typing import Optional, Dict, Any
 # Tool description for LLM
 SIMPLE_TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux VM environment.
 **Environment:**
 - Minimal Debian-based OS with internet access
 - Automatic VM lifecycle management (creates on-demand, reuses, cleans up)
 - Filesystem is persisted between tool calls but environment variables, venvs, etc are reset.
 **Command Execution:**
 - Simple commands: Just provide the 'command' parameter
 - Background processes: Set 'background': True for servers/long-running tasks
 - Command timeout: Optional 'timeout' parameter in seconds
 **Examples:**
 - Run command: `{"command": "ls -la"}`
 - Background task: `{"command": "source path/to/my/venv/bin/activate && python server.py", "background": True}`
 - With timeout: `{"command": "long_task.sh", "timeout": 300}`
 **Best Practices:**
 - Run servers/long processes in background
 - Monitor disk usage for large tasks
 - Install whatever tools you need with sudo apt-get
 - Do not be afraid to run pip with --break-system-packages
 **Things to avoid**
 - Do NOT use interactive tools such as tmux, vim, nano, python repl - you will get stuck. Even git sometimes becomes interactive if the output is large. If you're not sure pipe to cat.
 """
 # Global state for VM lifecycle management
 _active_instances: Dict[str, Any] = {}
 _last_activity: Dict[str, float] = {}
 _instance_lock = threading.Lock()
 _cleanup_thread = None
 _cleanup_running = False
 def _cleanup_inactive_vms(vm_lifetime_seconds: int = 300):
    """Clean up VMs that have been inactive for longer than vm_lifetime_seconds."""
    global _active_instances, _last_activity
    current_time = time.time()
    tasks_to_cleanup = []
    with _instance_lock:
        for task_id, last_time in list(_last_activity.items()):
            if current_time - last_time > vm_lifetime_seconds:
                tasks_to_cleanup.append(task_id)
        for task_id in tasks_to_cleanup:
            try:
                if task_id in _active_instances:
                    instance = _active_instances[task_id]
                    if hasattr(instance, 'terminate'):
                        instance.terminate()
                    elif hasattr(instance, 'stop'):
                        instance.stop()
                    elif hasattr(instance, 'delete'):
                        instance.delete()
                    del _active_instances[task_id]
                    print(f"[VM Cleanup] Terminated inactive VM for task: {task_id}")
                if task_id in _last_activity:
                    del _last_activity[task_id]
            except Exception as e:
                # 404 errors are benign - VM already cleaned up by TTL
                error_str = str(e)
                if "404" in error_str or "InstanceNotFoundError" in error_str or "not found" in error_str.lower():
                    print(f"[VM Cleanup] VM for task {task_id} already cleaned up (likely TTL expiration)")
                else:
                    print(f"[VM Cleanup] Error cleaning up VM for task {task_id}: {e}")
 def _cleanup_thread_worker():
    """Background thread worker that periodically cleans up inactive VMs."""
    global _cleanup_running
    while _cleanup_running:
        try:
            vm_lifetime = int(os.getenv("HECATE_VM_LIFETIME_SECONDS", "300"))
            _cleanup_inactive_vms(vm_lifetime)
        except Exception as e:
            print(f"[VM Cleanup] Error in cleanup thread: {e}")
        for _ in range(60):
            if not _cleanup_running:
                break
            time.sleep(1)
 def _start_cleanup_thread():
    """Start the background cleanup thread if not already running."""
    global _cleanup_thread, _cleanup_running
    with _instance_lock:
        if _cleanup_thread is None or not _cleanup_thread.is_alive():
            _cleanup_running = True
            _cleanup_thread = threading.Thread(target=_cleanup_thread_worker, daemon=True)
            _cleanup_thread.start()
 def _stop_cleanup_thread():
    """Stop the background cleanup thread."""
    global _cleanup_running
    _cleanup_running = False
    if _cleanup_thread is not None:
        _cleanup_thread.join(timeout=5)
 def cleanup_vm(task_id: str):
    """Manually clean up a specific VM by task_id."""
    global _active_instances, _last_activity
    with _instance_lock:
        try:
            if task_id in _active_instances:
                instance = _active_instances[task_id]
                if hasattr(instance, 'terminate'):
                    instance.terminate()
                elif hasattr(instance, 'stop'):
                    instance.stop()
                elif hasattr(instance, 'delete'):
                    instance.delete()
                del _active_instances[task_id]
                print(f"[VM Cleanup] Manually terminated VM for task: {task_id}")
            if task_id in _last_activity:
                del _last_activity[task_id]
        except Exception as e:
            # 404 errors are benign - VM already cleaned up by TTL
            error_str = str(e)
            if "404" in error_str or "InstanceNotFoundError" in error_str or "not found" in error_str.lower():
                print(f"[VM Cleanup] VM for task {task_id} already cleaned up (likely TTL expiration)")
            else:
                print(f"[VM Cleanup] Error manually cleaning up VM for task {task_id}: {e}")
 atexit.register(_stop_cleanup_thread)
 def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None) -> Dict[str, Any]:
    """
    Execute a command via SSH on the VM instance.
    Args:
        instance: MorphVM instance
        command: Command to execute
        timeout: Optional timeout in seconds
    Returns:
        dict with stdout, stderr, returncode
    """
    ssh_context_manager = None
    try:
        # Use the instance's SSH context manager
        ssh_context_manager = instance.ssh()
        ssh_context = ssh_context_manager.__enter__()
        # Execute the command
        result = ssh_context.run(command, get_pty=False, timeout=timeout or 120)
        # Close the SSH connection
        if ssh_context_manager:
            try:
                ssh_context_manager.__exit__(None, None, None)
            except:
                pass
        return {
            "stdout": result.stdout or "",
            "stderr": result.stderr or "",
            "returncode": result.returncode
        }
    except Exception as e:
        # Close connection on error
        if ssh_context_manager:
            try:
                ssh_context_manager.__exit__(None, None, None)
            except:
                pass
        # Check if it's a timeout
        error_str = str(e).lower()
        if "timeout" in error_str:
            return {
                "stdout": "",
                "stderr": f"Command timed out after {timeout or 120} seconds",
                "returncode": 124
            }
        return {
            "stdout": "",
            "stderr": f"SSH execution failed: {str(e)}",
            "returncode": -1
        }
 def simple_terminal_tool(
    command: str,
    background: bool = False,
    timeout: Optional[int] = None,
    task_id: Optional[str] = None
 ) -> str:
    """
    Execute a command on a MorphCloud VM without session persistence.
    Args:
        command: The command to execute
        background: Whether to run in background (default: False)
        timeout: Command timeout in seconds (default: 120)
        task_id: Unique identifier for VM isolation (optional)
    Returns:
        str: JSON string with output, exit_code, and error fields
    Examples:
        # Execute a simple command
        >>> result = simple_terminal_tool(command="ls -la /tmp")
        # Run a background task
        >>> result = simple_terminal_tool(command="python server.py", background=True)
        # With custom timeout
        >>> result = simple_terminal_tool(command="long_task.sh", timeout=300)
    """
    global _active_instances, _last_activity
    try:
        # Import required modules
        try:
            from morphcloud.api import MorphCloudClient
        except ImportError as import_error:
            return json.dumps({
                "output": "",
                "exit_code": -1,
                "error": f"Terminal tool disabled: {import_error}",
                "status": "disabled"
            }, ensure_ascii=False)
        # Get configuration
        vm_ttl_seconds = int(os.getenv("HECATE_VM_TTL_SECONDS", "1200"))
        snapshot_id = os.getenv("HECATE_DEFAULT_SNAPSHOT_ID", "snapshot_defv9tjg")
        # Check API key
        morph_api_key = os.getenv("MORPH_API_KEY")
        if not morph_api_key:
            return json.dumps({
                "output": "",
                "exit_code": -1,
                "error": "MORPH_API_KEY environment variable not set",
                "status": "disabled"
            }, ensure_ascii=False)
        # Use task_id for VM isolation
        effective_task_id = task_id or "default"
        # Start cleanup thread
        _start_cleanup_thread()
        # Get or create VM instance
        with _instance_lock:
            if effective_task_id not in _active_instances:
                morph_client = MorphCloudClient(api_key=morph_api_key)
                _active_instances[effective_task_id] = morph_client.instances.start(
                    snapshot_id=snapshot_id,
                    ttl_seconds=vm_ttl_seconds,
                    ttl_action="stop"
                )
            # Update last activity time
            _last_activity[effective_task_id] = time.time()
            instance = _active_instances[effective_task_id]
        # Wait for instance to be ready
        instance.wait_until_ready()
        # Prepare command for execution
        if background:
            # Run in background with nohup and redirect output
            exec_command = f"nohup {command} > /tmp/bg_output.log 2>&1 &"
            result = _execute_ssh_command(instance, exec_command, timeout=10)
            # For background tasks, return immediately with info
            if result["returncode"] == 0:
                return json.dumps({
                    "output": "Background task started successfully",
                    "exit_code": 0,
                    "error": None
                }, ensure_ascii=False)
            else:
                return json.dumps({
                    "output": result["stdout"],
                    "exit_code": result["returncode"],
                    "error": result["stderr"]
                }, ensure_ascii=False)
        else:
            # Run foreground command
            result = _execute_ssh_command(instance, command, timeout=timeout)
            # Combine stdout and stderr for output
            output = result["stdout"]
            if result["stderr"] and result["returncode"] != 0:
                output = f"{output}\n{result['stderr']}" if output else result["stderr"]
            return json.dumps({
                "output": output.strip(),
                "exit_code": result["returncode"],
                "error": result["stderr"] if result["returncode"] != 0 else None
            }, ensure_ascii=False)
    except Exception as e:
        return json.dumps({
            "output": "",
            "exit_code": -1,
            "error": f"Failed to execute command: {str(e)}",
            "status": "error"
        }, ensure_ascii=False)
 def check_requirements() -> bool:
    """Check if all requirements for the simple terminal tool are met."""
    required_vars = ["MORPH_API_KEY"]
    missing_required = [var for var in required_vars if not os.getenv(var)]
    if missing_required:
        print(f"Missing required environment variables: {', '.join(missing_required)}")
        return False
    try:
        from morphcloud.api import MorphCloudClient
        return True
    except Exception as e:
        print(f"MorphCloud not available: {e}")
        return False
 if __name__ == "__main__":
    """Simple test when run directly."""
    print("Simple Terminal Tool Module")
    print("=" * 40)
    if not check_requirements():
        print("Requirements not met. Please check the messages above.")
        exit(1)
    print("All requirements met!")
    print("\nAvailable Tool:")
    print("  - simple_terminal_tool: Execute commands without session persistence")
    print("\nUsage Examples:")
    print("  # Execute a command")
    print("  result = simple_terminal_tool(command='ls -la')")
    print("  ")
    print("  # Run a background task")
    print("  result = simple_terminal_tool(command='python server.py', background=True)")
    print("\nEnvironment Variables:")
    print(f"  MORPH_API_KEY: {'Set' if os.getenv('MORPH_API_KEY') else 'Not set'}")
    print(f"  HECATE_VM_TTL_SECONDS: {os.getenv('HECATE_VM_TTL_SECONDS', '1200')} (default: 1200 / 20 minutes)")
    print(f"  HECATE_VM_LIFETIME_SECONDS: {os.getenv('HECATE_VM_LIFETIME_SECONDS', '300')} (default: 300 / 5 minutes)")
    print(f"  HECATE_DEFAULT_SNAPSHOT_ID: {os.getenv('HECATE_DEFAULT_SNAPSHOT_ID', 'snapshot_defv9tjg')}")
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -184,7 +184,7 @@ Your goal is to preserve ALL important information while reducing length. Never
 Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
        # Call the LLM asynchronously with retry logic for flaky API
-        max_retries = 3
+        max_retries = 6
        retry_delay = 2  # Start with 2 seconds
        last_error = None
@@ -206,7 +206,7 @@ Create a markdown summary that captures all key information in a well-organized,
                    print(f"⚠️  LLM API call failed (attempt {attempt + 1}/{max_retries}): {str(api_error)[:100]}")
                    print(f"   Retrying in {retry_delay}s...")
                    await asyncio.sleep(retry_delay)
-                    retry_delay *= 2  # Exponential backoff: 2s, 4s, 8s
+                    retry_delay = min(retry_delay * 2, 60)  # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s
                else:
                    # All retries exhausted
                    raise last_error
--- a/toolset_distributions.py
+++ b/toolset_distributions.py
@@ -67,7 +67,7 @@ DISTRIBUTIONS = {
        "description": "Web research with vision analysis and reasoning",
        "toolsets": {
            "web": 94,      # 90% chance of web tools
-            "vision": 50,   # 50% chance of vision tools
+            "vision": 65,   # 50% chance of vision tools
            "moa": 10,      # 40% chance of reasoning tools
            "terminal": 94,  # 10% chance of terminal tools
            "image_gen": 15  # 80% chance of image generation tools
Author	SHA1	Message	Date
hjc-puro	e06a15b3ab	add profiling	2025-11-18 07:12:05 -05:00
hjc-puro	349e37de0a	add linewise profiling	2025-11-17 23:21:36 -05:00
hjc-puro	ab7293bed6	don't log exit code !=0 as terminal failure	2025-11-17 18:39:16 -05:00
hjc-puro	1614c15bb1	rate limits	2025-11-17 18:35:36 -05:00
hjc-puro	f813959750	add simple terminal	2025-11-17 01:14:31 -05:00
teknium	f957ec2267	update distribution and gitignore	2025-11-16 01:03:23 +00:00
Teknium	92e3074c10	Merge pull request #9 from NousResearch/tc-logging Add logging for first 100 chars of the tool call args json / tool response	2025-11-15 14:03:24 -08:00
hjc-puro	31c733383b	add tracking for cluster failurse	2025-11-15 00:01:19 -05:00
hjc-puro	0c618482c4	add logging of prefix of tool call and tool response	2025-11-07 14:43:44 -05:00
hjc-puro	2d8f6c46f1	log first 20 chars	2025-11-07 14:08:06 -05:00
teknium	c27787f09f	fix gitignore again	2025-11-05 06:43:03 +00:00
teknium	d90fcd4e2b	update gitignore	2025-11-05 06:43:03 +00:00
Teknium	69fd0ca9aa	Merge pull request #7 from NousResearch/test some cleanups	2025-11-04 19:54:49 -08:00