mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-01 00:11:39 +08:00
924 lines
32 KiB
Python
924 lines
32 KiB
Python
#!/usr/bin/env python3
|
|
"""
Modal Integration Stress Tests & Full Integration Tests

This test suite includes:
1. Stress tests for Modal sandbox pools (concurrent load, scaling)
2. Atropos backend tests (requires atroposlib)
3. mini-swe-agent integration tests

Prerequisites:
    # Install dev dependencies
    pip install -e '.[dev,modal]'

    # Install atroposlib for Atropos tests
    pip install -e '.[atropos]'

    # Clone mini-swe-agent (if not present)
    git clone https://github.com/anthropics/mini-swe-agent.git mini-swe-agent
    # Or as submodule:
    git submodule add https://github.com/anthropics/mini-swe-agent.git mini-swe-agent

Run with:
    # All tests
    python tests/test_modal_stress.py

    # Stress tests only
    python tests/test_modal_stress.py --category stress

    # Atropos tests only
    python tests/test_modal_stress.py --category atropos

    # Mini-swe-agent tests only
    python tests/test_modal_stress.py --category miniswe

    # Dry run (no Modal calls)
    python tests/test_modal_stress.py --dry-run
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import random
|
|
import traceback
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
# Add parent to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
|
|
# =============================================================================
|
|
# Test Configuration
|
|
# =============================================================================
|
|
|
|
@dataclass
class StressTestConfig:
    """Settings shared by every test in this suite.

    Field order is part of the interface because the dataclass-generated
    ``__init__`` accepts the fields positionally.
    """

    # When True, tests that would call Modal record a skip instead of running.
    dry_run: bool = False
    verbose: bool = True
    # One of "stress", "atropos", "miniswe"; None runs every category.
    category: Optional[str] = None

    # Stress test parameters (reduced defaults for faster first-run)
    concurrent_tasks: int = 3  # Start small - Modal cold starts are slow
    total_operations: int = 10
    max_sandboxes: int = 3
    slots_per_sandbox: int = 3
|
|
|
|
|
|
# =============================================================================
|
|
# Test Results Tracking
|
|
# =============================================================================
|
|
|
|
class TestResults:
    """Accumulates pass/fail/skip outcomes and per-test metrics.

    Every ``record_*`` call also prints a one-line status to stdout.
    """

    def __init__(self):
        self.passed: List[str] = []
        self.failed: List[Tuple[str, str]] = []
        self.skipped: List[Tuple[str, str]] = []
        self.metrics: Dict[str, Any] = {}

    def record_pass(self, name: str, metrics: Optional[Dict] = None):
        """Mark ``name`` as passed; store and print ``metrics`` when non-empty."""
        self.passed.append(name)
        if metrics:
            self.metrics[name] = metrics
        print(f" ✅ {name}")
        # An empty or missing metrics mapping prints nothing further.
        for key, value in (metrics or {}).items():
            print(f" 📊 {key}: {value}")

    def record_fail(self, name: str, error: str):
        """Mark ``name`` as failed with the given error text."""
        self.failed.append((name, error))
        print(f" ❌ {name}: {error}")

    def record_skip(self, name: str, reason: str):
        """Mark ``name`` as skipped with the given reason."""
        self.skipped.append((name, reason))
        print(f" ⏭️ {name}: {reason}")

    def summary(self):
        """Print totals, failures and metrics.

        Returns:
            True when no failure was recorded, False otherwise.
        """
        n_passed = len(self.passed)
        n_failed = len(self.failed)
        n_skipped = len(self.skipped)
        total = n_passed + n_failed + n_skipped

        print(f"\n{'='*70}")
        print(f"STRESS TEST RESULTS: {n_passed}/{total} passed")
        print(f" Passed: {n_passed}")
        print(f" Failed: {n_failed}")
        print(f" Skipped: {n_skipped}")

        if self.failed:
            print("\nFailed tests:")
            for failed_name, failed_error in self.failed:
                print(f" - {failed_name}: {failed_error}")

        if self.metrics:
            print("\nPerformance Metrics:")
            for test_name, test_metrics in self.metrics.items():
                print(f" {test_name}:")
                for key, value in test_metrics.items():
                    print(f" - {key}: {value}")

        return not self.failed
|
|
|
|
|
|
# Module-level collector: the test functions in this file report into it and
# main() reads its summary to choose the process exit code.
results = TestResults()
|
|
|
|
|
|
# =============================================================================
|
|
# Helper: Atropos Import
|
|
# =============================================================================
|
|
|
|
def try_import_atropos():
    """Try importing Atropos backend components.

    Returns:
        The tuple ``(ModalToolBackend, ModalSandboxConfig, Slot, SlotState)``
        when every import succeeds, otherwise ``None``.
    """
    try:
        from atropos.backends.modal_backend import ModalToolBackend
        from atropos.backends.modal_backend import ModalSandboxConfig
        # Not returned, but a failure to import it still yields None.
        from atropos.backends.modal_backend import _ModalMultiProfileManager  # noqa: F401
        from atropos.slots.slot import Slot, SlotState
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so it lands here too.
        return None
    return ModalToolBackend, ModalSandboxConfig, Slot, SlotState
|
|
|
|
|
|
def try_import_miniswe():
    """Try importing mini-swe-agent components.

    Looks for ``<repo root>/mini-swe-agent/src`` (two directories above this
    file), puts it on ``sys.path`` and imports ``minisweagent`` from it.

    Returns:
        The imported ``minisweagent`` module, or ``None`` when the source
        directory is missing, is not a directory, is empty, or the import
        fails.
    """
    mini_swe_path = Path(__file__).parent.parent / "mini-swe-agent" / "src"

    # is_dir() also rejects a stray *file* named "src", which would make
    # iterdir() raise NotADirectoryError; any() stops at the first entry.
    if not mini_swe_path.is_dir() or not any(mini_swe_path.iterdir()):
        return None

    # This helper is called by more than one test; register the path once so
    # repeated calls do not pile duplicate entries onto sys.path.
    path_entry = str(mini_swe_path)
    if path_entry not in sys.path:
        sys.path.insert(0, path_entry)

    try:
        import minisweagent
    except ImportError:
        return None
    return minisweagent
|
|
|
|
|
|
# =============================================================================
|
|
# CATEGORY 1: Stress Tests (Terminal Tool)
|
|
# =============================================================================
|
|
|
|
def test_stress_concurrent_tasks(config: StressTestConfig):
    """Stress test: Multiple concurrent task_ids hitting the pool.

    Runs ``config.concurrent_tasks`` commands at the same time, one worker
    thread and one task_id per command, and records a pass when at least 90%
    of them finish with exit code 0. Records a skip in dry-run mode.

    Args:
        config: Suite settings; ``dry_run`` and ``concurrent_tasks`` are read.
    """
    if config.dry_run:
        results.record_skip("test_stress_concurrent_tasks", "Dry run mode")
        return

    from tools.terminal_tool import terminal_tool, cleanup_vm

    original_env = os.environ.get("TERMINAL_ENV")
    os.environ["TERMINAL_ENV"] = "modal"

    try:
        num_tasks = config.concurrent_tasks
        task_ids = [f"stress-concurrent-{i}-{int(time.time())}" for i in range(num_tasks)]

        start_time = time.time()
        errors = []
        successes = 0

        def run_task(task_id: str) -> Tuple[bool, str]:
            """Run one command; return (succeeded, error description)."""
            try:
                result = json.loads(terminal_tool(
                    f"echo 'Hello from {task_id}' && sleep 0.5",
                    task_id=task_id,
                ))
                if result["exit_code"] == 0:
                    return True, ""
                # Include more details for debugging
                error_detail = result.get("error", "no error message")
                output = result.get("output", "")[:100]  # First 100 chars
                return False, f"Exit code: {result['exit_code']}, error: {error_detail}, output: {output}"
            except Exception as e:
                return False, f"Exception: {str(e)}\n{traceback.format_exc()}"
            finally:
                # IMPORTANT: Clean up as soon as the task completes, on
                # success and on failure alike, exactly once.
                # This releases the sandbox back to the pool for other tasks
                try:
                    cleanup_vm(task_id)
                except Exception:
                    # Best-effort: a cleanup error must not change the outcome.
                    pass

        # Run all tasks concurrently using threads
        with ThreadPoolExecutor(max_workers=num_tasks) as executor:
            futures = {executor.submit(run_task, tid): tid for tid in task_ids}

            for future in as_completed(futures):
                task_id = futures[future]
                try:
                    # as_completed only yields finished futures, so this
                    # returns immediately.
                    success, error = future.result()
                    if success:
                        successes += 1
                    else:
                        errors.append(f"{task_id}: {error}")
                except Exception as e:
                    errors.append(f"{task_id}: {str(e)}")

        elapsed = time.time() - start_time

        # No need for cleanup here - each task cleans up immediately

        # Report
        success_rate = successes / num_tasks * 100

        if success_rate >= 90:  # Allow 10% failure rate for stress test
            results.record_pass("test_stress_concurrent_tasks", {
                "concurrent_tasks": num_tasks,
                "successes": successes,
                "failures": len(errors),
                "success_rate": f"{success_rate:.1f}%",
                "total_time": f"{elapsed:.2f}s",
                "avg_time_per_task": f"{elapsed/num_tasks:.2f}s",
            })
        else:
            results.record_fail(
                "test_stress_concurrent_tasks",
                f"Success rate {success_rate:.1f}% < 90%. Errors: {errors[:3]}"
            )

    except Exception as e:
        results.record_fail("test_stress_concurrent_tasks", str(e))
    finally:
        # Restore the exact prior state; "is not None" keeps an originally
        # empty value instead of deleting the variable.
        if original_env is not None:
            os.environ["TERMINAL_ENV"] = original_env
        else:
            os.environ.pop("TERMINAL_ENV", None)
|
|
|
|
|
|
def test_stress_rapid_fire(config: StressTestConfig):
    """Stress test: Rapid sequential commands to same task_id.

    Sends ``config.total_operations`` ``echo`` commands one after another
    under a single task_id and records a pass when at least 95% of them exit
    with code 0 and echo the expected number. Records a skip in dry-run mode.

    Args:
        config: Suite settings; ``dry_run`` and ``total_operations`` are read.
    """
    if config.dry_run:
        results.record_skip("test_stress_rapid_fire", "Dry run mode")
        return

    from tools.terminal_tool import terminal_tool, cleanup_vm

    original_env = os.environ.get("TERMINAL_ENV")
    os.environ["TERMINAL_ENV"] = "modal"

    try:
        task_id = f"stress-rapid-{int(time.time())}"
        num_commands = config.total_operations

        start_time = time.time()
        successes = 0
        errors = []

        for i in range(num_commands):
            try:
                result = json.loads(terminal_tool(f"echo {i}", task_id=task_id))
                if result["exit_code"] == 0 and str(i) in result["output"]:
                    successes += 1
                else:
                    errors.append(f"Command {i}: unexpected result")
            except Exception as e:
                errors.append(f"Command {i}: {str(e)}")

        elapsed = time.time() - start_time
        cleanup_vm(task_id)

        success_rate = successes / num_commands * 100
        # A zero reading from a coarse clock must not turn a passing run into
        # a ZeroDivisionError failure.
        commands_per_second = f"{num_commands / elapsed:.1f}" if elapsed > 0 else "N/A"

        if success_rate >= 95:
            results.record_pass("test_stress_rapid_fire", {
                "total_commands": num_commands,
                "successes": successes,
                "success_rate": f"{success_rate:.1f}%",
                "total_time": f"{elapsed:.2f}s",
                "commands_per_second": commands_per_second,
            })
        else:
            results.record_fail(
                "test_stress_rapid_fire",
                f"Success rate {success_rate:.1f}% < 95%"
            )

    except Exception as e:
        results.record_fail("test_stress_rapid_fire", str(e))
    finally:
        # Restore the exact prior state; "is not None" keeps an originally
        # empty value instead of deleting the variable.
        if original_env is not None:
            os.environ["TERMINAL_ENV"] = original_env
        else:
            os.environ.pop("TERMINAL_ENV", None)
|
|
|
|
|
|
def test_stress_pool_scaling(config: StressTestConfig):
    """Stress test: Force pool to scale up and down by running tasks in batches.

    Runs three batches of ``config.max_sandboxes`` concurrent ``echo done``
    commands, each with its own task_id, and records a pass when at least 80%
    of all commands exit with code 0. Records a skip in dry-run mode.

    Args:
        config: Suite settings; ``dry_run`` and ``max_sandboxes`` are read.
    """
    if config.dry_run:
        results.record_skip("test_stress_pool_scaling", "Dry run mode")
        return

    from tools.terminal_tool import terminal_tool, cleanup_vm

    original_env = os.environ.get("TERMINAL_ENV")
    os.environ["TERMINAL_ENV"] = "modal"

    try:
        # Run tasks in batches matching max_sandboxes to test pool reuse
        # This verifies sandboxes can be acquired, used, released, and reused
        batch_size = config.max_sandboxes
        num_batches = 3
        total_tasks = batch_size * num_batches

        # Defined once, not once per batch: it does not depend on the batch.
        def run_task(task_id: str) -> bool:
            """Run one fast command; True only when it exits with code 0."""
            try:
                result = json.loads(terminal_tool(
                    "echo done",  # Fast command to test scaling
                    task_id=task_id,
                ))
                return result["exit_code"] == 0
            except Exception:
                return False
            finally:
                try:
                    cleanup_vm(task_id)
                except Exception:
                    # Best-effort: a cleanup error must not change the outcome.
                    pass

        start_time = time.time()
        successes = 0

        for batch in range(num_batches):
            task_ids = [f"stress-scale-{batch}-{i}-{int(time.time())}" for i in range(batch_size)]

            # Run batch concurrently
            with ThreadPoolExecutor(max_workers=batch_size) as executor:
                successes += sum(executor.map(run_task, task_ids))

        elapsed = time.time() - start_time

        # Check pool status. This is informational only, so the import of the
        # private manager lives inside the same best-effort guard: if the name
        # is absent the test still runs and reports an empty status.
        try:
            from tools.terminal_tool import _ModalPoolManager
            manager = _ModalPoolManager.get_instance()
            pool_status = manager.get_status() if hasattr(manager, 'get_status') else {}
        except Exception:
            pool_status = {}

        success_rate = successes / total_tasks * 100

        if success_rate >= 80:  # Allow some tolerance
            results.record_pass("test_stress_pool_scaling", {
                "total_tasks": total_tasks,
                "num_batches": num_batches,
                "batch_size": batch_size,
                "successes": successes,
                "success_rate": f"{success_rate:.1f}%",
                "total_time": f"{elapsed:.2f}s",
                "pool_status": pool_status,
            })
        else:
            results.record_fail(
                "test_stress_pool_scaling",
                f"Success rate {success_rate:.1f}% < 80%"
            )

    except Exception as e:
        results.record_fail("test_stress_pool_scaling", str(e))
    finally:
        # Restore the exact prior state; "is not None" keeps an originally
        # empty value instead of deleting the variable.
        if original_env is not None:
            os.environ["TERMINAL_ENV"] = original_env
        else:
            os.environ.pop("TERMINAL_ENV", None)
|
|
|
|
|
|
def test_stress_large_output(config: StressTestConfig):
    """Stress test: Commands producing large output.

    After a warm-up ``echo``, runs a shell loop that prints 500 lines and
    records a pass when the command exits with code 0 and the captured output
    is longer than 5000 characters. Records a skip in dry-run mode.

    Args:
        config: Suite settings; only ``dry_run`` is read.
    """
    if config.dry_run:
        results.record_skip("test_stress_large_output", "Dry run mode")
        return

    from tools.terminal_tool import terminal_tool, cleanup_vm

    original_env = os.environ.get("TERMINAL_ENV")
    os.environ["TERMINAL_ENV"] = "modal"

    # Bound before the try so the finally clause can always refer to it.
    task_id = f"stress-large-{int(time.time())}"
    cleaned_up = False

    try:
        # First verify basic connectivity with simple command
        warmup = json.loads(terminal_tool("echo warmup", task_id=task_id))
        if warmup["exit_code"] != 0:
            results.record_fail(
                "test_stress_large_output",
                f"Warmup failed: {warmup.get('error', 'unknown')}"
            )
            return

        # Generate output - use seq which is more portable
        start_time = time.time()
        result = json.loads(terminal_tool(
            'seq 1 500 | while read i; do echo "Line $i: This is test content for large output"; done',
            task_id=task_id,
            timeout=60,
        ))
        elapsed = time.time() - start_time

        cleanup_vm(task_id)
        cleaned_up = True

        output_size = len(result.get("output", ""))
        error_msg = result.get("error", "")

        if result["exit_code"] == 0 and output_size > 5000:
            results.record_pass("test_stress_large_output", {
                "output_size": f"{output_size:,} bytes",
                "time": f"{elapsed:.2f}s",
                "throughput": f"{output_size/elapsed/1024:.1f} KB/s" if elapsed > 0 else "N/A",
            })
        else:
            results.record_fail(
                "test_stress_large_output",
                f"Exit code: {result['exit_code']}, output size: {output_size}, error: {error_msg}"
            )

    except Exception as e:
        results.record_fail("test_stress_large_output", f"{str(e)}\n{traceback.format_exc()}")
    finally:
        # Only clean up here when the normal-path cleanup did not complete
        # (early return or exception); avoids cleaning the same task twice.
        if not cleaned_up:
            try:
                cleanup_vm(task_id)
            except Exception:
                # Best-effort: nothing more can be done at this point.
                pass
        # Restore the exact prior state; "is not None" keeps an originally
        # empty value instead of deleting the variable.
        if original_env is not None:
            os.environ["TERMINAL_ENV"] = original_env
        else:
            os.environ.pop("TERMINAL_ENV", None)
|
|
|
|
|
|
def test_stress_error_recovery(config: StressTestConfig):
    """Stress test: Commands that fail and verify sandbox continues working.

    Runs several commands that are expected to fail under one task_id, then
    a final ``echo``; records a pass when that final command exits with code 0
    and its output contains the echoed text. Records a skip in dry-run mode.

    Args:
        config: Suite settings; only ``dry_run`` is read.
    """
    if config.dry_run:
        results.record_skip("test_stress_error_recovery", "Dry run mode")
        return

    from tools.terminal_tool import terminal_tool, cleanup_vm

    original_env = os.environ.get("TERMINAL_ENV")
    os.environ["TERMINAL_ENV"] = "modal"

    # Bound before the try so the finally clause can always refer to it.
    task_id = f"stress-error-{int(time.time())}"
    cleaned_up = False

    try:
        # Run some failing commands
        failing_commands = [
            "exit 1",
            "false",
            "cat /nonexistent/file",
            "command_that_does_not_exist",
        ]

        for cmd in failing_commands:
            result = json.loads(terminal_tool(cmd, task_id=task_id))
            # These should fail but not crash. An explicit check is used
            # instead of `assert` so it still runs under `python -O`.
            if result["exit_code"] == 0 and not result.get("error"):
                results.record_fail(
                    "test_stress_error_recovery",
                    f"Expected failure for: {cmd}"
                )
                return

        # Now run a command that should succeed
        result = json.loads(terminal_tool("echo 'recovery success'", task_id=task_id))

        cleanup_vm(task_id)
        cleaned_up = True

        if result["exit_code"] == 0 and "recovery success" in result["output"]:
            results.record_pass("test_stress_error_recovery", {
                "failed_commands": len(failing_commands),
                "recovery": "success",
            })
        else:
            results.record_fail(
                "test_stress_error_recovery",
                f"Recovery failed: {result}"
            )

    except Exception as e:
        results.record_fail("test_stress_error_recovery", str(e))
    finally:
        # Clean up on the early-return and exception paths too, so a failed
        # run does not leave the task's sandbox behind.
        if not cleaned_up:
            try:
                cleanup_vm(task_id)
            except Exception:
                # Best-effort: nothing more can be done at this point.
                pass
        # Restore the exact prior state; "is not None" keeps an originally
        # empty value instead of deleting the variable.
        if original_env is not None:
            os.environ["TERMINAL_ENV"] = original_env
        else:
            os.environ.pop("TERMINAL_ENV", None)
|
|
|
|
|
|
# =============================================================================
|
|
# CATEGORY 2: Atropos Backend Stress Tests
|
|
# =============================================================================
|
|
|
|
async def test_atropos_stress_slot_churn(config: StressTestConfig):
    """Atropos stress test: Rapid slot acquire/release cycles.

    Performs ``config.total_operations`` cycles of acquire, run one ``echo``,
    release, and records a pass when at least 90% of the commands succeed.
    Records a skip in dry-run mode or when the Atropos imports fail.

    Args:
        config: Suite settings; ``dry_run`` and ``total_operations`` are read.
    """
    if config.dry_run:
        results.record_skip("test_atropos_stress_slot_churn", "Dry run mode")
        return

    imports = try_import_atropos()
    if imports is None:
        results.record_skip("test_atropos_stress_slot_churn", "Requires atroposlib")
        return

    ModalToolBackend, ModalSandboxConfig, _, _ = imports

    try:
        backend_config = ModalSandboxConfig(
            app_name=f"stress-churn-{int(time.time())}",
            min_sandboxes=1,
            max_sandboxes=3,
            slots_per_sandbox=5,
        )

        backend = ModalToolBackend(backend_config)
        await backend.start()

        try:
            num_cycles = config.total_operations
            start_time = time.time()
            successes = 0

            for i in range(num_cycles):
                try:
                    slot = await backend.acquire(f"churn-{i}")
                    try:
                        # Quick command
                        results_list = await backend.execute_batch([
                            (slot, "bash", {"command": f"echo {i}"})
                        ])

                        if results_list[0].success:
                            successes += 1
                    finally:
                        # Release even when the command raised, so a failed
                        # cycle does not keep holding its slot.
                        await backend.release(slot, reset_workspace=(i % 5 == 0))
                except Exception:
                    # Count as failure: the cycle just adds nothing to
                    # `successes`, and the loop moves on.
                    continue

            elapsed = time.time() - start_time
            success_rate = successes / num_cycles * 100

            if success_rate >= 90:
                results.record_pass("test_atropos_stress_slot_churn", {
                    "cycles": num_cycles,
                    "successes": successes,
                    "success_rate": f"{success_rate:.1f}%",
                    "total_time": f"{elapsed:.2f}s",
                    "cycles_per_second": f"{num_cycles/elapsed:.1f}",
                })
            else:
                results.record_fail(
                    "test_atropos_stress_slot_churn",
                    f"Success rate {success_rate:.1f}% < 90%"
                )

        finally:
            await backend.stop(purge=True)

    except Exception as e:
        results.record_fail("test_atropos_stress_slot_churn", str(e))
|
|
|
|
|
|
async def test_atropos_stress_parallel_batches(config: StressTestConfig):
    """Atropos stress test: Multiple parallel batch executions.

    Acquires 10 slots, submits 5 batches concurrently with one ``echo`` per
    slot in each batch, and records a pass when at least 90% of the commands
    succeed. Records a skip in dry-run mode or when the Atropos imports fail.

    Args:
        config: Suite settings; only ``dry_run`` is read.
    """
    test_name = "test_atropos_stress_parallel_batches"

    if config.dry_run:
        results.record_skip(test_name, "Dry run mode")
        return

    imports = try_import_atropos()
    if imports is None:
        results.record_skip(test_name, "Requires atroposlib")
        return

    ModalToolBackend, ModalSandboxConfig = imports[:2]

    try:
        backend = ModalToolBackend(ModalSandboxConfig(
            app_name=f"stress-batch-{int(time.time())}",
            min_sandboxes=2,
            max_sandboxes=4,
            slots_per_sandbox=5,
        ))
        await backend.start()

        try:
            num_slots = 10

            # Acquire multiple slots, one at a time and in index order
            slots = [await backend.acquire(f"batch-{n}") for n in range(num_slots)]

            # Run multiple batches in parallel
            start_time = time.time()
            num_batches = 5

            async def run_batch(batch_id: int):
                """Send one command per held slot as a single batch."""
                return await backend.execute_batch([
                    (held, "bash", {"command": f"echo 'batch{batch_id}-slot{idx}'"})
                    for idx, held in enumerate(slots)
                ])

            all_results = await asyncio.gather(
                *[run_batch(batch_id) for batch_id in range(num_batches)]
            )

            elapsed = time.time() - start_time

            # Count successes
            total_commands = num_batches * num_slots
            successes = 0
            for batch_result in all_results:
                for outcome in batch_result:
                    if outcome.success:
                        successes += 1

            # Release slots
            for held in slots:
                await backend.release(held)

            success_rate = successes / total_commands * 100

            if success_rate < 90:
                results.record_fail(
                    test_name,
                    f"Success rate {success_rate:.1f}% < 90%"
                )
            else:
                results.record_pass(test_name, {
                    "batches": num_batches,
                    "slots": num_slots,
                    "total_commands": total_commands,
                    "successes": successes,
                    "success_rate": f"{success_rate:.1f}%",
                    "total_time": f"{elapsed:.2f}s",
                    "commands_per_second": f"{total_commands/elapsed:.1f}",
                })

        finally:
            await backend.stop(purge=True)

    except Exception as e:
        results.record_fail(test_name, str(e))
|
|
|
|
|
|
async def test_atropos_stress_multi_profile_load(config: StressTestConfig):
    """Atropos stress test: Load across multiple profiles.

    Starts a backend with a "cpu-light" and a "cpu-heavy" profile, acquires
    5 slots from each, runs one ``echo`` per slot in a single batch, and
    records a pass when at least 90% of the commands succeed. Records a skip
    in dry-run mode or when the Atropos imports fail.

    Args:
        config: Suite settings; only ``dry_run`` is read.
    """
    test_name = "test_atropos_stress_multi_profile_load"

    if config.dry_run:
        results.record_skip(test_name, "Dry run mode")
        return

    imports = try_import_atropos()
    if imports is None:
        results.record_skip(test_name, "Requires atroposlib")
        return

    ModalToolBackend, ModalSandboxConfig = imports[:2]

    try:
        app_name = f"stress-multiprofile-{int(time.time())}"
        light_profile = ModalSandboxConfig(
            name="cpu-light",
            cpu=0.5,
            memory=1024,
            min_sandboxes=1,
            max_sandboxes=2,
            slots_per_sandbox=5,
        )
        heavy_profile = ModalSandboxConfig(
            name="cpu-heavy",
            cpu=2.0,
            memory=4096,
            min_sandboxes=0,
            max_sandboxes=2,
            slots_per_sandbox=3,
        )
        backend = ModalToolBackend.with_profiles(
            app_name=app_name,
            profiles={"cpu-light": light_profile, "cpu-heavy": heavy_profile},
        )

        await backend.start(profiles_to_start=["cpu-light", "cpu-heavy"])

        try:
            num_tasks_per_profile = 5
            slots = []

            # Acquire from both profiles, alternating light then heavy
            for n in range(num_tasks_per_profile):
                light_slot = await backend.acquire(f"light-{n}", profile="cpu-light")
                heavy_slot = await backend.acquire(f"heavy-{n}", profile="cpu-heavy")
                slots.extend([(light_slot, "cpu-light"), (heavy_slot, "cpu-heavy")])

            # Execute batch across all profiles
            start_time = time.time()

            requests = []
            for held, profile_name in slots:
                requests.append((held, "bash", {"command": f"echo 'profile={profile_name}'"}))

            batch_results = await backend.execute_batch(requests)
            elapsed = time.time() - start_time

            successes = len([outcome for outcome in batch_results if outcome.success])

            # Release all
            for held, _ in slots:
                await backend.release(held)

            status = backend.get_status()

            success_rate = successes / len(slots) * 100

            if success_rate < 90:
                results.record_fail(
                    test_name,
                    f"Success rate {success_rate:.1f}% < 90%"
                )
            else:
                results.record_pass(test_name, {
                    "profiles": 2,
                    "tasks_per_profile": num_tasks_per_profile,
                    "total_tasks": len(slots),
                    "successes": successes,
                    "success_rate": f"{success_rate:.1f}%",
                    "time": f"{elapsed:.2f}s",
                    "status": status,
                })

        finally:
            await backend.stop(purge=True)

    except Exception as e:
        results.record_fail(test_name, str(e))
|
|
|
|
|
|
# =============================================================================
|
|
# CATEGORY 3: Mini-SWE-Agent Integration Tests
|
|
# =============================================================================
|
|
|
|
def test_miniswe_environment_available():
    """Check if mini-swe-agent is properly set up.

    Records a skip when the ``mini-swe-agent/src`` directory is missing or
    empty, a failure when it exists but ``minisweagent`` cannot be imported,
    and a pass (with the path and module name) otherwise.
    """
    test_name = "test_miniswe_environment_available"
    src_dir = Path(__file__).parent.parent / "mini-swe-agent" / "src"

    if not src_dir.exists():
        results.record_skip(
            test_name,
            "mini-swe-agent not found. Run: git clone https://github.com/anthropics/mini-swe-agent.git mini-swe-agent"
        )
    elif not list(src_dir.iterdir()):
        results.record_skip(
            test_name,
            "mini-swe-agent directory is empty. Run: git submodule update --init"
        )
    else:
        module = try_import_miniswe()
        if module is None:
            results.record_fail(
                test_name,
                "Failed to import minisweagent module"
            )
        else:
            results.record_pass(test_name, {
                "path": str(src_dir),
                "module": module.__name__,
            })
|
|
|
|
|
|
def test_miniswe_modal_backend(config: StressTestConfig):
    """Test mini-swe-agent with Modal backend.

    Creates ``minisweagent.ModalEnvironment``, runs one ``echo`` in it and
    records a pass when the echoed text appears in the stringified result.
    Records a skip in dry-run mode, when mini-swe-agent cannot be imported,
    or when the module has no ``ModalEnvironment`` attribute.

    Args:
        config: Suite settings; only ``dry_run`` is read.
    """
    if config.dry_run:
        results.record_skip("test_miniswe_modal_backend", "Dry run mode")
        return

    miniswe = try_import_miniswe()
    if miniswe is None:
        results.record_skip(
            "test_miniswe_modal_backend",
            "mini-swe-agent not available"
        )
        return

    try:
        # Check if ModalEnvironment exists in minisweagent
        if not hasattr(miniswe, 'ModalEnvironment'):
            results.record_skip(
                "test_miniswe_modal_backend",
                "minisweagent.ModalEnvironment not found"
            )
            return

        # Create Modal environment
        env = miniswe.ModalEnvironment(
            image="python:3.11",
            timeout=60,
        )

        try:
            # Execute a command
            result = env.execute("echo 'Hello from mini-swe-agent Modal'")
        finally:
            # Clean up even when execute() raises, so a failed command does
            # not leave the environment behind.
            env.cleanup()

        if "Hello from mini-swe-agent Modal" in str(result):
            results.record_pass("test_miniswe_modal_backend")
        else:
            results.record_fail(
                "test_miniswe_modal_backend",
                f"Unexpected result: {result}"
            )

    except Exception as e:
        results.record_fail("test_miniswe_modal_backend", str(e))
|
|
|
|
|
|
# =============================================================================
|
|
# Test Runner
|
|
# =============================================================================
|
|
|
|
def run_sync_tests(config: StressTestConfig):
    """Run synchronous tests.

    Runs the terminal-tool stress tests and/or the mini-swe-agent tests,
    depending on ``config.category`` (``None`` selects both groups).
    """
    rule = "=" * 70
    selected = config.category

    if selected is None or selected == "stress":
        print("\n" + rule)
        print("STRESS TESTS (Terminal Tool)")
        print(rule)

        stress_tests = (
            test_stress_concurrent_tasks,
            test_stress_rapid_fire,
            test_stress_pool_scaling,
            test_stress_large_output,
            test_stress_error_recovery,
        )
        for stress_test in stress_tests:
            stress_test(config)

    if selected is None or selected == "miniswe":
        print("\n" + rule)
        print("MINI-SWE-AGENT INTEGRATION TESTS")
        print(rule)

        # The availability check takes no config; the backend test does.
        test_miniswe_environment_available()
        test_miniswe_modal_backend(config)
|
|
|
|
|
|
async def run_async_tests(config: StressTestConfig):
    """Run asynchronous tests.

    Awaits the Atropos backend stress tests one after another when
    ``config.category`` is ``None`` or ``"atropos"``; otherwise does nothing.
    """
    if config.category not in (None, "atropos"):
        return

    rule = "=" * 70
    print("\n" + rule)
    print("ATROPOS BACKEND STRESS TESTS")
    print(rule)

    atropos_tests = (
        test_atropos_stress_slot_churn,
        test_atropos_stress_parallel_batches,
        test_atropos_stress_multi_profile_load,
    )
    for atropos_test in atropos_tests:
        await atropos_test(config)
|
|
|
|
|
|
def main():
    """Parse command-line options, run the selected tests, and exit.

    Runs the synchronous tests first, then the asynchronous ones, prints the
    summary, and exits with status 0 when no test failed and 1 otherwise.
    """
    import argparse

    def positive_int(text: str) -> int:
        """argparse type: accept only integers >= 1."""
        value = int(text)  # ValueError is reported by argparse as a usage error
        if value < 1:
            raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
        return value

    parser = argparse.ArgumentParser(description="Modal Stress Test Suite")
    parser.add_argument("--dry-run", action="store_true", help="Skip tests requiring Modal")
    parser.add_argument("--category", choices=["stress", "atropos", "miniswe"], help="Run specific category")
    # Zero or negative counts would make the tests divide by zero, so they
    # are rejected up front as usage errors.
    # NOTE(review): these defaults (10 / 50) override the smaller defaults
    # declared on StressTestConfig (3 / 10) - confirm which is intended.
    parser.add_argument("--concurrent", type=positive_int, default=10, help="Number of concurrent tasks")
    parser.add_argument("--operations", type=positive_int, default=50, help="Total operations for stress tests")
    parser.add_argument("--verbose", action="store_true", default=True)
    # --verbose defaults to True and so cannot switch verbosity off by
    # itself; --quiet writes False to the same destination.
    parser.add_argument("--quiet", dest="verbose", action="store_false", help="Disable verbose output")
    args = parser.parse_args()

    config = StressTestConfig(
        dry_run=args.dry_run,
        verbose=args.verbose,
        category=args.category,
        concurrent_tasks=args.concurrent,
        total_operations=args.operations,
    )

    print("="*70)
    print("MODAL STRESS & INTEGRATION TEST SUITE")
    print("="*70)
    print(f"Mode: {'DRY RUN' if config.dry_run else 'LIVE'}")
    print(f"Category: {config.category or 'ALL'}")
    print(f"Concurrent tasks: {config.concurrent_tasks}")
    print(f"Total operations: {config.total_operations}")

    # Run sync tests
    run_sync_tests(config)

    # Run async tests
    asyncio.run(run_async_tests(config))

    # Summary
    success = results.summary()
    sys.exit(0 if success else 1)
|
|
|
|
|
|
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|