diff --git a/tests/tools/test_mixture_of_agents_tool.py b/tests/tools/test_mixture_of_agents_tool.py new file mode 100644 index 0000000000..84d1ffece5 --- /dev/null +++ b/tests/tools/test_mixture_of_agents_tool.py @@ -0,0 +1,82 @@ +import importlib +import json +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +moa = importlib.import_module("tools.mixture_of_agents_tool") + + +def test_moa_defaults_track_current_openrouter_frontier_models(): + assert moa.REFERENCE_MODELS == [ + "anthropic/claude-opus-4.6", + "google/gemini-3-pro-preview", + "openai/gpt-5.4-pro", + "deepseek/deepseek-v3.2", + ] + assert moa.AGGREGATOR_MODEL == "anthropic/claude-opus-4.6" + + +@pytest.mark.asyncio +async def test_reference_model_retry_warnings_avoid_exc_info_until_terminal_failure(monkeypatch): + fake_client = SimpleNamespace( + chat=SimpleNamespace( + completions=SimpleNamespace( + create=AsyncMock(side_effect=RuntimeError("rate limited")) + ) + ) + ) + warn = MagicMock() + err = MagicMock() + + monkeypatch.setattr(moa, "_get_openrouter_client", lambda: fake_client) + monkeypatch.setattr(moa.logger, "warning", warn) + monkeypatch.setattr(moa.logger, "error", err) + + model, message, success = await moa._run_reference_model_safe( + "openai/gpt-5.4-pro", "hello", max_retries=2 + ) + + assert model == "openai/gpt-5.4-pro" + assert success is False + assert "failed after 2 attempts" in message + assert warn.call_count == 2 + assert all(call.kwargs.get("exc_info") is None for call in warn.call_args_list) + err.assert_called_once() + assert err.call_args.kwargs.get("exc_info") is True + + +@pytest.mark.asyncio +async def test_moa_top_level_error_logs_single_traceback_on_aggregator_failure(monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + monkeypatch.setattr( + moa, + "_run_reference_model_safe", + AsyncMock(return_value=("anthropic/claude-opus-4.6", "ok", True)), + ) + monkeypatch.setattr( + moa, + "_run_aggregator_model", + AsyncMock(side_effect=RuntimeError("aggregator boom")), + ) + monkeypatch.setattr( + moa, + "_debug", + SimpleNamespace(log_call=MagicMock(), save=MagicMock(), active=False), + ) + + err = MagicMock() + monkeypatch.setattr(moa.logger, "error", err) + + result = json.loads( + await moa.mixture_of_agents_tool( + "solve this", + reference_models=["anthropic/claude-opus-4.6"], + ) + ) + + assert result["success"] is False + assert "Error in MoA processing" in result["error"] + err.assert_called_once() + assert err.call_args.kwargs.get("exc_info") is True diff --git a/tools/mixture_of_agents_tool.py b/tools/mixture_of_agents_tool.py index 355419817f..d23297d545 100644 --- a/tools/mixture_of_agents_tool.py +++ b/tools/mixture_of_agents_tool.py @@ -25,8 +25,8 @@ Architecture: 3. Multiple layers can be used for iterative refinement (future enhancement) Models Used (via OpenRouter): -- Reference Models: claude-opus-4, gemini-2.5-pro, gpt-4.1, deepseek-r1 -- Aggregator Model: claude-opus-4 (highest capability for synthesis) +- Reference Models: claude-opus-4.6, gemini-3-pro-preview, gpt-5.4-pro, deepseek-v3.2 +- Aggregator Model: claude-opus-4.6 (highest capability for synthesis) Configuration: To customize the MoA setup, modify the configuration constants at the top of this file: @@ -57,16 +57,18 @@ from tools.debug_helpers import DebugSession logger = logging.getLogger(__name__) # Configuration for MoA processing -# Reference models - these generate diverse initial responses in parallel (OpenRouter slugs) +# Reference models - these generate diverse initial responses in parallel. +# Keep this list aligned with current top-tier OpenRouter frontier options. REFERENCE_MODELS = [ - "anthropic/claude-opus-4.5", - "google/gemini-3-pro-preview", - "openai/gpt-5.2-pro", - "deepseek/deepseek-v3.2" + "anthropic/claude-opus-4.6", + "google/gemini-3-pro-preview", + "openai/gpt-5.4-pro", + "deepseek/deepseek-v3.2", ] -# Aggregator model - synthesizes reference responses into final output -AGGREGATOR_MODEL = "anthropic/claude-opus-4.5" # Use highest capability model for aggregation +# Aggregator model - synthesizes reference responses into final output. +# Prefer the strongest synthesis model in the current OpenRouter lineup. +AGGREGATOR_MODEL = "anthropic/claude-opus-4.6" # Temperature settings optimized for MoA performance REFERENCE_TEMPERATURE = 0.6 # Balanced creativity for diverse perspectives @@ -147,14 +149,15 @@ async def _run_reference_model_safe( except Exception as e: error_str = str(e) - # Log more detailed error information for debugging + # Keep retry-path logging concise; full tracebacks are reserved for + # terminal failure paths so long-running MoA retries don't flood logs. if "invalid" in error_str.lower(): logger.warning("%s invalid request error (attempt %s): %s", model, attempt + 1, error_str) elif "rate" in error_str.lower() or "limit" in error_str.lower(): logger.warning("%s rate limit error (attempt %s): %s", model, attempt + 1, error_str) else: logger.warning("%s unknown error (attempt %s): %s", model, attempt + 1, error_str) - + if attempt < max_retries - 1: # Exponential backoff for rate limiting: 2s, 4s, 8s, 16s, 32s, 60s sleep_time = min(2 ** (attempt + 1), 60) @@ -162,7 +165,7 @@ async def _run_reference_model_safe( await asyncio.sleep(sleep_time) else: error_msg = f"{model} failed after {max_retries} attempts: {error_str}" - logger.error("%s", error_msg) + logger.error("%s", error_msg, exc_info=True) return model, error_msg, False @@ -185,7 +188,7 @@ async def _run_aggregator_model( str: Synthesized final response """ logger.info("Running aggregator model: %s", AGGREGATOR_MODEL) - + # Build parameters for the API call api_params = { "model": AGGREGATOR_MODEL, @@ -200,14 +203,14 @@ async def _run_aggregator_model( } } } - + # GPT models (especially gpt-4o-mini) don't support custom temperature values # Only include temperature for non-GPT models if not AGGREGATOR_MODEL.lower().startswith('gpt-'): api_params["temperature"] = temperature - + response = await _get_openrouter_client().chat.completions.create(**api_params) - + content = response.choices[0].message.content.strip() logger.info("Aggregation complete (%s characters)", len(content)) return content @@ -364,7 +367,7 @@ async def mixture_of_agents_tool( except Exception as e: error_msg = f"Error in MoA processing: {str(e)}" - logger.error("%s", error_msg) + logger.error("%s", error_msg, exc_info=True) # Calculate processing time even for errors end_time = datetime.datetime.now()