feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call, retrying once with the alternate max_tokens /
     max_completion_tokens parameter if the provider rejects the first
  5. Return the response
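
A rough usage sketch -- the task names and the provider= argument are from
this commit; the other kwargs mirror the flush_memories call in the diff
below, and the OpenAI-style response shape is an assumption:

  from agent.auxiliary_client import call_llm

  # The 'compression' slot resolves provider + model from the task config;
  # step 1 also allows explicit overrides. The max_tokens vs.
  # max_completion_tokens retry from step 4 happens inside call_llm.
  response = call_llm(
      task="compression",
      messages=[{"role": "user", "content": "Summarize the context below ..."}],
      temperature=0.2,
      max_tokens=2048,
      timeout=30.0,
  )
  summary = response.choices[0].message.content  # assumes chat-completions shape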

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.
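
For illustration only: the slot layout below is an assumed shape, not the
shipped config format; the commit itself only guarantees one provider:model
slot per task listed above and the bump to version 7.

  # Hypothetical view of the auxiliary slots once loaded (provider and model
  # names are placeholders, not defaults from this commit):
  auxiliary_tasks = {
      "compression":    "openrouter:example/compress-model",
      "vision":         "openrouter:example/vision-model",
      "web_extract":    "openrouter:example/extract-model",
      "session_search": "openrouter:example/search-model",
      "skills_hub":     "openrouter:example/skills-model",
      "mcp":            "openrouter:example/mcp-model",
      "flush_memories": "openrouter:example/flush-model",
  }
  config_version = 7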

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')
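
A hedged sketch of the migrated call pattern for one async consumer -- the
helper name and response handling are illustrative; only async_call_llm and
the 'vision' task slot are from this commit:

  # Before (roughly): each consumer resolved its own auxiliary client, e.g.
  #   client, model = get_text_auxiliary_client()
  #   response = await client.chat.completions.create(model=model, ...)
  from agent.auxiliary_client import async_call_llm

  async def describe_image(messages: list[dict]) -> str:  # hypothetical helper
      response = await async_call_llm(
          task="vision",
          messages=messages,
          max_tokens=1024,
          timeout=60.0,
      )
      # Assumes an OpenAI-style chat response; adapt to the real return type.
      return response.choices[0].message.content or ""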

Tests updated for context_compressor and the MCP tool. Some test mocks
still need updating: 15 remaining failures come from the mock pattern
changes, and 2 are pre-existing.
Author: teknium1
Date:   2026-03-11 20:52:19 -07:00
parent  013cc4d2fc
commit  0aa31cd3cb
13 changed files with 552 additions and 375 deletions


@@ -2623,19 +2623,22 @@ class AIAgent:
         # Use auxiliary client for the flush call when available --
         # it's cheaper and avoids Codex Responses API incompatibility.
-        from agent.auxiliary_client import get_text_auxiliary_client
-        aux_client, aux_model = get_text_auxiliary_client()
+        from agent.auxiliary_client import call_llm as _call_llm
+        _aux_available = True
+        try:
+            response = _call_llm(
+                task="flush_memories",
+                messages=api_messages,
+                tools=[memory_tool_def],
+                temperature=0.3,
+                max_tokens=5120,
+                timeout=30.0,
+            )
+        except RuntimeError:
+            _aux_available = False
+            response = None
-        if aux_client:
-            api_kwargs = {
-                "model": aux_model,
-                "messages": api_messages,
-                "tools": [memory_tool_def],
-                "temperature": 0.3,
-                "max_tokens": 5120,
-            }
-            response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0)
-        elif self.api_mode == "codex_responses":
+        if not _aux_available and self.api_mode == "codex_responses":
             # No auxiliary client -- use the Codex Responses path directly
             codex_kwargs = self._build_api_kwargs(api_messages)
             codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
@@ -2643,7 +2646,7 @@ class AIAgent:
if "max_output_tokens" in codex_kwargs:
codex_kwargs["max_output_tokens"] = 5120
response = self._run_codex_stream(codex_kwargs)
else:
elif not _aux_available:
api_kwargs = {
"model": self.model,
"messages": api_messages,
@@ -2655,7 +2658,7 @@ class AIAgent:
         # Extract tool calls from the response, handling both API formats
         tool_calls = []
-        if self.api_mode == "codex_responses" and not aux_client:
+        if self.api_mode == "codex_responses" and not _aux_available:
             assistant_msg, _ = self._normalize_codex_response(response)
             if assistant_msg and assistant_msg.tool_calls:
                 tool_calls = assistant_msg.tool_calls