mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-01 00:11:39 +08:00
Compare commits
1 Commits
codex-port
...
custom_aux
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f984cc335b |
@@ -4,7 +4,7 @@ Provides a single resolution chain so every consumer (context compression,
|
|||||||
session search, web extraction, vision analysis, browser vision) picks up
|
session search, web extraction, vision analysis, browser vision) picks up
|
||||||
the best available backend without duplicating fallback logic.
|
the best available backend without duplicating fallback logic.
|
||||||
|
|
||||||
Resolution order for text tasks:
|
Resolution order (same for text and vision tasks):
|
||||||
1. OpenRouter (OPENROUTER_API_KEY)
|
1. OpenRouter (OPENROUTER_API_KEY)
|
||||||
2. Nous Portal (~/.hermes/auth.json active provider)
|
2. Nous Portal (~/.hermes/auth.json active provider)
|
||||||
3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
|
3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
|
||||||
@@ -14,10 +14,10 @@ Resolution order for text tasks:
|
|||||||
— checked via PROVIDER_REGISTRY entries with auth_type='api_key'
|
— checked via PROVIDER_REGISTRY entries with auth_type='api_key'
|
||||||
6. None
|
6. None
|
||||||
|
|
||||||
Resolution order for vision/multimodal tasks:
|
Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
|
||||||
1. OpenRouter
|
CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
|
||||||
2. Nous Portal
|
"openrouter", "nous", or "main" (= steps 3-5).
|
||||||
3. None (custom endpoints can't substitute for Gemini multimodal)
|
Default "auto" follows the full chain above.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@@ -329,59 +329,122 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
# ── Public API ──────────────────────────────────────────────────────────────
|
# ── Provider resolution helpers ─────────────────────────────────────────────
|
||||||
|
|
||||||
def get_text_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
|
def _get_auxiliary_provider(task: str = "") -> str:
|
||||||
"""Return (client, model_slug) for text-only auxiliary tasks.
|
"""Read the provider override for a specific auxiliary task.
|
||||||
|
|
||||||
Falls through OpenRouter -> Nous Portal -> custom endpoint -> Codex OAuth
|
Checks AUXILIARY_{TASK}_PROVIDER first (e.g. AUXILIARY_VISION_PROVIDER),
|
||||||
-> direct API-key providers -> (None, None).
|
then CONTEXT_{TASK}_PROVIDER (for the compression section's summary_provider),
|
||||||
|
then falls back to "auto". Expected values: "auto", "openrouter", "nous", "main" (the value is lowercased but not validated here).
|
||||||
"""
|
"""
|
||||||
# 1. OpenRouter
|
if task:
|
||||||
|
for prefix in ("AUXILIARY_", "CONTEXT_"):
|
||||||
|
val = os.getenv(f"{prefix}{task.upper()}_PROVIDER", "").strip().lower()
|
||||||
|
if val and val != "auto":
|
||||||
|
return val
|
||||||
|
return "auto"
|
||||||
|
|
||||||
|
|
||||||
|
def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
or_key = os.getenv("OPENROUTER_API_KEY")
|
or_key = os.getenv("OPENROUTER_API_KEY")
|
||||||
if or_key:
|
if not or_key:
|
||||||
logger.debug("Auxiliary text client: OpenRouter")
|
return None, None
|
||||||
return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
|
logger.debug("Auxiliary client: OpenRouter")
|
||||||
default_headers=_OR_HEADERS), _OPENROUTER_MODEL
|
return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
|
||||||
|
default_headers=_OR_HEADERS), _OPENROUTER_MODEL
|
||||||
|
|
||||||
# 2. Nous Portal
|
|
||||||
|
def _try_nous() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
nous = _read_nous_auth()
|
nous = _read_nous_auth()
|
||||||
if nous:
|
if not nous:
|
||||||
global auxiliary_is_nous
|
return None, None
|
||||||
auxiliary_is_nous = True
|
global auxiliary_is_nous
|
||||||
logger.debug("Auxiliary text client: Nous Portal")
|
auxiliary_is_nous = True
|
||||||
return (
|
logger.debug("Auxiliary client: Nous Portal")
|
||||||
OpenAI(api_key=_nous_api_key(nous), base_url=_nous_base_url()),
|
return (
|
||||||
_NOUS_MODEL,
|
OpenAI(api_key=_nous_api_key(nous), base_url=_nous_base_url()),
|
||||||
)
|
_NOUS_MODEL,
|
||||||
|
)
|
||||||
|
|
||||||
# 3. Custom endpoint (both base URL and key must be set)
|
|
||||||
|
def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
custom_base = os.getenv("OPENAI_BASE_URL")
|
custom_base = os.getenv("OPENAI_BASE_URL")
|
||||||
custom_key = os.getenv("OPENAI_API_KEY")
|
custom_key = os.getenv("OPENAI_API_KEY")
|
||||||
if custom_base and custom_key:
|
if not custom_base or not custom_key:
|
||||||
model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini"
|
return None, None
|
||||||
logger.debug("Auxiliary text client: custom endpoint (%s)", model)
|
model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini"
|
||||||
return OpenAI(api_key=custom_key, base_url=custom_base), model
|
logger.debug("Auxiliary client: custom endpoint (%s)", model)
|
||||||
|
return OpenAI(api_key=custom_key, base_url=custom_base), model
|
||||||
|
|
||||||
# 4. Codex OAuth -- uses the Responses API (only endpoint the token
|
|
||||||
# can access), wrapped to look like a chat.completions client.
|
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
|
||||||
codex_token = _read_codex_access_token()
|
codex_token = _read_codex_access_token()
|
||||||
if codex_token:
|
if not codex_token:
|
||||||
logger.debug("Auxiliary text client: Codex OAuth (%s via Responses API)", _CODEX_AUX_MODEL)
|
return None, None
|
||||||
real_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
|
logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", _CODEX_AUX_MODEL)
|
||||||
return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
|
real_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
|
||||||
|
return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
|
||||||
|
|
||||||
# 5. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, etc.)
|
|
||||||
api_client, api_model = _resolve_api_key_provider()
|
|
||||||
if api_client is not None:
|
|
||||||
return api_client, api_model
|
|
||||||
|
|
||||||
# 6. Nothing available
|
def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
logger.debug("Auxiliary text client: none available")
|
"""Resolve a specific forced provider. Returns (None, None) if creds missing."""
|
||||||
|
if forced == "openrouter":
|
||||||
|
client, model = _try_openrouter()
|
||||||
|
if client is None:
|
||||||
|
logger.warning("auxiliary.provider=openrouter but OPENROUTER_API_KEY not set")
|
||||||
|
return client, model
|
||||||
|
|
||||||
|
if forced == "nous":
|
||||||
|
client, model = _try_nous()
|
||||||
|
if client is None:
|
||||||
|
logger.warning("auxiliary.provider=nous but Nous Portal not configured (run: hermes login)")
|
||||||
|
return client, model
|
||||||
|
|
||||||
|
if forced == "main":
|
||||||
|
# "main" = skip OpenRouter/Nous, use the main chat model's credentials.
|
||||||
|
for try_fn in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider):
|
||||||
|
client, model = try_fn()
|
||||||
|
if client is not None:
|
||||||
|
return client, model
|
||||||
|
logger.warning("auxiliary.provider=main but no main endpoint credentials found")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# Unknown provider name — NOTE: despite the warning text below, this returns (None, None); callers do not retry the auto chain
|
||||||
|
logger.warning("Unknown auxiliary.provider=%r, falling back to auto", forced)
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
def get_async_text_auxiliary_client():
|
def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
|
"""Full auto-detection chain: OpenRouter → Nous → custom → Codex → API-key → None."""
|
||||||
|
for try_fn in (_try_openrouter, _try_nous, _try_custom_endpoint,
|
||||||
|
_try_codex, _resolve_api_key_provider):
|
||||||
|
client, model = try_fn()
|
||||||
|
if client is not None:
|
||||||
|
return client, model
|
||||||
|
logger.debug("Auxiliary client: none available")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public API ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
|
"""Return (client, default_model_slug) for text-only auxiliary tasks.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
task: Optional task name ("compression", "web_extract") to check
|
||||||
|
for a task-specific provider override.
|
||||||
|
|
||||||
|
Callers may override the returned model with a per-task env var
|
||||||
|
(e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
|
||||||
|
"""
|
||||||
|
forced = _get_auxiliary_provider(task)
|
||||||
|
if forced != "auto":
|
||||||
|
return _resolve_forced_provider(forced)
|
||||||
|
return _resolve_auto()
|
||||||
|
|
||||||
|
|
||||||
|
def get_async_text_auxiliary_client(task: str = ""):
|
||||||
"""Return (async_client, model_slug) for async consumers.
|
"""Return (async_client, model_slug) for async consumers.
|
||||||
|
|
||||||
For standard providers returns (AsyncOpenAI, model). For Codex returns
|
For standard providers returns (AsyncOpenAI, model). For Codex returns
|
||||||
@@ -390,7 +453,7 @@ def get_async_text_auxiliary_client():
|
|||||||
"""
|
"""
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
|
|
||||||
sync_client, model = get_text_auxiliary_client()
|
sync_client, model = get_text_auxiliary_client(task)
|
||||||
if sync_client is None:
|
if sync_client is None:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
@@ -407,30 +470,16 @@ def get_async_text_auxiliary_client():
|
|||||||
|
|
||||||
|
|
||||||
def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
|
def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
"""Return (client, model_slug) for vision/multimodal auxiliary tasks.
|
"""Return (client, default_model_slug) for vision/multimodal auxiliary tasks.
|
||||||
|
|
||||||
Only OpenRouter and Nous Portal qualify — custom endpoints cannot
|
Checks AUXILIARY_VISION_PROVIDER for a forced provider, otherwise
|
||||||
substitute for Gemini multimodal.
|
auto-detects. Callers may override the returned model with
|
||||||
|
AUXILIARY_VISION_MODEL.
|
||||||
"""
|
"""
|
||||||
# 1. OpenRouter
|
forced = _get_auxiliary_provider("vision")
|
||||||
or_key = os.getenv("OPENROUTER_API_KEY")
|
if forced != "auto":
|
||||||
if or_key:
|
return _resolve_forced_provider(forced)
|
||||||
logger.debug("Auxiliary vision client: OpenRouter")
|
return _resolve_auto()
|
||||||
return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
|
|
||||||
default_headers=_OR_HEADERS), _OPENROUTER_MODEL
|
|
||||||
|
|
||||||
# 2. Nous Portal
|
|
||||||
nous = _read_nous_auth()
|
|
||||||
if nous:
|
|
||||||
logger.debug("Auxiliary vision client: Nous Portal")
|
|
||||||
return (
|
|
||||||
OpenAI(api_key=_nous_api_key(nous), base_url=_nous_base_url()),
|
|
||||||
_NOUS_MODEL,
|
|
||||||
)
|
|
||||||
|
|
||||||
# 3. Nothing suitable
|
|
||||||
logger.debug("Auxiliary vision client: none available")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
|
|
||||||
def get_auxiliary_extra_body() -> dict:
|
def get_auxiliary_extra_body() -> dict:
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class ContextCompressor:
|
|||||||
self.last_completion_tokens = 0
|
self.last_completion_tokens = 0
|
||||||
self.last_total_tokens = 0
|
self.last_total_tokens = 0
|
||||||
|
|
||||||
self.client, default_model = get_text_auxiliary_client()
|
self.client, default_model = get_text_auxiliary_client("compression")
|
||||||
self.summary_model = summary_model_override or default_model
|
self.summary_model = summary_model_override or default_model
|
||||||
|
|
||||||
def update_from_response(self, usage: Dict[str, Any]):
|
def update_from_response(self, usage: Dict[str, Any]):
|
||||||
|
|||||||
@@ -199,9 +199,59 @@ compression:
|
|||||||
threshold: 0.85
|
threshold: 0.85
|
||||||
|
|
||||||
# Model to use for generating summaries (fast/cheap recommended)
|
# Model to use for generating summaries (fast/cheap recommended)
|
||||||
# This model compresses the middle turns into a concise summary
|
# This model compresses the middle turns into a concise summary.
|
||||||
|
# IMPORTANT: it receives the full middle section of the conversation, so it
|
||||||
|
# MUST support a context length at least as large as your main model's.
|
||||||
summary_model: "google/gemini-3-flash-preview"
|
summary_model: "google/gemini-3-flash-preview"
|
||||||
|
|
||||||
|
# Provider for the summary model (default: "auto")
|
||||||
|
# Options: "auto", "openrouter", "nous", "main"
|
||||||
|
# summary_provider: "auto"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Auxiliary Models (Advanced — Experimental)
|
||||||
|
# =============================================================================
|
||||||
|
# Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
|
||||||
|
# browser screenshot analysis, web page summarization, and context compression.
|
||||||
|
#
|
||||||
|
# By default these use Gemini Flash via OpenRouter or Nous Portal and are
|
||||||
|
# auto-detected from your credentials. You do NOT need to change anything
|
||||||
|
# here for normal usage.
|
||||||
|
#
|
||||||
|
# WARNING: Overriding these with providers other than OpenRouter or Nous Portal
|
||||||
|
# is EXPERIMENTAL and may not work. Not all models/providers support vision,
|
||||||
|
# produce usable summaries, or accept the same API format. Change at your own
|
||||||
|
# risk — if things break, reset to "auto" / empty values.
|
||||||
|
#
|
||||||
|
# Each task has its own provider + model pair so you can mix providers.
|
||||||
|
# For example: OpenRouter for vision (needs multimodal), but your main
|
||||||
|
# local endpoint for compression (just needs text).
|
||||||
|
#
|
||||||
|
# Provider options:
|
||||||
|
# "auto" - Best available: OpenRouter → Nous Portal → main endpoint (default)
|
||||||
|
# "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY)
|
||||||
|
# "nous" - Force Nous Portal (requires: hermes login)
|
||||||
|
# "main" - Use the same provider & credentials as your main chat model.
|
||||||
|
# Skips OpenRouter/Nous and uses your custom endpoint
|
||||||
|
# (OPENAI_BASE_URL), Codex OAuth, or API-key provider directly.
|
||||||
|
# Useful if you run a local model and want auxiliary tasks to
|
||||||
|
# use it too.
|
||||||
|
#
|
||||||
|
# Model: leave empty to use the provider's default. When empty, OpenRouter
|
||||||
|
# uses "google/gemini-3-flash-preview" and Nous uses "gemini-3-flash".
|
||||||
|
# Other providers pick a sensible default automatically.
|
||||||
|
#
|
||||||
|
# auxiliary:
|
||||||
|
# # Image analysis: vision_analyze tool + browser screenshots
|
||||||
|
# vision:
|
||||||
|
# provider: "auto"
|
||||||
|
# model: "" # e.g. "google/gemini-2.5-flash", "openai/gpt-4o"
|
||||||
|
#
|
||||||
|
# # Web page scraping / summarization + browser page text extraction
|
||||||
|
# web_extract:
|
||||||
|
# provider: "auto"
|
||||||
|
# model: ""
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Persistent Memory
|
# Persistent Memory
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
24
cli.py
24
cli.py
@@ -332,12 +332,36 @@ def load_cli_config() -> Dict[str, Any]:
|
|||||||
"enabled": "CONTEXT_COMPRESSION_ENABLED",
|
"enabled": "CONTEXT_COMPRESSION_ENABLED",
|
||||||
"threshold": "CONTEXT_COMPRESSION_THRESHOLD",
|
"threshold": "CONTEXT_COMPRESSION_THRESHOLD",
|
||||||
"summary_model": "CONTEXT_COMPRESSION_MODEL",
|
"summary_model": "CONTEXT_COMPRESSION_MODEL",
|
||||||
|
"summary_provider": "CONTEXT_COMPRESSION_PROVIDER",
|
||||||
}
|
}
|
||||||
|
|
||||||
for config_key, env_var in compression_env_mappings.items():
|
for config_key, env_var in compression_env_mappings.items():
|
||||||
if config_key in compression_config:
|
if config_key in compression_config:
|
||||||
os.environ[env_var] = str(compression_config[config_key])
|
os.environ[env_var] = str(compression_config[config_key])
|
||||||
|
|
||||||
|
# Apply auxiliary model overrides to environment variables.
|
||||||
|
# Vision and web_extract each have their own provider + model pair.
|
||||||
|
# (Compression is handled in the compression section above.)
|
||||||
|
# Only set env vars for non-empty / non-default values so auto-detection
|
||||||
|
# still works.
|
||||||
|
auxiliary_config = defaults.get("auxiliary", {})
|
||||||
|
auxiliary_task_env = {
|
||||||
|
# config key → (provider env var, model env var)
|
||||||
|
"vision": ("AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL"),
|
||||||
|
"web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL"),
|
||||||
|
}
|
||||||
|
|
||||||
|
for task_key, (prov_env, model_env) in auxiliary_task_env.items():
|
||||||
|
task_cfg = auxiliary_config.get(task_key, {})
|
||||||
|
if not isinstance(task_cfg, dict):
|
||||||
|
continue
|
||||||
|
prov = str(task_cfg.get("provider", "")).strip()
|
||||||
|
model = str(task_cfg.get("model", "")).strip()
|
||||||
|
if prov and prov != "auto":
|
||||||
|
os.environ[prov_env] = prov
|
||||||
|
if model:
|
||||||
|
os.environ[model_env] = model
|
||||||
|
|
||||||
return defaults
|
return defaults
|
||||||
|
|
||||||
# Load configuration at module startup
|
# Load configuration at module startup
|
||||||
|
|||||||
@@ -87,6 +87,20 @@ DEFAULT_CONFIG = {
|
|||||||
"enabled": True,
|
"enabled": True,
|
||||||
"threshold": 0.85,
|
"threshold": 0.85,
|
||||||
"summary_model": "google/gemini-3-flash-preview",
|
"summary_model": "google/gemini-3-flash-preview",
|
||||||
|
"summary_provider": "auto",
|
||||||
|
},
|
||||||
|
|
||||||
|
# Auxiliary model overrides (advanced). By default Hermes auto-selects
|
||||||
|
# the provider and model for each side task. Set these to override.
|
||||||
|
"auxiliary": {
|
||||||
|
"vision": {
|
||||||
|
"provider": "auto", # auto | openrouter | nous | main
|
||||||
|
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
|
||||||
|
},
|
||||||
|
"web_extract": {
|
||||||
|
"provider": "auto",
|
||||||
|
"model": "",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
"display": {
|
"display": {
|
||||||
@@ -890,6 +904,31 @@ def show_config():
|
|||||||
if enabled:
|
if enabled:
|
||||||
print(f" Threshold: {compression.get('threshold', 0.85) * 100:.0f}%")
|
print(f" Threshold: {compression.get('threshold', 0.85) * 100:.0f}%")
|
||||||
print(f" Model: {compression.get('summary_model', 'google/gemini-3-flash-preview')}")
|
print(f" Model: {compression.get('summary_model', 'google/gemini-3-flash-preview')}")
|
||||||
|
comp_provider = compression.get('summary_provider', 'auto')
|
||||||
|
if comp_provider != 'auto':
|
||||||
|
print(f" Provider: {comp_provider}")
|
||||||
|
|
||||||
|
# Auxiliary models
|
||||||
|
auxiliary = config.get('auxiliary', {})
|
||||||
|
aux_tasks = {
|
||||||
|
"Vision": auxiliary.get('vision', {}),
|
||||||
|
"Web extract": auxiliary.get('web_extract', {}),
|
||||||
|
}
|
||||||
|
has_overrides = any(
|
||||||
|
t.get('provider', 'auto') != 'auto' or t.get('model', '')
|
||||||
|
for t in aux_tasks.values()
|
||||||
|
)
|
||||||
|
if has_overrides:
|
||||||
|
print()
|
||||||
|
print(color("◆ Auxiliary Models (overrides)", Colors.CYAN, Colors.BOLD))
|
||||||
|
for label, task_cfg in aux_tasks.items():
|
||||||
|
prov = task_cfg.get('provider', 'auto')
|
||||||
|
mdl = task_cfg.get('model', '')
|
||||||
|
if prov != 'auto' or mdl:
|
||||||
|
parts = [f"provider={prov}"]
|
||||||
|
if mdl:
|
||||||
|
parts.append(f"model={mdl}")
|
||||||
|
print(f" {label:12s} {', '.join(parts)}")
|
||||||
|
|
||||||
# Messaging
|
# Messaging
|
||||||
print()
|
print()
|
||||||
|
|||||||
@@ -151,10 +151,10 @@ class TestGetTextAuxiliaryClient:
|
|||||||
assert model is None
|
assert model is None
|
||||||
|
|
||||||
|
|
||||||
class TestCodexNotInVisionClient:
|
class TestVisionClientFallback:
|
||||||
"""Codex fallback should NOT apply to vision tasks."""
|
"""Vision client uses the same full fallback chain as text."""
|
||||||
|
|
||||||
def test_vision_returns_none_without_openrouter_nous(self):
|
def test_vision_returns_none_without_any_credentials(self):
|
||||||
with patch("agent.auxiliary_client._read_nous_auth", return_value=None):
|
with patch("agent.auxiliary_client._read_nous_auth", return_value=None):
|
||||||
client, model = get_vision_auxiliary_client()
|
client, model = get_vision_auxiliary_client()
|
||||||
assert client is None
|
assert client is None
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ import time
|
|||||||
import requests
|
import requests
|
||||||
from typing import Dict, Any, Optional, List
|
from typing import Dict, Any, Optional, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from agent.auxiliary_client import get_vision_auxiliary_client
|
from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -80,8 +80,28 @@ DEFAULT_SESSION_TIMEOUT = 300
|
|||||||
# Max tokens for snapshot content before summarization
|
# Max tokens for snapshot content before summarization
|
||||||
SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
|
SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
|
||||||
|
|
||||||
# Resolve vision auxiliary client for extraction/vision tasks
|
# Vision client — for browser_vision (screenshot analysis)
|
||||||
_aux_vision_client, EXTRACTION_MODEL = get_vision_auxiliary_client()
|
_aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
|
||||||
|
|
||||||
|
# Text client — for page snapshot summarization (same config as web_extract)
|
||||||
|
_aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract")
|
||||||
|
|
||||||
|
# Module-level alias for availability checks
|
||||||
|
EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL
|
||||||
|
|
||||||
|
|
||||||
|
def _get_vision_model() -> str:
|
||||||
|
"""Model for browser_vision (screenshot analysis — multimodal)."""
|
||||||
|
return (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
|
||||||
|
or _DEFAULT_VISION_MODEL
|
||||||
|
or "google/gemini-3-flash-preview")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_extraction_model() -> str:
|
||||||
|
"""Model for page snapshot text summarization — same as web_extract."""
|
||||||
|
return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
|
||||||
|
or _DEFAULT_TEXT_MODEL
|
||||||
|
or "google/gemini-3-flash-preview")
|
||||||
|
|
||||||
|
|
||||||
def _is_local_mode() -> bool:
|
def _is_local_mode() -> bool:
|
||||||
@@ -850,9 +870,9 @@ def _extract_relevant_content(
|
|||||||
) -> str:
|
) -> str:
|
||||||
"""Use LLM to extract relevant content from a snapshot based on the user's task.
|
"""Use LLM to extract relevant content from a snapshot based on the user's task.
|
||||||
|
|
||||||
Falls back to simple truncation when no auxiliary vision model is configured.
|
Falls back to simple truncation when no auxiliary text model is configured.
|
||||||
"""
|
"""
|
||||||
if _aux_vision_client is None or EXTRACTION_MODEL is None:
|
if _aux_text_client is None:
|
||||||
return _truncate_snapshot(snapshot_text)
|
return _truncate_snapshot(snapshot_text)
|
||||||
|
|
||||||
if user_task:
|
if user_task:
|
||||||
@@ -880,8 +900,8 @@ def _extract_relevant_content(
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
from agent.auxiliary_client import auxiliary_max_tokens_param
|
from agent.auxiliary_client import auxiliary_max_tokens_param
|
||||||
response = _aux_vision_client.chat.completions.create(
|
response = _aux_text_client.chat.completions.create(
|
||||||
model=EXTRACTION_MODEL,
|
model=_get_extraction_model(),
|
||||||
messages=[{"role": "user", "content": extraction_prompt}],
|
messages=[{"role": "user", "content": extraction_prompt}],
|
||||||
**auxiliary_max_tokens_param(4000),
|
**auxiliary_max_tokens_param(4000),
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
@@ -1304,7 +1324,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||||||
effective_task_id = task_id or "default"
|
effective_task_id = task_id or "default"
|
||||||
|
|
||||||
# Check auxiliary vision client
|
# Check auxiliary vision client
|
||||||
if _aux_vision_client is None or EXTRACTION_MODEL is None:
|
if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None:
|
||||||
return json.dumps({
|
return json.dumps({
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": "Browser vision unavailable: no auxiliary vision model configured. "
|
"error": "Browser vision unavailable: no auxiliary vision model configured. "
|
||||||
@@ -1354,7 +1374,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||||||
# Use the sync auxiliary vision client directly
|
# Use the sync auxiliary vision client directly
|
||||||
from agent.auxiliary_client import auxiliary_max_tokens_param
|
from agent.auxiliary_client import auxiliary_max_tokens_param
|
||||||
response = _aux_vision_client.chat.completions.create(
|
response = _aux_vision_client.chat.completions.create(
|
||||||
model=EXTRACTION_MODEL,
|
model=_get_vision_model(),
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
|
|||||||
@@ -468,7 +468,9 @@ def _handle_vision_analyze(args, **kw):
|
|||||||
image_url = args.get("image_url", "")
|
image_url = args.get("image_url", "")
|
||||||
question = args.get("question", "")
|
question = args.get("question", "")
|
||||||
full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
|
full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
|
||||||
model = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview"
|
model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
|
||||||
|
or DEFAULT_VISION_MODEL
|
||||||
|
or "google/gemini-3-flash-preview")
|
||||||
return vision_analyze_tool(image_url, full_prompt, model)
|
return vision_analyze_tool(image_url, full_prompt, model)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -85,7 +85,13 @@ DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
|
|||||||
|
|
||||||
# Resolve async auxiliary client at module level.
|
# Resolve async auxiliary client at module level.
|
||||||
# Handles Codex Responses API adapter transparently.
|
# Handles Codex Responses API adapter transparently.
|
||||||
_aux_async_client, DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client()
|
_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract")
|
||||||
|
|
||||||
|
# Allow per-task override via config.yaml auxiliary.web_extract.model (exported as AUXILIARY_WEB_EXTRACT_MODEL)
|
||||||
|
DEFAULT_SUMMARIZER_MODEL = (
|
||||||
|
os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
|
||||||
|
or _DEFAULT_SUMMARIZER_MODEL
|
||||||
|
)
|
||||||
|
|
||||||
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
|
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user