mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-16 15:11:18 +08:00
Compare commits
3 Commits
fix/matrix
...
fix/browse
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8cd4a96686 | ||
|
|
4433b83378 | ||
|
|
6405d389aa |
@@ -65,10 +65,15 @@ OPENCODE_GO_API_KEY=
|
||||
# TOOL API KEYS
|
||||
# =============================================================================
|
||||
|
||||
# Parallel API Key - AI-native web search and extract
|
||||
# Get at: https://parallel.ai
|
||||
PARALLEL_API_KEY=
|
||||
|
||||
# Firecrawl API Key - Web search, extract, and crawl
|
||||
# Get at: https://firecrawl.dev/
|
||||
FIRECRAWL_API_KEY=
|
||||
|
||||
|
||||
# FAL.ai API Key - Image generation
|
||||
# Get at: https://fal.ai/
|
||||
FAL_KEY=
|
||||
|
||||
@@ -44,7 +44,7 @@ hermes-agent/
|
||||
│ ├── terminal_tool.py # Terminal orchestration
|
||||
│ ├── process_registry.py # Background process management
|
||||
│ ├── file_tools.py # File read/write/search/patch
|
||||
│ ├── web_tools.py # Firecrawl search/extract
|
||||
│ ├── web_tools.py # Web search/extract (Parallel + Firecrawl)
|
||||
│ ├── browser_tool.py # Browserbase browser automation
|
||||
│ ├── code_execution_tool.py # execute_code sandbox
|
||||
│ ├── delegate_tool.py # Subagent delegation
|
||||
|
||||
@@ -147,7 +147,7 @@ hermes-agent/
|
||||
│ ├── approval.py # Dangerous command detection + per-session approval
|
||||
│ ├── terminal_tool.py # Terminal orchestration (sudo, env lifecycle, backends)
|
||||
│ ├── file_operations.py # read_file, write_file, search, patch, etc.
|
||||
│ ├── web_tools.py # web_search, web_extract (Firecrawl + Gemini summarization)
|
||||
│ ├── web_tools.py # web_search, web_extract (Parallel/Firecrawl + Gemini summarization)
|
||||
│ ├── vision_tools.py # Image analysis via multimodal models
|
||||
│ ├── delegate_tool.py # Subagent spawning and parallel task execution
|
||||
│ ├── code_execution_tool.py # Sandboxed Python with RPC tool access
|
||||
|
||||
@@ -550,6 +550,14 @@ OPTIONAL_ENV_VARS = {
|
||||
},
|
||||
|
||||
# ── Tool API keys ──
|
||||
"PARALLEL_API_KEY": {
|
||||
"description": "Parallel API key for AI-native web search and extract",
|
||||
"prompt": "Parallel API key",
|
||||
"url": "https://parallel.ai/",
|
||||
"tools": ["web_search", "web_extract"],
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"FIRECRAWL_API_KEY": {
|
||||
"description": "Firecrawl API key for web search and scraping",
|
||||
"prompt": "Firecrawl API key",
|
||||
@@ -1506,6 +1514,7 @@ def show_config():
|
||||
keys = [
|
||||
("OPENROUTER_API_KEY", "OpenRouter"),
|
||||
("VOICE_TOOLS_OPENAI_KEY", "OpenAI (STT/TTS)"),
|
||||
("PARALLEL_API_KEY", "Parallel"),
|
||||
("FIRECRAWL_API_KEY", "Firecrawl"),
|
||||
("BROWSERBASE_API_KEY", "Browserbase"),
|
||||
("BROWSER_USE_API_KEY", "Browser Use"),
|
||||
@@ -1655,7 +1664,7 @@ def set_config_value(key: str, value: str):
|
||||
# Check if it's an API key (goes to .env)
|
||||
api_keys = [
|
||||
'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY',
|
||||
'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY',
|
||||
'PARALLEL_API_KEY', 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY',
|
||||
'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN',
|
||||
'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY',
|
||||
'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN',
|
||||
|
||||
@@ -444,11 +444,11 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
else:
|
||||
tool_status.append(("Mixture of Agents", False, "OPENROUTER_API_KEY"))
|
||||
|
||||
# Firecrawl (web tools)
|
||||
if get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"):
|
||||
# Web tools (Parallel or Firecrawl)
|
||||
if get_env_value("PARALLEL_API_KEY") or get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"):
|
||||
tool_status.append(("Web Search & Extract", True, None))
|
||||
else:
|
||||
tool_status.append(("Web Search & Extract", False, "FIRECRAWL_API_KEY"))
|
||||
tool_status.append(("Web Search & Extract", False, "PARALLEL_API_KEY or FIRECRAWL_API_KEY"))
|
||||
|
||||
# Browser tools (local Chromium or Browserbase cloud)
|
||||
import shutil
|
||||
|
||||
@@ -151,19 +151,29 @@ TOOL_CATEGORIES = {
|
||||
"web": {
|
||||
"name": "Web Search & Extract",
|
||||
"setup_title": "Select Search Provider",
|
||||
"setup_note": "A free DuckDuckGo search skill is also included — skip this if you don't need Firecrawl.",
|
||||
"setup_note": "A free DuckDuckGo search skill is also included — skip this if you don't need a premium provider.",
|
||||
"icon": "🔍",
|
||||
"providers": [
|
||||
{
|
||||
"name": "Firecrawl Cloud",
|
||||
"tag": "Recommended - hosted service",
|
||||
"tag": "Hosted service - search, extract, and crawl",
|
||||
"web_backend": "firecrawl",
|
||||
"env_vars": [
|
||||
{"key": "FIRECRAWL_API_KEY", "prompt": "Firecrawl API key", "url": "https://firecrawl.dev"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Parallel",
|
||||
"tag": "AI-native search and extract",
|
||||
"web_backend": "parallel",
|
||||
"env_vars": [
|
||||
{"key": "PARALLEL_API_KEY", "prompt": "Parallel API key", "url": "https://parallel.ai"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Firecrawl Self-Hosted",
|
||||
"tag": "Free - run your own instance",
|
||||
"web_backend": "firecrawl",
|
||||
"env_vars": [
|
||||
{"key": "FIRECRAWL_API_URL", "prompt": "Your Firecrawl instance URL (e.g., http://localhost:3002)"},
|
||||
],
|
||||
@@ -618,6 +628,9 @@ def _is_provider_active(provider: dict, config: dict) -> bool:
|
||||
if "browser_provider" in provider:
|
||||
current = config.get("browser", {}).get("cloud_provider")
|
||||
return provider["browser_provider"] == current
|
||||
if provider.get("web_backend"):
|
||||
current = config.get("web", {}).get("backend")
|
||||
return current == provider["web_backend"]
|
||||
return False
|
||||
|
||||
|
||||
@@ -650,6 +663,11 @@ def _configure_provider(provider: dict, config: dict):
|
||||
else:
|
||||
config.get("browser", {}).pop("cloud_provider", None)
|
||||
|
||||
# Set web search backend in config if applicable
|
||||
if provider.get("web_backend"):
|
||||
config.setdefault("web", {})["backend"] = provider["web_backend"]
|
||||
_print_success(f" Web backend set to: {provider['web_backend']}")
|
||||
|
||||
if not env_vars:
|
||||
_print_success(f" {provider['name']} - no configuration needed!")
|
||||
return
|
||||
|
||||
@@ -27,6 +27,7 @@ dependencies = [
|
||||
"prompt_toolkit",
|
||||
# Tools
|
||||
"firecrawl-py",
|
||||
"parallel-web>=0.4.2",
|
||||
"fal-client",
|
||||
# Text-to-speech (Edge TTS is free, no API key needed)
|
||||
"edge-tts",
|
||||
|
||||
@@ -18,6 +18,7 @@ PyJWT[crypto]
|
||||
|
||||
# Web tools
|
||||
firecrawl-py
|
||||
parallel-web>=0.4.2
|
||||
|
||||
# Image generation
|
||||
fal-client
|
||||
|
||||
@@ -112,9 +112,14 @@ class TestDefaultContextLengths:
|
||||
|
||||
def test_gpt4_models_128k(self):
|
||||
for key, value in DEFAULT_CONTEXT_LENGTHS.items():
|
||||
if "gpt-4" in key:
|
||||
if "gpt-4" in key and "gpt-4.1" not in key:
|
||||
assert value == 128000, f"{key} should be 128000"
|
||||
|
||||
def test_gpt41_models_1m(self):
|
||||
for key, value in DEFAULT_CONTEXT_LENGTHS.items():
|
||||
if "gpt-4.1" in key:
|
||||
assert value == 1047576, f"{key} should be 1047576"
|
||||
|
||||
def test_gemini_models_1m(self):
|
||||
for key, value in DEFAULT_CONTEXT_LENGTHS.items():
|
||||
if "gemini" in key:
|
||||
|
||||
@@ -5,6 +5,13 @@ from hermes_cli.config import load_config, save_config
|
||||
from hermes_cli.setup import setup_model_provider
|
||||
|
||||
|
||||
def _maybe_keep_current_tts(question, choices):
|
||||
if question != "Select TTS provider:":
|
||||
return None
|
||||
assert choices[-1].startswith("Keep current (")
|
||||
return len(choices) - 1
|
||||
|
||||
|
||||
def _clear_provider_env(monkeypatch):
|
||||
for key in (
|
||||
"NOUS_API_KEY",
|
||||
@@ -25,16 +32,22 @@ def test_nous_oauth_setup_keeps_current_model_when_syncing_disk_provider(
|
||||
|
||||
config = load_config()
|
||||
|
||||
# Provider selection always comes first. Depending on available vision
|
||||
# backends, setup may either skip the optional vision step or prompt for
|
||||
# it before the default-model choice. Provide enough selections for both
|
||||
# paths while still ending on "keep current model".
|
||||
prompt_choices = iter([0, 2, 2])
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.setup.prompt_choice",
|
||||
lambda *args, **kwargs: next(prompt_choices),
|
||||
)
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
if question == "Select your inference provider:":
|
||||
return 0
|
||||
if question == "Configure vision:":
|
||||
return len(choices) - 1
|
||||
if question == "Select default model:":
|
||||
assert choices[-1] == "Keep current (anthropic/claude-opus-4.6)"
|
||||
return len(choices) - 1
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: "")
|
||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||
|
||||
def _fake_login_nous(*args, **kwargs):
|
||||
auth_path = tmp_path / "auth.json"
|
||||
@@ -74,20 +87,29 @@ def test_custom_setup_clears_active_oauth_provider(tmp_path, monkeypatch):
|
||||
|
||||
config = load_config()
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: 3)
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
if question == "Select your inference provider:":
|
||||
return 3
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
|
||||
prompt_values = iter(
|
||||
[
|
||||
"https://custom.example/v1",
|
||||
"custom-api-key",
|
||||
"custom/model",
|
||||
"",
|
||||
]
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.setup.prompt",
|
||||
lambda *args, **kwargs: next(prompt_values),
|
||||
)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_yes_no", lambda *args, **kwargs: False)
|
||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||
|
||||
setup_model_provider(config)
|
||||
save_config(config)
|
||||
@@ -109,11 +131,17 @@ def test_codex_setup_uses_runtime_access_token_for_live_model_list(tmp_path, mon
|
||||
|
||||
config = load_config()
|
||||
|
||||
prompt_choices = iter([1, 0])
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.setup.prompt_choice",
|
||||
lambda *args, **kwargs: next(prompt_choices),
|
||||
)
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
if question == "Select your inference provider:":
|
||||
return 1
|
||||
if question == "Select default model:":
|
||||
return 0
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: "")
|
||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||
monkeypatch.setattr("hermes_cli.auth._login_openai_codex", lambda *args, **kwargs: None)
|
||||
|
||||
@@ -6,6 +6,13 @@ from hermes_cli.config import load_config, save_config, save_env_value
|
||||
from hermes_cli.setup import _print_setup_summary, setup_model_provider
|
||||
|
||||
|
||||
def _maybe_keep_current_tts(question, choices):
|
||||
if question != "Select TTS provider:":
|
||||
return None
|
||||
assert choices[-1].startswith("Keep current (")
|
||||
return len(choices) - 1
|
||||
|
||||
|
||||
def _read_env(home):
|
||||
env_path = home / ".env"
|
||||
data = {}
|
||||
@@ -50,13 +57,13 @@ def test_setup_keep_current_custom_from_config_does_not_fall_through(tmp_path, m
|
||||
}
|
||||
save_config(config)
|
||||
|
||||
calls = {"count": 0}
|
||||
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
calls["count"] += 1
|
||||
if calls["count"] == 1:
|
||||
if question == "Select your inference provider:":
|
||||
assert choices[-1] == "Keep current (Custom: https://example.invalid/v1)"
|
||||
return len(choices) - 1
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError("Model menu should not appear for keep-current custom")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
@@ -72,7 +79,6 @@ def test_setup_keep_current_custom_from_config_does_not_fall_through(tmp_path, m
|
||||
assert reloaded["model"]["provider"] == "custom"
|
||||
assert reloaded["model"]["default"] == "custom/model"
|
||||
assert reloaded["model"]["base_url"] == "https://example.invalid/v1"
|
||||
assert calls["count"] == 1
|
||||
|
||||
|
||||
def test_setup_custom_endpoint_saves_working_v1_base_url(tmp_path, monkeypatch):
|
||||
@@ -86,6 +92,9 @@ def test_setup_custom_endpoint_saves_working_v1_base_url(tmp_path, monkeypatch):
|
||||
return 3 # Custom endpoint
|
||||
if question == "Configure vision:":
|
||||
return len(choices) - 1 # Skip
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
def fake_prompt(message, current=None, **kwargs):
|
||||
@@ -140,22 +149,23 @@ def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tm
|
||||
save_config(config)
|
||||
|
||||
captured = {"provider_choices": None, "model_choices": None}
|
||||
calls = {"count": 0}
|
||||
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
calls["count"] += 1
|
||||
if calls["count"] == 1:
|
||||
if question == "Select your inference provider:":
|
||||
captured["provider_choices"] = list(choices)
|
||||
assert choices[-1] == "Keep current (Anthropic)"
|
||||
return len(choices) - 1
|
||||
if calls["count"] == 2:
|
||||
if question == "Configure vision:":
|
||||
assert question == "Configure vision:"
|
||||
assert choices[-1] == "Skip for now"
|
||||
return len(choices) - 1
|
||||
if calls["count"] == 3:
|
||||
if question == "Select default model:":
|
||||
captured["model_choices"] = list(choices)
|
||||
return len(choices) - 1 # keep current model
|
||||
raise AssertionError("Unexpected extra prompt_choice call")
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: "")
|
||||
@@ -172,7 +182,6 @@ def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tm
|
||||
assert captured["model_choices"] is not None
|
||||
assert captured["model_choices"][0] == "claude-opus-4-6"
|
||||
assert "anthropic/claude-opus-4.6 (recommended)" not in captured["model_choices"]
|
||||
assert calls["count"] == 3
|
||||
|
||||
|
||||
def test_setup_keep_current_anthropic_can_configure_openai_vision_default(tmp_path, monkeypatch):
|
||||
@@ -186,14 +195,24 @@ def test_setup_keep_current_anthropic_can_configure_openai_vision_default(tmp_pa
|
||||
}
|
||||
save_config(config)
|
||||
|
||||
picks = iter([
|
||||
10, # keep current provider (shifted +1 by kilocode insertion)
|
||||
1, # configure vision with OpenAI
|
||||
5, # use default gpt-4o-mini vision model
|
||||
4, # keep current Anthropic model
|
||||
])
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
if question == "Select your inference provider:":
|
||||
assert choices[-1] == "Keep current (Anthropic)"
|
||||
return len(choices) - 1
|
||||
if question == "Configure vision:":
|
||||
return 1
|
||||
if question == "Select vision model:":
|
||||
assert choices[-1] == "Use default (gpt-4o-mini)"
|
||||
return len(choices) - 1
|
||||
if question == "Select default model:":
|
||||
assert choices[-1] == "Keep current (claude-opus-4-6)"
|
||||
return len(choices) - 1
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: next(picks))
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.setup.prompt",
|
||||
lambda message, *args, **kwargs: "sk-openai" if "OpenAI API key" in message else "",
|
||||
@@ -229,8 +248,17 @@ def test_setup_switch_custom_to_codex_clears_custom_endpoint_and_updates_config(
|
||||
}
|
||||
save_config(config)
|
||||
|
||||
picks = iter([1, 0])
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: next(picks))
|
||||
def fake_prompt_choice(question, choices, default=0):
|
||||
if question == "Select your inference provider:":
|
||||
return 1
|
||||
if question == "Select default model:":
|
||||
return 0
|
||||
tts_idx = _maybe_keep_current_tts(question, choices)
|
||||
if tts_idx is not None:
|
||||
return tts_idx
|
||||
raise AssertionError(f"Unexpected prompt_choice call: {question}")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt", lambda *args, **kwargs: "")
|
||||
monkeypatch.setattr("hermes_cli.setup.prompt_yes_no", lambda *args, **kwargs: False)
|
||||
monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
|
||||
|
||||
@@ -63,11 +63,13 @@ class TestFromEnv:
|
||||
|
||||
class TestFromGlobalConfig:
|
||||
def test_missing_config_falls_back_to_env(self, tmp_path):
|
||||
config = HonchoClientConfig.from_global_config(
|
||||
config_path=tmp_path / "nonexistent.json"
|
||||
)
|
||||
with patch.dict(os.environ, {}, clear=True):
|
||||
config = HonchoClientConfig.from_global_config(
|
||||
config_path=tmp_path / "nonexistent.json"
|
||||
)
|
||||
# Should fall back to from_env
|
||||
assert config.enabled is True or config.api_key is None # depends on env
|
||||
assert config.enabled is False
|
||||
assert config.api_key is None
|
||||
|
||||
def test_reads_full_config(self, tmp_path):
|
||||
config_file = tmp_path / "config.json"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Comprehensive Test Suite for Web Tools Module
|
||||
|
||||
This script tests all web tools functionality to ensure they work correctly.
|
||||
Run this after any updates to the web_tools.py module or Firecrawl library.
|
||||
Run this after any updates to the web_tools.py module or backend libraries.
|
||||
|
||||
Usage:
|
||||
python test_web_tools.py # Run all tests
|
||||
@@ -11,7 +11,7 @@ Usage:
|
||||
python test_web_tools.py --verbose # Show detailed output
|
||||
|
||||
Requirements:
|
||||
- FIRECRAWL_API_KEY environment variable must be set
|
||||
- PARALLEL_API_KEY or FIRECRAWL_API_KEY environment variable must be set
|
||||
- An auxiliary LLM provider (OPENROUTER_API_KEY or Nous Portal auth) (optional, for LLM tests)
|
||||
"""
|
||||
|
||||
@@ -28,12 +28,14 @@ from typing import List
|
||||
|
||||
# Import the web tools to test (updated path after moving tools/)
|
||||
from tools.web_tools import (
|
||||
web_search_tool,
|
||||
web_extract_tool,
|
||||
web_search_tool,
|
||||
web_extract_tool,
|
||||
web_crawl_tool,
|
||||
check_firecrawl_api_key,
|
||||
check_web_api_key,
|
||||
check_auxiliary_model,
|
||||
get_debug_session_info
|
||||
get_debug_session_info,
|
||||
_get_backend,
|
||||
)
|
||||
|
||||
|
||||
@@ -121,12 +123,13 @@ class WebToolsTester:
|
||||
"""Test environment setup and API keys"""
|
||||
print_section("Environment Check")
|
||||
|
||||
# Check Firecrawl API key
|
||||
if not check_firecrawl_api_key():
|
||||
self.log_result("Firecrawl API Key", "failed", "FIRECRAWL_API_KEY not set")
|
||||
# Check web backend API key (Parallel or Firecrawl)
|
||||
if not check_web_api_key():
|
||||
self.log_result("Web Backend API Key", "failed", "PARALLEL_API_KEY or FIRECRAWL_API_KEY not set")
|
||||
return False
|
||||
else:
|
||||
self.log_result("Firecrawl API Key", "passed", "Found")
|
||||
backend = _get_backend()
|
||||
self.log_result("Web Backend API Key", "passed", f"Using {backend} backend")
|
||||
|
||||
# Check auxiliary LLM provider (optional)
|
||||
if not check_auxiliary_model():
|
||||
@@ -578,7 +581,9 @@ class WebToolsTester:
|
||||
},
|
||||
"results": self.test_results,
|
||||
"environment": {
|
||||
"web_backend": _get_backend() if check_web_api_key() else None,
|
||||
"firecrawl_api_key": check_firecrawl_api_key(),
|
||||
"parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")),
|
||||
"auxiliary_model": check_auxiliary_model(),
|
||||
"debug_mode": get_debug_session_info()["enabled"]
|
||||
}
|
||||
|
||||
@@ -98,11 +98,14 @@ class TestProviderRegistry:
|
||||
# =============================================================================
|
||||
|
||||
PROVIDER_ENV_VARS = (
|
||||
"OPENROUTER_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY",
|
||||
"OPENROUTER_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN",
|
||||
"CLAUDE_CODE_OAUTH_TOKEN",
|
||||
"GLM_API_KEY", "ZAI_API_KEY", "Z_AI_API_KEY",
|
||||
"KIMI_API_KEY", "KIMI_BASE_URL", "MINIMAX_API_KEY", "MINIMAX_CN_API_KEY",
|
||||
"AI_GATEWAY_API_KEY", "AI_GATEWAY_BASE_URL",
|
||||
"KILOCODE_API_KEY", "KILOCODE_BASE_URL",
|
||||
"DASHSCOPE_API_KEY", "OPENCODE_ZEN_API_KEY", "OPENCODE_GO_API_KEY",
|
||||
"NOUS_API_KEY",
|
||||
"OPENAI_BASE_URL",
|
||||
)
|
||||
|
||||
@@ -111,6 +114,7 @@ PROVIDER_ENV_VARS = (
|
||||
def _clear_provider_env(monkeypatch):
|
||||
for key in PROVIDER_ENV_VARS:
|
||||
monkeypatch.delenv(key, raising=False)
|
||||
monkeypatch.setattr("hermes_cli.auth._load_auth_store", lambda: {})
|
||||
|
||||
|
||||
class TestResolveProvider:
|
||||
|
||||
@@ -2596,17 +2596,19 @@ class TestMCPSelectiveToolLoading:
|
||||
|
||||
async def run():
|
||||
with patch("tools.mcp_tool._connect_server", side_effect=fake_connect), \
|
||||
patch.dict("tools.mcp_tool._servers", {}, clear=True), \
|
||||
patch("tools.registry.registry", mock_registry), \
|
||||
patch("toolsets.create_custom_toolset"):
|
||||
return await _discover_and_register_server(
|
||||
registered = await _discover_and_register_server(
|
||||
"ink_existing",
|
||||
{"url": "https://mcp.example.com", "tools": {"include": ["create_service"]}},
|
||||
)
|
||||
return registered, _existing_tool_names()
|
||||
|
||||
try:
|
||||
registered = asyncio.run(run())
|
||||
registered, existing = asyncio.run(run())
|
||||
assert registered == ["mcp_ink_existing_create_service"]
|
||||
assert _existing_tool_names() == ["mcp_ink_existing_create_service"]
|
||||
assert existing == ["mcp_ink_existing_create_service"]
|
||||
finally:
|
||||
_servers.pop("ink_existing", None)
|
||||
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
"""Tests for Firecrawl client configuration and singleton behavior.
|
||||
"""Tests for web backend client configuration and singleton behavior.
|
||||
|
||||
Coverage:
|
||||
_get_firecrawl_client() — configuration matrix, singleton caching,
|
||||
constructor failure recovery, return value verification, edge cases.
|
||||
_get_backend() — backend selection logic with env var combinations.
|
||||
_get_parallel_client() — Parallel client configuration, singleton caching.
|
||||
check_web_api_key() — unified availability check.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -117,3 +120,157 @@ class TestFirecrawlClientConfig:
|
||||
from tools.web_tools import _get_firecrawl_client
|
||||
with pytest.raises(ValueError):
|
||||
_get_firecrawl_client()
|
||||
|
||||
|
||||
class TestBackendSelection:
|
||||
"""Test suite for _get_backend() backend selection logic.
|
||||
|
||||
The backend is configured via config.yaml (web.backend), set by
|
||||
``hermes tools``. Falls back to key-based detection for legacy/manual
|
||||
setups.
|
||||
"""
|
||||
|
||||
_ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL")
|
||||
|
||||
def setup_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
def teardown_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
# ── Config-based selection (web.backend in config.yaml) ───────────
|
||||
|
||||
def test_config_parallel(self):
|
||||
"""web.backend=parallel in config → 'parallel' regardless of keys."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "parallel"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
def test_config_firecrawl(self):
|
||||
"""web.backend=firecrawl in config → 'firecrawl' even if Parallel key set."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "firecrawl"}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_config_case_insensitive(self):
|
||||
"""web.backend=Parallel (mixed case) → 'parallel'."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "Parallel"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
# ── Fallback (no web.backend in config) ───────────────────────────
|
||||
|
||||
def test_fallback_parallel_only_key(self):
|
||||
"""Only PARALLEL_API_KEY set → 'parallel'."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
def test_fallback_both_keys_defaults_to_firecrawl(self):
|
||||
"""Both keys set, no config → 'firecrawl' (backward compat)."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key", "FIRECRAWL_API_KEY": "fc-test"}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_fallback_firecrawl_only_key(self):
|
||||
"""Only FIRECRAWL_API_KEY set → 'firecrawl'."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}), \
|
||||
patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_fallback_no_keys_defaults_to_firecrawl(self):
|
||||
"""No keys, no config → 'firecrawl' (will fail at client init)."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_invalid_config_falls_through_to_fallback(self):
|
||||
"""web.backend=invalid → ignored, uses key-based fallback."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "tavily"}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
|
||||
class TestParallelClientConfig:
|
||||
"""Test suite for Parallel client initialization."""
|
||||
|
||||
def setup_method(self):
|
||||
import tools.web_tools
|
||||
tools.web_tools._parallel_client = None
|
||||
os.environ.pop("PARALLEL_API_KEY", None)
|
||||
|
||||
def teardown_method(self):
|
||||
import tools.web_tools
|
||||
tools.web_tools._parallel_client = None
|
||||
os.environ.pop("PARALLEL_API_KEY", None)
|
||||
|
||||
def test_creates_client_with_key(self):
|
||||
"""PARALLEL_API_KEY set → creates Parallel client."""
|
||||
with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
from tools.web_tools import _get_parallel_client
|
||||
from parallel import Parallel
|
||||
client = _get_parallel_client()
|
||||
assert client is not None
|
||||
assert isinstance(client, Parallel)
|
||||
|
||||
def test_no_key_raises_with_helpful_message(self):
|
||||
"""No PARALLEL_API_KEY → ValueError with guidance."""
|
||||
from tools.web_tools import _get_parallel_client
|
||||
with pytest.raises(ValueError, match="PARALLEL_API_KEY"):
|
||||
_get_parallel_client()
|
||||
|
||||
def test_singleton_returns_same_instance(self):
|
||||
"""Second call returns cached client."""
|
||||
with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
from tools.web_tools import _get_parallel_client
|
||||
client1 = _get_parallel_client()
|
||||
client2 = _get_parallel_client()
|
||||
assert client1 is client2
|
||||
|
||||
|
||||
class TestCheckWebApiKey:
|
||||
"""Test suite for check_web_api_key() unified availability check."""
|
||||
|
||||
_ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL")
|
||||
|
||||
def setup_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
def teardown_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
def test_parallel_key_only(self):
|
||||
with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
def test_firecrawl_key_only(self):
|
||||
with patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
def test_firecrawl_url_only(self):
|
||||
with patch.dict(os.environ, {"FIRECRAWL_API_URL": "http://localhost:3002"}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
def test_no_keys_returns_false(self):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is False
|
||||
|
||||
def test_both_keys_returns_true(self):
|
||||
with patch.dict(os.environ, {
|
||||
"PARALLEL_API_KEY": "test-key",
|
||||
"FIRECRAWL_API_KEY": "fc-test",
|
||||
}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
@@ -426,6 +426,8 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch):
|
||||
async def test_web_crawl_short_circuits_blocked_url(monkeypatch):
|
||||
from tools import web_tools
|
||||
|
||||
# web_crawl_tool checks for Firecrawl env before website policy
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
monkeypatch.setattr(
|
||||
web_tools,
|
||||
"check_website_access",
|
||||
@@ -453,6 +455,9 @@ async def test_web_crawl_short_circuits_blocked_url(monkeypatch):
|
||||
async def test_web_crawl_blocks_redirected_final_url(monkeypatch):
|
||||
from tools import web_tools
|
||||
|
||||
# web_crawl_tool checks for Firecrawl env before website policy
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
|
||||
def fake_check(url):
|
||||
if url == "https://allowed.test":
|
||||
return None
|
||||
|
||||
@@ -555,6 +555,11 @@ def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]:
|
||||
session_info = provider.create_session(task_id)
|
||||
|
||||
with _cleanup_lock:
|
||||
# Double-check: another thread may have created a session while we
|
||||
# were doing the network call. Use the existing one to avoid leaking
|
||||
# orphan cloud sessions.
|
||||
if task_id in _active_sessions:
|
||||
return _active_sessions[task_id]
|
||||
_active_sessions[task_id] = session_info
|
||||
|
||||
return session_info
|
||||
|
||||
@@ -82,6 +82,9 @@ def _build_provider_env_blocklist() -> frozenset:
|
||||
"FIREWORKS_API_KEY", # Fireworks AI
|
||||
"XAI_API_KEY", # xAI (Grok)
|
||||
"HELICONE_API_KEY", # LLM Observability proxy
|
||||
"PARALLEL_API_KEY",
|
||||
"FIRECRAWL_API_KEY",
|
||||
"FIRECRAWL_API_URL",
|
||||
# Gateway/runtime config not represented in OPTIONAL_ENV_VARS.
|
||||
"TELEGRAM_HOME_CHANNEL",
|
||||
"TELEGRAM_HOME_CHANNEL_NAME",
|
||||
|
||||
@@ -3,16 +3,16 @@
|
||||
Standalone Web Tools Module
|
||||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Currently uses Firecrawl as the backend, and the interface makes it easy to swap
|
||||
providers without changing the function signatures.
|
||||
Backend is selected during ``hermes tools`` setup (web.backend in config.yaml).
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
- web_extract_tool: Extract content from specific web pages
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
- web_crawl_tool: Crawl websites with specific instructions (Firecrawl only)
|
||||
|
||||
Backend compatibility:
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl)
|
||||
- Parallel: https://docs.parallel.ai (search, extract)
|
||||
|
||||
LLM Processing:
|
||||
- Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction
|
||||
@@ -53,6 +53,39 @@ from tools.website_policy import check_website_access
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Backend Selection ────────────────────────────────────────────────────────
|
||||
|
||||
def _load_web_config() -> dict:
|
||||
"""Load the ``web:`` section from ~/.hermes/config.yaml."""
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
return load_config().get("web", {})
|
||||
except (ImportError, Exception):
|
||||
return {}
|
||||
|
||||
|
||||
def _get_backend() -> str:
|
||||
"""Determine which web backend to use.
|
||||
|
||||
Reads ``web.backend`` from config.yaml (set by ``hermes tools``).
|
||||
Falls back to whichever API key is present for users who configured
|
||||
keys manually without running setup.
|
||||
"""
|
||||
configured = _load_web_config().get("backend", "").lower().strip()
|
||||
if configured in ("parallel", "firecrawl"):
|
||||
return configured
|
||||
# Fallback for manual / legacy config — use whichever key is present.
|
||||
has_firecrawl = bool(os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL"))
|
||||
has_parallel = bool(os.getenv("PARALLEL_API_KEY"))
|
||||
if has_parallel and not has_firecrawl:
|
||||
return "parallel"
|
||||
# Default to firecrawl (backward compat, or when both are set)
|
||||
return "firecrawl"
|
||||
|
||||
|
||||
# ─── Firecrawl Client ────────────────────────────────────────────────────────
|
||||
|
||||
_firecrawl_client = None
|
||||
|
||||
def _get_firecrawl_client():
|
||||
@@ -81,6 +114,47 @@ def _get_firecrawl_client():
|
||||
_firecrawl_client = Firecrawl(**kwargs)
|
||||
return _firecrawl_client
|
||||
|
||||
|
||||
# ─── Parallel Client ─────────────────────────────────────────────────────────
|
||||
|
||||
_parallel_client = None
|
||||
_async_parallel_client = None
|
||||
|
||||
def _get_parallel_client():
|
||||
"""Get or create the Parallel sync client (lazy initialization).
|
||||
|
||||
Requires PARALLEL_API_KEY environment variable.
|
||||
"""
|
||||
from parallel import Parallel
|
||||
global _parallel_client
|
||||
if _parallel_client is None:
|
||||
api_key = os.getenv("PARALLEL_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
"PARALLEL_API_KEY environment variable not set. "
|
||||
"Get your API key at https://parallel.ai"
|
||||
)
|
||||
_parallel_client = Parallel(api_key=api_key)
|
||||
return _parallel_client
|
||||
|
||||
|
||||
def _get_async_parallel_client():
|
||||
"""Get or create the Parallel async client (lazy initialization).
|
||||
|
||||
Requires PARALLEL_API_KEY environment variable.
|
||||
"""
|
||||
from parallel import AsyncParallel
|
||||
global _async_parallel_client
|
||||
if _async_parallel_client is None:
|
||||
api_key = os.getenv("PARALLEL_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
"PARALLEL_API_KEY environment variable not set. "
|
||||
"Get your API key at https://parallel.ai"
|
||||
)
|
||||
_async_parallel_client = AsyncParallel(api_key=api_key)
|
||||
return _async_parallel_client
|
||||
|
||||
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
|
||||
|
||||
# Allow per-task override via env var
|
||||
@@ -428,13 +502,89 @@ def clean_base64_images(text: str) -> str:
|
||||
return cleaned_text
|
||||
|
||||
|
||||
# ─── Parallel Search & Extract Helpers ────────────────────────────────────────
|
||||
|
||||
def _parallel_search(query: str, limit: int = 5) -> dict:
|
||||
"""Search using the Parallel SDK and return results as a dict."""
|
||||
from tools.interrupt import is_interrupted
|
||||
if is_interrupted():
|
||||
return {"error": "Interrupted", "success": False}
|
||||
|
||||
mode = os.getenv("PARALLEL_SEARCH_MODE", "agentic").lower().strip()
|
||||
if mode not in ("fast", "one-shot", "agentic"):
|
||||
mode = "agentic"
|
||||
|
||||
logger.info("Parallel search: '%s' (mode=%s, limit=%d)", query, mode, limit)
|
||||
response = _get_parallel_client().beta.search(
|
||||
search_queries=[query],
|
||||
objective=query,
|
||||
mode=mode,
|
||||
max_results=min(limit, 20),
|
||||
)
|
||||
|
||||
web_results = []
|
||||
for i, result in enumerate(response.results or []):
|
||||
excerpts = result.excerpts or []
|
||||
web_results.append({
|
||||
"url": result.url or "",
|
||||
"title": result.title or "",
|
||||
"description": " ".join(excerpts) if excerpts else "",
|
||||
"position": i + 1,
|
||||
})
|
||||
|
||||
return {"success": True, "data": {"web": web_results}}
|
||||
|
||||
|
||||
async def _parallel_extract(urls: List[str]) -> List[Dict[str, Any]]:
|
||||
"""Extract content from URLs using the Parallel async SDK.
|
||||
|
||||
Returns a list of result dicts matching the structure expected by the
|
||||
LLM post-processing pipeline (url, title, content, metadata).
|
||||
"""
|
||||
from tools.interrupt import is_interrupted
|
||||
if is_interrupted():
|
||||
return [{"url": u, "error": "Interrupted", "title": ""} for u in urls]
|
||||
|
||||
logger.info("Parallel extract: %d URL(s)", len(urls))
|
||||
response = await _get_async_parallel_client().beta.extract(
|
||||
urls=urls,
|
||||
full_content=True,
|
||||
)
|
||||
|
||||
results = []
|
||||
for result in response.results or []:
|
||||
content = result.full_content or ""
|
||||
if not content:
|
||||
content = "\n\n".join(result.excerpts or [])
|
||||
url = result.url or ""
|
||||
title = result.title or ""
|
||||
results.append({
|
||||
"url": url,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"raw_content": content,
|
||||
"metadata": {"sourceURL": url, "title": title},
|
||||
})
|
||||
|
||||
for error in response.errors or []:
|
||||
results.append({
|
||||
"url": error.url or "",
|
||||
"title": "",
|
||||
"content": "",
|
||||
"error": error.content or error.error_type or "extraction failed",
|
||||
"metadata": {"sourceURL": error.url or ""},
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
"""
|
||||
Search the web for information using available search API backend.
|
||||
|
||||
|
||||
This function provides a generic interface for web search that can work
|
||||
with multiple backends. Currently uses Firecrawl.
|
||||
|
||||
with multiple backends (Parallel or Firecrawl).
|
||||
|
||||
Note: This function returns search result metadata only (URLs, titles, descriptions).
|
||||
Use web_extract_tool to get full content from specific URLs.
|
||||
|
||||
@@ -478,17 +628,28 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
if is_interrupted():
|
||||
return json.dumps({"error": "Interrupted", "success": False})
|
||||
|
||||
# Dispatch to the configured backend
|
||||
backend = _get_backend()
|
||||
if backend == "parallel":
|
||||
response_data = _parallel_search(query, limit)
|
||||
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
||||
debug_call_data["final_response_size"] = len(result_json)
|
||||
_debug.log_call("web_search_tool", debug_call_data)
|
||||
_debug.save()
|
||||
return result_json
|
||||
|
||||
logger.info("Searching the web for: '%s' (limit: %d)", query, limit)
|
||||
|
||||
|
||||
response = _get_firecrawl_client().search(
|
||||
query=query,
|
||||
limit=limit
|
||||
)
|
||||
|
||||
|
||||
# The response is a SearchData object with web, news, and images attributes
|
||||
# When not scraping, the results are directly in these attributes
|
||||
web_results = []
|
||||
|
||||
|
||||
# Check if response has web attribute (SearchData object)
|
||||
if hasattr(response, 'web'):
|
||||
# Response is a SearchData object with web attribute
|
||||
@@ -596,123 +757,130 @@ async def web_extract_tool(
|
||||
|
||||
try:
|
||||
logger.info("Extracting content from %d URL(s)", len(urls))
|
||||
|
||||
# Determine requested formats for Firecrawl v2
|
||||
formats: List[str] = []
|
||||
if format == "markdown":
|
||||
formats = ["markdown"]
|
||||
elif format == "html":
|
||||
formats = ["html"]
|
||||
|
||||
# Dispatch to the configured backend
|
||||
backend = _get_backend()
|
||||
|
||||
if backend == "parallel":
|
||||
results = await _parallel_extract(urls)
|
||||
else:
|
||||
# Default: request markdown for LLM-readiness and include html as backup
|
||||
formats = ["markdown", "html"]
|
||||
|
||||
# Always use individual scraping for simplicity and reliability
|
||||
# Batch scraping adds complexity without much benefit for small numbers of URLs
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
from tools.interrupt import is_interrupted as _is_interrupted
|
||||
for url in urls:
|
||||
if _is_interrupted():
|
||||
results.append({"url": url, "error": "Interrupted", "title": ""})
|
||||
continue
|
||||
# ── Firecrawl extraction ──
|
||||
# Determine requested formats for Firecrawl v2
|
||||
formats: List[str] = []
|
||||
if format == "markdown":
|
||||
formats = ["markdown"]
|
||||
elif format == "html":
|
||||
formats = ["html"]
|
||||
else:
|
||||
# Default: request markdown for LLM-readiness and include html as backup
|
||||
formats = ["markdown", "html"]
|
||||
|
||||
# Website policy check — block before fetching
|
||||
blocked = check_website_access(url)
|
||||
if blocked:
|
||||
logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
|
||||
results.append({
|
||||
"url": url, "title": "", "content": "",
|
||||
"error": blocked["message"],
|
||||
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
|
||||
})
|
||||
continue
|
||||
# Always use individual scraping for simplicity and reliability
|
||||
# Batch scraping adds complexity without much benefit for small numbers of URLs
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
try:
|
||||
logger.info("Scraping: %s", url)
|
||||
scrape_result = _get_firecrawl_client().scrape(
|
||||
url=url,
|
||||
formats=formats
|
||||
)
|
||||
|
||||
# Process the result - properly handle object serialization
|
||||
metadata = {}
|
||||
title = ""
|
||||
content_markdown = None
|
||||
content_html = None
|
||||
|
||||
# Extract data from the scrape result
|
||||
if hasattr(scrape_result, 'model_dump'):
|
||||
# Pydantic model - use model_dump to get dict
|
||||
result_dict = scrape_result.model_dump()
|
||||
content_markdown = result_dict.get('markdown')
|
||||
content_html = result_dict.get('html')
|
||||
metadata = result_dict.get('metadata', {})
|
||||
elif hasattr(scrape_result, '__dict__'):
|
||||
# Regular object with attributes
|
||||
content_markdown = getattr(scrape_result, 'markdown', None)
|
||||
content_html = getattr(scrape_result, 'html', None)
|
||||
|
||||
# Handle metadata - convert to dict if it's an object
|
||||
metadata_obj = getattr(scrape_result, 'metadata', {})
|
||||
if hasattr(metadata_obj, 'model_dump'):
|
||||
metadata = metadata_obj.model_dump()
|
||||
elif hasattr(metadata_obj, '__dict__'):
|
||||
metadata = metadata_obj.__dict__
|
||||
elif isinstance(metadata_obj, dict):
|
||||
metadata = metadata_obj
|
||||
else:
|
||||
metadata = {}
|
||||
elif isinstance(scrape_result, dict):
|
||||
# Already a dictionary
|
||||
content_markdown = scrape_result.get('markdown')
|
||||
content_html = scrape_result.get('html')
|
||||
metadata = scrape_result.get('metadata', {})
|
||||
|
||||
# Ensure metadata is a dict (not an object)
|
||||
if not isinstance(metadata, dict):
|
||||
if hasattr(metadata, 'model_dump'):
|
||||
metadata = metadata.model_dump()
|
||||
elif hasattr(metadata, '__dict__'):
|
||||
metadata = metadata.__dict__
|
||||
else:
|
||||
metadata = {}
|
||||
|
||||
# Get title from metadata
|
||||
title = metadata.get("title", "")
|
||||
|
||||
# Re-check final URL after redirect
|
||||
final_url = metadata.get("sourceURL", url)
|
||||
final_blocked = check_website_access(final_url)
|
||||
if final_blocked:
|
||||
logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
|
||||
from tools.interrupt import is_interrupted as _is_interrupted
|
||||
for url in urls:
|
||||
if _is_interrupted():
|
||||
results.append({"url": url, "error": "Interrupted", "title": ""})
|
||||
continue
|
||||
|
||||
# Website policy check — block before fetching
|
||||
blocked = check_website_access(url)
|
||||
if blocked:
|
||||
logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
|
||||
results.append({
|
||||
"url": final_url, "title": title, "content": "", "raw_content": "",
|
||||
"error": final_blocked["message"],
|
||||
"blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]},
|
||||
"url": url, "title": "", "content": "",
|
||||
"error": blocked["message"],
|
||||
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
|
||||
})
|
||||
continue
|
||||
|
||||
# Choose content based on requested format
|
||||
chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
|
||||
|
||||
results.append({
|
||||
"url": final_url,
|
||||
"title": title,
|
||||
"content": chosen_content,
|
||||
"raw_content": chosen_content,
|
||||
"metadata": metadata # Now guaranteed to be a dict
|
||||
})
|
||||
|
||||
except Exception as scrape_err:
|
||||
logger.debug("Scrape failed for %s: %s", url, scrape_err)
|
||||
results.append({
|
||||
"url": url,
|
||||
"title": "",
|
||||
"content": "",
|
||||
"raw_content": "",
|
||||
"error": str(scrape_err)
|
||||
})
|
||||
try:
|
||||
logger.info("Scraping: %s", url)
|
||||
scrape_result = _get_firecrawl_client().scrape(
|
||||
url=url,
|
||||
formats=formats
|
||||
)
|
||||
|
||||
# Process the result - properly handle object serialization
|
||||
metadata = {}
|
||||
title = ""
|
||||
content_markdown = None
|
||||
content_html = None
|
||||
|
||||
# Extract data from the scrape result
|
||||
if hasattr(scrape_result, 'model_dump'):
|
||||
# Pydantic model - use model_dump to get dict
|
||||
result_dict = scrape_result.model_dump()
|
||||
content_markdown = result_dict.get('markdown')
|
||||
content_html = result_dict.get('html')
|
||||
metadata = result_dict.get('metadata', {})
|
||||
elif hasattr(scrape_result, '__dict__'):
|
||||
# Regular object with attributes
|
||||
content_markdown = getattr(scrape_result, 'markdown', None)
|
||||
content_html = getattr(scrape_result, 'html', None)
|
||||
|
||||
# Handle metadata - convert to dict if it's an object
|
||||
metadata_obj = getattr(scrape_result, 'metadata', {})
|
||||
if hasattr(metadata_obj, 'model_dump'):
|
||||
metadata = metadata_obj.model_dump()
|
||||
elif hasattr(metadata_obj, '__dict__'):
|
||||
metadata = metadata_obj.__dict__
|
||||
elif isinstance(metadata_obj, dict):
|
||||
metadata = metadata_obj
|
||||
else:
|
||||
metadata = {}
|
||||
elif isinstance(scrape_result, dict):
|
||||
# Already a dictionary
|
||||
content_markdown = scrape_result.get('markdown')
|
||||
content_html = scrape_result.get('html')
|
||||
metadata = scrape_result.get('metadata', {})
|
||||
|
||||
# Ensure metadata is a dict (not an object)
|
||||
if not isinstance(metadata, dict):
|
||||
if hasattr(metadata, 'model_dump'):
|
||||
metadata = metadata.model_dump()
|
||||
elif hasattr(metadata, '__dict__'):
|
||||
metadata = metadata.__dict__
|
||||
else:
|
||||
metadata = {}
|
||||
|
||||
# Get title from metadata
|
||||
title = metadata.get("title", "")
|
||||
|
||||
# Re-check final URL after redirect
|
||||
final_url = metadata.get("sourceURL", url)
|
||||
final_blocked = check_website_access(final_url)
|
||||
if final_blocked:
|
||||
logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
|
||||
results.append({
|
||||
"url": final_url, "title": title, "content": "", "raw_content": "",
|
||||
"error": final_blocked["message"],
|
||||
"blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]},
|
||||
})
|
||||
continue
|
||||
|
||||
# Choose content based on requested format
|
||||
chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
|
||||
|
||||
results.append({
|
||||
"url": final_url,
|
||||
"title": title,
|
||||
"content": chosen_content,
|
||||
"raw_content": chosen_content,
|
||||
"metadata": metadata # Now guaranteed to be a dict
|
||||
})
|
||||
|
||||
except Exception as scrape_err:
|
||||
logger.debug("Scrape failed for %s: %s", url, scrape_err)
|
||||
results.append({
|
||||
"url": url,
|
||||
"title": "",
|
||||
"content": "",
|
||||
"raw_content": "",
|
||||
"error": str(scrape_err)
|
||||
})
|
||||
|
||||
response = {"results": results}
|
||||
|
||||
@@ -887,6 +1055,14 @@ async def web_crawl_tool(
|
||||
}
|
||||
|
||||
try:
|
||||
# web_crawl requires Firecrawl — Parallel has no crawl API
|
||||
if not (os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")):
|
||||
return json.dumps({
|
||||
"error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, "
|
||||
"or use web_search + web_extract instead.",
|
||||
"success": False,
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Ensure URL has protocol
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = f'https://{url}'
|
||||
@@ -1151,13 +1327,22 @@ async def web_crawl_tool(
|
||||
def check_firecrawl_api_key() -> bool:
|
||||
"""
|
||||
Check if the Firecrawl API key is available in environment variables.
|
||||
|
||||
|
||||
Returns:
|
||||
bool: True if API key is set, False otherwise
|
||||
"""
|
||||
return bool(os.getenv("FIRECRAWL_API_KEY"))
|
||||
|
||||
|
||||
def check_web_api_key() -> bool:
|
||||
"""Check if any web backend API key is available (Parallel or Firecrawl)."""
|
||||
return bool(
|
||||
os.getenv("PARALLEL_API_KEY")
|
||||
or os.getenv("FIRECRAWL_API_KEY")
|
||||
or os.getenv("FIRECRAWL_API_URL")
|
||||
)
|
||||
|
||||
|
||||
def check_auxiliary_model() -> bool:
|
||||
"""Check if an auxiliary text model is available for LLM content processing."""
|
||||
try:
|
||||
@@ -1184,26 +1369,30 @@ if __name__ == "__main__":
|
||||
print("=" * 40)
|
||||
|
||||
# Check if API keys are available
|
||||
firecrawl_available = check_firecrawl_api_key()
|
||||
web_available = check_web_api_key()
|
||||
nous_available = check_auxiliary_model()
|
||||
|
||||
if not firecrawl_available:
|
||||
print("❌ FIRECRAWL_API_KEY environment variable not set")
|
||||
print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://firecrawl.dev/")
|
||||
|
||||
if web_available:
|
||||
backend = _get_backend()
|
||||
print(f"✅ Web backend: {backend}")
|
||||
if backend == "parallel":
|
||||
print(" Using Parallel API (https://parallel.ai)")
|
||||
else:
|
||||
print(" Using Firecrawl API (https://firecrawl.dev)")
|
||||
else:
|
||||
print("✅ Firecrawl API key found")
|
||||
|
||||
print("❌ No web search backend configured")
|
||||
print("Set PARALLEL_API_KEY (https://parallel.ai) or FIRECRAWL_API_KEY (https://firecrawl.dev)")
|
||||
|
||||
if not nous_available:
|
||||
print("❌ No auxiliary model available for LLM content processing")
|
||||
print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
|
||||
print("⚠️ Without an auxiliary model, LLM content processing will be disabled")
|
||||
else:
|
||||
print(f"✅ Auxiliary model available: {DEFAULT_SUMMARIZER_MODEL}")
|
||||
|
||||
if not firecrawl_available:
|
||||
|
||||
if not web_available:
|
||||
exit(1)
|
||||
|
||||
|
||||
print("🛠️ Web tools ready for use!")
|
||||
|
||||
if nous_available:
|
||||
@@ -1301,8 +1490,8 @@ registry.register(
|
||||
toolset="web",
|
||||
schema=WEB_SEARCH_SCHEMA,
|
||||
handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=5),
|
||||
check_fn=check_firecrawl_api_key,
|
||||
requires_env=["FIRECRAWL_API_KEY"],
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"],
|
||||
emoji="🔍",
|
||||
)
|
||||
registry.register(
|
||||
@@ -1311,8 +1500,8 @@ registry.register(
|
||||
schema=WEB_EXTRACT_SCHEMA,
|
||||
handler=lambda args, **kw: web_extract_tool(
|
||||
args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"),
|
||||
check_fn=check_firecrawl_api_key,
|
||||
requires_env=["FIRECRAWL_API_KEY"],
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"],
|
||||
is_async=True,
|
||||
emoji="📄",
|
||||
)
|
||||
|
||||
@@ -61,6 +61,7 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `PARALLEL_API_KEY` | AI-native web search ([parallel.ai](https://parallel.ai/)) |
|
||||
| `FIRECRAWL_API_KEY` | Web scraping ([firecrawl.dev](https://firecrawl.dev/)) |
|
||||
| `FIRECRAWL_API_URL` | Custom Firecrawl API endpoint for self-hosted instances (optional) |
|
||||
| `BROWSERBASE_API_KEY` | Browser automation ([browserbase.com](https://browserbase.com/)) |
|
||||
|
||||
Reference in New Issue
Block a user