Mirror of https://github.com/NousResearch/hermes-agent.git (synced 2026-05-10 04:37:27 +08:00)
Compare commits: fix/multim... → fix/smart-... (33 commits)
| SHA1 |
|---|
| 1e373745a9 |
| efb780c754 |
| c64efa9260 |
| 43cb35cb21 |
| db496180db |
| c69adfbb17 |
| 683c8b24d4 |
| d2dee43825 |
| 59b53f0a23 |
| d198a647e2 |
| 0f53275169 |
| 366de72a38 |
| 13f5459670 |
| 93333387d6 |
| 1f9e7cd659 |
| 09fc64c6b6 |
| 84147f4d81 |
| ee4b20b55b |
| ed27b826c5 |
| b03aefaf20 |
| d7f4db53f5 |
| 2c97bf3936 |
| 1dfa544250 |
| eac5f8f40f |
| 184aa5b2b3 |
| bdcf247efe |
| b16d7f2da6 |
| 9423fda5cb |
| 4d873f77c1 |
| 09336a6710 |
| 1db8609ac9 |
| 0d96f1991c |
| 4e3a8a0637 |
12  .env.example
@@ -201,6 +201,18 @@ VOICE_TOOLS_OPENAI_KEY=
# WHATSAPP_ENABLED=false
# WHATSAPP_ALLOWED_USERS=15551234567

# Email (IMAP/SMTP — send and receive emails as Hermes)
# For Gmail: enable 2FA → create App Password at https://myaccount.google.com/apppasswords
# EMAIL_ADDRESS=hermes@gmail.com
# EMAIL_PASSWORD=xxxx xxxx xxxx xxxx
# EMAIL_IMAP_HOST=imap.gmail.com
# EMAIL_IMAP_PORT=993
# EMAIL_SMTP_HOST=smtp.gmail.com
# EMAIL_SMTP_PORT=587
# EMAIL_POLL_INTERVAL=15
# EMAIL_ALLOWED_USERS=your@email.com
# EMAIL_HOME_ADDRESS=your@email.com

# Gateway-wide: allow ALL users without an allowlist (default: false = deny)
# Only set to true if you intentionally want open access.
# GATEWAY_ALLOW_ALL_USERS=false
2  .github/workflows/tests.yml (vendored)
@@ -34,7 +34,7 @@ jobs:
      - name: Run tests
        run: |
          source .venv/bin/activate
          python -m pytest tests/ -q --ignore=tests/integration --tb=short
          python -m pytest tests/ -q --ignore=tests/integration --tb=short -n auto
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
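The new `-n auto` flag parallelizes the test run across all available CPU cores via the pytest-xdist plugin. The install step is not shown in this hunk, so the sketch below simply assumes pytest-xdist is added wherever the project's test dependencies are declared.

```bash
# Assumption: pytest-xdist must be present before -n auto is usable
pip install pytest-xdist
python -m pytest tests/ -q --ignore=tests/integration --tb=short -n auto
```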
1  .gitignore (vendored)
@@ -49,3 +49,4 @@ cli-config.yaml
skills/.hub/
ignored/
.worktrees/
environments/benchmarks/evals/
@@ -41,7 +41,6 @@ After installation:

```bash
source ~/.bashrc   # reload shell (or: source ~/.zshrc)
hermes setup       # configure your LLM provider
hermes             # start chatting!
```

@@ -51,9 +50,11 @@ hermes # start chatting!

```bash
hermes             # Interactive CLI — start a conversation
hermes model       # Switch provider or model
hermes setup       # Re-run the setup wizard
hermes model       # Choose your LLM provider and model
hermes tools       # Configure which tools are enabled
hermes config set  # Set individual config values
hermes gateway     # Start the messaging gateway (Telegram, Discord, etc.)
hermes setup       # Run the full setup wizard (configures everything at once)
hermes update      # Update to the latest version
hermes doctor      # Diagnose any issues
```
@@ -5,7 +5,6 @@ Uses Gemini Flash (cheap/fast) to summarize middle turns while
protecting head and tail context.
"""

import json
import logging
import os
from typing import Any, Dict, List, Optional

@@ -83,41 +82,6 @@ class ContextCompressor:
            "compression_count": self.compression_count,
        }

    @staticmethod
    def _content_to_text(content: Any) -> str:
        """Convert message content to plain text for summarization.

        Handles:
        - str → returned as-is
        - None → empty string
        - list (multimodal) → text parts joined, images replaced with [image]
        - other → JSON serialization or str() fallback
        """
        if isinstance(content, str):
            return content
        if content is None:
            return ""
        if isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, dict):
                    item_type = item.get("type")
                    if item_type == "text":
                        parts.append(item.get("text", ""))
                    elif item_type == "image_url":
                        parts.append("[image]")
                    elif item_type:
                        parts.append(f"[{item_type}]")
                    else:
                        parts.append(str(item))
                else:
                    parts.append(str(item))
            return "\n".join(part for part in parts if part)
        try:
            return json.dumps(content, ensure_ascii=False, sort_keys=True)
        except TypeError:
            return str(content)

    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
        """Generate a concise summary of conversation turns.

@@ -129,7 +93,7 @@ class ContextCompressor:
        parts = []
        for msg in turns_to_summarize:
            role = msg.get("role", "unknown")
            content = self._content_to_text(msg.get("content"))
            content = msg.get("content") or ""
            if len(content) > 2000:
                content = content[:1000] + "\n...[truncated]...\n" + content[-500:]
            tool_calls = msg.get("tool_calls", [])
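For reference, a minimal sketch of how the `_content_to_text` helper shown in this hunk behaves on a multimodal message; the example message and URL are invented for illustration.

```python
# Hypothetical multimodal content: list parts are flattened to text,
# image parts are replaced with the "[image]" placeholder.
content = [
    {"type": "text", "text": "What is in this photo?"},
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
]
print(ContextCompressor._content_to_text(content))
# -> "What is in this photo?\n[image]"
```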
@@ -131,6 +131,14 @@ PLATFORM_HINTS = {
        "files arrive as downloadable documents. You can also include image "
        "URLs in markdown format and they will be sent as photos."
    ),
    "email": (
        "You are communicating via email. Write clear, well-structured responses "
        "suitable for email. Use plain text formatting (no markdown). "
        "Keep responses concise but complete. You can send file attachments — "
        "include MEDIA:/absolute/path/to/file in your response. The subject line "
        "is preserved for threading. Do not include greetings or sign-offs unless "
        "contextually appropriate."
    ),
    "cli": (
        "You are a CLI AI Agent. Try not to use markdown but simple text "
        "renderable inside a terminal."
@@ -626,6 +626,10 @@ code_execution:
delegation:
  max_iterations: 50  # Max tool-calling turns per child (default: 50)
  default_toolsets: ["terminal", "file", "web"]  # Default toolsets for subagents
  # model: "google/gemini-3-flash-preview"  # Override model for subagents (empty = inherit parent)
  # provider: "openrouter"  # Override provider for subagents (empty = inherit parent)
  # #   Resolves full credentials (base_url, api_key) automatically.
  # #   Supported: openrouter, nous, zai, kimi-coding, minimax

# =============================================================================
# Honcho Integration (Cross-Session User Modeling)

@@ -670,6 +674,11 @@ display:
  # Works over SSH. Most terminals can be configured to flash the taskbar or play a sound.
  bell_on_complete: false

  # Show model reasoning/thinking before each response.
  # When enabled, a dim box shows the model's thought process above the response.
  # Toggle at runtime with /reasoning show or /reasoning hide.
  show_reasoning: false

  # ───────────────────────────────────────────────────────────────────────────
  # Skin / Theme
  # ───────────────────────────────────────────────────────────────────────────
95  cli.py
@@ -205,6 +205,7 @@ def load_cli_config() -> Dict[str, Any]:
        "display": {
            "compact": False,
            "resume_display": "full",
            "show_reasoning": False,
            "skin": "default",
        },
        "clarify": {

@@ -217,6 +218,8 @@ def load_cli_config() -> Dict[str, Any]:
        "delegation": {
            "max_iterations": 45,  # Max tool-calling turns per child agent
            "default_toolsets": ["terminal", "file", "web"],  # Default toolsets for subagents
            "model": "",  # Subagent model override (empty = inherit parent model)
            "provider": "",  # Subagent provider override (empty = inherit parent provider)
        },
    }

@@ -1121,6 +1124,8 @@ class HermesCLI:
        self.resume_display = CLI_CONFIG["display"].get("resume_display", "full")
        # bell_on_complete: play terminal bell (\a) when agent finishes a response
        self.bell_on_complete = CLI_CONFIG["display"].get("bell_on_complete", False)
        # show_reasoning: display model thinking/reasoning before the response
        self.show_reasoning = CLI_CONFIG["display"].get("show_reasoning", False)
        self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose")

        # Configuration - priority: CLI args > env vars > config file

@@ -1495,6 +1500,7 @@ class HermesCLI:
            platform="cli",
            session_db=self._session_db,
            clarify_callback=self._clarify_callback,
            reasoning_callback=self._on_reasoning if self.show_reasoning else None,
            honcho_session_key=self.session_id,
            fallback_model=self._fallback_model,
            thinking_callback=self._on_thinking,

@@ -2848,6 +2854,8 @@ class HermesCLI:
            self._show_gateway_status()
        elif cmd_lower == "/verbose":
            self._toggle_verbose()
        elif cmd_lower.startswith("/reasoning"):
            self._handle_reasoning_command(cmd_original)
        elif cmd_lower == "/compress":
            self._manual_compress()
        elif cmd_lower == "/usage":

@@ -3073,6 +3081,75 @@ class HermesCLI:
        }
        self.console.print(labels.get(self.tool_progress_mode, ""))

    def _handle_reasoning_command(self, cmd: str):
        """Handle /reasoning — manage effort level and display toggle.

        Usage:
            /reasoning            Show current effort level and display state
            /reasoning <level>    Set reasoning effort (none, low, medium, high, xhigh)
            /reasoning show|on    Show model thinking/reasoning in output
            /reasoning hide|off   Hide model thinking/reasoning from output
        """
        parts = cmd.strip().split(maxsplit=1)

        if len(parts) < 2:
            # Show current state
            rc = self.reasoning_config
            if rc is None:
                level = "medium (default)"
            elif rc.get("enabled") is False:
                level = "none (disabled)"
            else:
                level = rc.get("effort", "medium")
            display_state = "on" if self.show_reasoning else "off"
            _cprint(f" {_GOLD}Reasoning effort: {level}{_RST}")
            _cprint(f" {_GOLD}Reasoning display: {display_state}{_RST}")
            _cprint(f" {_DIM}Usage: /reasoning <none|low|medium|high|xhigh|show|hide>{_RST}")
            return

        arg = parts[1].strip().lower()

        # Display toggle
        if arg in ("show", "on"):
            self.show_reasoning = True
            if self.agent:
                self.agent.reasoning_callback = self._on_reasoning
            _cprint(f" {_GOLD}Reasoning display: ON{_RST}")
            _cprint(f" {_DIM}Model thinking will be shown during and after each response.{_RST}")
            return
        if arg in ("hide", "off"):
            self.show_reasoning = False
            if self.agent:
                self.agent.reasoning_callback = None
            _cprint(f" {_GOLD}Reasoning display: OFF{_RST}")
            return

        # Effort level change
        parsed = _parse_reasoning_config(arg)
        if parsed is None:
            _cprint(f" {_DIM}(._.) Unknown argument: {arg}{_RST}")
            _cprint(f" {_DIM}Valid levels: none, low, minimal, medium, high, xhigh{_RST}")
            _cprint(f" {_DIM}Display: show, hide{_RST}")
            return

        self.reasoning_config = parsed
        self.agent = None  # Force agent re-init with new reasoning config

        if save_config_value("agent.reasoning_effort", arg):
            _cprint(f" {_GOLD}Reasoning effort set to '{arg}' (saved to config){_RST}")
        else:
            _cprint(f" {_GOLD}Reasoning effort set to '{arg}' (session only){_RST}")

    def _on_reasoning(self, reasoning_text: str):
        """Callback for intermediate reasoning display during tool-call loops."""
        lines = reasoning_text.strip().splitlines()
        if len(lines) > 5:
            preview = "\n".join(lines[:5])
            preview += f"\n ... ({len(lines) - 5} more lines)"
        else:
            preview = reasoning_text.strip()
        _cprint(f" {_DIM}[thinking] {preview}{_RST}")

    def _manual_compress(self):
        """Manually trigger context compression on the current conversation."""
        if not self.conversation_history or len(self.conversation_history) < 4:

@@ -3542,6 +3619,24 @@ class HermesCLI:
        if response and pending_message:
            response = response + "\n\n---\n_[Interrupted - processing new message]_"

        # Display reasoning (thinking) box if enabled and available
        if self.show_reasoning and result:
            reasoning = result.get("last_reasoning")
            if reasoning:
                w = shutil.get_terminal_size().columns
                r_label = " Reasoning "
                r_fill = w - 2 - len(r_label)
                r_top = f"{_DIM}┌─{r_label}{'─' * max(r_fill - 1, 0)}┐{_RST}"
                r_bot = f"{_DIM}└{'─' * (w - 2)}┘{_RST}"
                # Collapse long reasoning: show first 10 lines
                lines = reasoning.strip().splitlines()
                if len(lines) > 10:
                    display_reasoning = "\n".join(lines[:10])
                    display_reasoning += f"\n{_DIM} ... ({len(lines) - 10} more lines){_RST}"
                else:
                    display_reasoning = reasoning.strip()
                _cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}")

        if response:
            # Use a Rich Panel for the response box — adapts to terminal
            # width at render time instead of hard-coding border length.
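If `save_config_value` persists the key path used above, the stored result presumably looks something like the sketch below; the file name (cli-config.yaml, which .gitignore references) and the nesting are assumptions, not confirmed by this diff.

```yaml
# Hypothetical cli-config.yaml contents after running `/reasoning high`
# (key path "agent.reasoning_effort" taken from the save_config_value call above)
agent:
  reasoning_effort: high
```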
@@ -103,6 +103,7 @@ def _deliver_result(job: dict, content: str) -> None:
        "slack": Platform.SLACK,
        "whatsapp": Platform.WHATSAPP,
        "signal": Platform.SIGNAL,
        "email": Platform.EMAIL,
    }
    platform = platform_map.get(platform_name.lower())
    if not platform:
@@ -18,9 +18,14 @@ Benchmarks (eval-only):
- benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation
"""

from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from environments.tool_context import ToolContext
    from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
except ImportError:
    # atroposlib not installed — environments are unavailable but
    # submodules like tool_call_parsers can still be imported directly.
    pass

__all__ = [
    "AgentResult",
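The try/except above makes the package-level imports optional, so standalone submodules stay usable even when atroposlib is missing. A minimal sketch of the direct-import path that keeps working (both lines appear verbatim elsewhere in this change set):

```python
# Works even when atroposlib (and thus the full environments exports) is not installed:
# import the standalone parser registry directly.
from environments.tool_call_parsers import get_parser

parser = get_parser("hermes")  # same registry used by the fallback path in the agent loop
```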
@@ -249,23 +249,62 @@ class HermesAgentLoop:
            reasoning = _extract_reasoning_from_message(assistant_msg)
            reasoning_per_turn.append(reasoning)

            # Check for tool calls -- standard OpenAI spec
            # Check for tool calls -- standard OpenAI spec.
            # Fallback: if response has no structured tool_calls but content
            # contains raw tool call tags (e.g. <tool_call>), parse them using
            # hermes-agent's standalone parsers. This handles the case where
            # ManagedServer's ToolCallTranslator couldn't parse because vLLM
            # isn't installed.
            if (
                not assistant_msg.tool_calls
                and assistant_msg.content
                and self.tool_schemas
                and "<tool_call>" in (assistant_msg.content or "")
            ):
                try:
                    from environments.tool_call_parsers import get_parser
                    fallback_parser = get_parser("hermes")
                    parsed_content, parsed_calls = fallback_parser.parse(
                        assistant_msg.content
                    )
                    if parsed_calls:
                        assistant_msg.tool_calls = parsed_calls
                        if parsed_content is not None:
                            assistant_msg.content = parsed_content
                        logger.debug(
                            "Fallback parser extracted %d tool calls from raw content",
                            len(parsed_calls),
                        )
                except Exception:
                    pass  # Fall through to no tool calls

            if assistant_msg.tool_calls:
                # Normalize tool calls to dicts — they may come as objects
                # (OpenAI API) or dicts (vLLM ToolCallTranslator).
                def _tc_to_dict(tc):
                    if isinstance(tc, dict):
                        return {
                            "id": tc.get("id", f"call_{uuid.uuid4().hex[:8]}"),
                            "type": "function",
                            "function": {
                                "name": tc.get("function", {}).get("name", tc.get("name", "")),
                                "arguments": tc.get("function", {}).get("arguments", tc.get("arguments", "{}")),
                            },
                        }
                    return {
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments,
                        },
                    }

                # Build the assistant message dict for conversation history
                msg_dict: Dict[str, Any] = {
                    "role": "assistant",
                    "content": assistant_msg.content or "",
                    "tool_calls": [
                        {
                            "id": tc.id,
                            "type": "function",
                            "function": {
                                "name": tc.function.name,
                                "arguments": tc.function.arguments,
                            },
                        }
                        for tc in assistant_msg.tool_calls
                    ],
                    "tool_calls": [_tc_to_dict(tc) for tc in assistant_msg.tool_calls],
                }

                # Preserve reasoning_content for multi-turn chat template handling

@@ -278,8 +317,13 @@ class HermesAgentLoop:

                # Execute each tool call via hermes-agent's dispatch
                for tc in assistant_msg.tool_calls:
                    tool_name = tc.function.name
                    tool_args_raw = tc.function.arguments
                    # Handle both object (OpenAI) and dict (vLLM) formats
                    if isinstance(tc, dict):
                        tool_name = tc.get("function", {}).get("name", tc.get("name", ""))
                        tool_args_raw = tc.get("function", {}).get("arguments", tc.get("arguments", "{}"))
                    else:
                        tool_name = tc.function.name
                        tool_args_raw = tc.function.arguments

                    # Validate tool name
                    if tool_name not in self.valid_tool_names:

@@ -390,10 +434,11 @@ class HermesAgentLoop:
                        pass

                    # Add tool response to conversation
                    tc_id = tc.get("id", "") if isinstance(tc, dict) else tc.id
                    messages.append(
                        {
                            "role": "tool",
                            "tool_call_id": tc.id,
                            "tool_call_id": tc_id,
                            "content": tool_result,
                        }
                    )
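To make the fallback path concrete, here is a hedged sketch of the kind of raw completion it is meant to handle and the shape it is normalized into. The exact `<tool_call>` payload format is an assumption (Hermes-style JSON inside the tags); the normalized dict simply mirrors the `_tc_to_dict` structure shown above, and the id value is hypothetical.

```python
# Hypothetical raw completion with no structured tool_calls, only embedded tags:
raw_content = (
    "Let me check the file.\n"
    '<tool_call>{"name": "read_file", "arguments": {"path": "README.md"}}</tool_call>'
)

# After fallback_parser.parse() plus _tc_to_dict() normalization, each call is a
# plain OpenAI-spec dict regardless of whether it arrived as an object or a dict:
normalized = {
    "id": "call_1a2b3c4d",  # generated when the parser supplies no id
    "type": "function",
    "function": {
        "name": "read_file",
        "arguments": '{"path": "README.md"}',
    },
}
```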
38  environments/benchmarks/tblite/local.yaml (new file)
@@ -0,0 +1,38 @@
# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
#
# Runs tasks in Docker containers on the local machine.
# Sandboxed like Modal but no cloud costs. Good for dev/testing.
#
# Usage:
#   python environments/benchmarks/tblite/tblite_env.py evaluate \
#     --config environments/benchmarks/tblite/local.yaml
#
#   # Override concurrency:
#   python environments/benchmarks/tblite/tblite_env.py evaluate \
#     --config environments/benchmarks/tblite/local.yaml \
#     --env.eval_concurrency 4

env:
  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 60
  max_token_length: 32000
  agent_temperature: 0.8
  terminal_backend: "docker"
  terminal_timeout: 300
  tool_pool_size: 16
  dataset_name: "NousResearch/openthoughts-tblite"
  test_timeout: 600
  task_timeout: 1200
  eval_concurrency: 8  # max 8 tasks at once
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: false
  wandb_name: "openthoughts-tblite-local"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"

openai:
  base_url: "https://openrouter.ai/api/v1"
  model_name: "anthropic/claude-sonnet-4"
  server_type: "openai"
  health_check: false
  # api_key loaded from OPENROUTER_API_KEY in .env
environments/benchmarks/tblite/local_vllm.yaml
Normal file
40
environments/benchmarks/tblite/local_vllm.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Local vLLM Backend
|
||||
#
|
||||
# Runs against a local vLLM server with Docker sandboxes.
|
||||
#
|
||||
# Start the vLLM server from the atropos directory:
|
||||
# python -m example_trainer.vllm_api_server \
|
||||
# --model Qwen/Qwen3-4B-Instruct-2507 \
|
||||
# --port 9001 \
|
||||
# --gpu-memory-utilization 0.8 \
|
||||
# --max-model-len=32000
|
||||
#
|
||||
# Then run:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local_vllm.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 16000
|
||||
agent_temperature: 0.6
|
||||
terminal_backend: "docker"
|
||||
terminal_timeout: 300
|
||||
tool_pool_size: 16
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200
|
||||
eval_concurrency: 8
|
||||
tool_call_parser: "hermes"
|
||||
system_prompt: "You are an expert terminal agent. You MUST use the provided tools to complete tasks. Use the terminal tool to run shell commands, read_file to read files, write_file to write files, search_files to search, and patch to edit files. Do NOT write out solutions as text - execute them using the tools. Always start by exploring the environment with terminal commands."
|
||||
tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
|
||||
use_wandb: false
|
||||
wandb_name: "tblite-qwen3-4b-instruct"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/tblite-qwen3-4b-local"
|
||||
|
||||
openai:
|
||||
base_url: "http://localhost:9001"
|
||||
model_name: "Qwen/Qwen3-4B-Instruct-2507"
|
||||
server_type: "vllm"
|
||||
health_check: false
|
||||
@@ -127,6 +127,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
|
||||
"causes blocking calls to deadlock inside the thread pool.",
|
||||
)
|
||||
|
||||
# --- Eval concurrency ---
|
||||
eval_concurrency: int = Field(
|
||||
default=0,
|
||||
description="Maximum number of tasks to evaluate in parallel. "
|
||||
"0 means unlimited (all tasks run concurrently). "
|
||||
"Set to 8 for local backends to avoid overwhelming the machine.",
|
||||
)
|
||||
|
||||
|
||||
# Tasks that cannot run properly on Modal and are excluded from scoring.
|
||||
MODAL_INCOMPATIBLE_TASKS = {
|
||||
@@ -201,7 +209,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
|
||||
# Agent settings -- TB2 tasks are complex, need many turns
|
||||
max_agent_turns=60,
|
||||
max_token_length=16000,
|
||||
max_token_length=***
|
||||
agent_temperature=0.6,
|
||||
system_prompt=None,
|
||||
|
||||
@@ -225,7 +233,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
steps_per_eval=1,
|
||||
total_steps=1,
|
||||
|
||||
tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
|
||||
tokenizer_name="NousRe...1-8B",
|
||||
use_wandb=True,
|
||||
wandb_name="terminal-bench-2",
|
||||
ensure_scores_are_not_same=False, # Binary rewards may all be 0 or 1
|
||||
@@ -237,7 +245,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name="anthropic/claude-sonnet-4",
|
||||
server_type="openai",
|
||||
api_key=os.getenv("OPENROUTER_API_KEY", ""),
|
||||
api_key=os.get...EY", ""),
|
||||
health_check=False,
|
||||
)
|
||||
]
|
||||
@@ -438,8 +446,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
"error": "no_image",
|
||||
}
|
||||
|
||||
# --- 2. Register per-task Modal image override ---
|
||||
register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
|
||||
# --- 2. Register per-task image override ---
|
||||
# Set both modal_image and docker_image so the task image is used
|
||||
# regardless of which backend is configured.
|
||||
register_task_env_overrides(task_id, {
|
||||
"modal_image": modal_image,
|
||||
"docker_image": modal_image,
|
||||
"cwd": "/app",
|
||||
})
|
||||
logger.info(
|
||||
"Task %s: registered image override for task_id %s",
|
||||
task_name, task_id[:8],
|
||||
@@ -454,17 +468,37 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
messages.append({"role": "user", "content": self.format_prompt(eval_item)})
|
||||
|
||||
# --- 4. Run agent loop ---
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
# Use ManagedServer (Phase 2) for vLLM/SGLang backends to get
|
||||
# token-level tracking via /generate. Falls back to direct
|
||||
# ServerManager (Phase 1) for OpenAI endpoints.
|
||||
if self._use_managed_server():
|
||||
async with self.server.managed_server(
|
||||
tokenizer=self.tokenizer,
|
||||
preserve_think_blocks=bool(self.config.thinking_mode),
|
||||
) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
else:
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
# --- 5. Verify -- run test suite in the agent's sandbox ---
|
||||
# Skip verification if the agent produced no meaningful output
|
||||
@@ -479,446 +513,3 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
reward = 0.0
|
||||
else:
|
||||
# Run tests in a thread so the blocking ctx.terminal() calls
|
||||
# don't freeze the entire event loop (which would stall all
|
||||
# other tasks, tqdm updates, and timeout timers).
|
||||
ctx = ToolContext(task_id)
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
reward = await loop.run_in_executor(
|
||||
None, # default thread pool
|
||||
self._run_tests, eval_item, ctx, task_name,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Task %s: test verification failed: %s", task_name, e)
|
||||
reward = 0.0
|
||||
finally:
|
||||
ctx.cleanup()
|
||||
|
||||
passed = reward == 1.0
|
||||
status = "PASS" if passed else "FAIL"
|
||||
elapsed = time.time() - task_start
|
||||
tqdm.write(f" [{status}] {task_name} (turns={result.turns_used}, {elapsed:.0f}s)")
|
||||
logger.info(
|
||||
"Task %s: reward=%.1f, turns=%d, finished=%s",
|
||||
task_name, reward, result.turns_used, result.finished_naturally,
|
||||
)
|
||||
|
||||
out = {
|
||||
"passed": passed,
|
||||
"reward": reward,
|
||||
"task_name": task_name,
|
||||
"category": category,
|
||||
"turns_used": result.turns_used,
|
||||
"finished_naturally": result.finished_naturally,
|
||||
"messages": result.messages,
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
except Exception as e:
|
||||
elapsed = time.time() - task_start
|
||||
logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
|
||||
tqdm.write(f" [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
|
||||
out = {
|
||||
"passed": False, "reward": 0.0,
|
||||
"task_name": task_name, "category": category,
|
||||
"error": str(e),
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
finally:
|
||||
# --- Cleanup: clear overrides, sandbox, and temp files ---
|
||||
clear_task_env_overrides(task_id)
|
||||
try:
|
||||
cleanup_vm(task_id)
|
||||
except Exception as e:
|
||||
logger.debug("VM cleanup for %s: %s", task_id[:8], e)
|
||||
if task_dir and task_dir.exists():
|
||||
shutil.rmtree(task_dir, ignore_errors=True)
|
||||
|
||||
def _run_tests(
|
||||
self, item: Dict[str, Any], ctx: ToolContext, task_name: str
|
||||
) -> float:
|
||||
"""
|
||||
Upload and execute the test suite in the agent's sandbox, then
|
||||
download the verifier output locally to read the reward.
|
||||
|
||||
Follows Harbor's verification pattern:
|
||||
1. Upload tests/ directory into the sandbox
|
||||
2. Execute test.sh inside the sandbox
|
||||
3. Download /logs/verifier/ directory to a local temp dir
|
||||
4. Read reward.txt locally with native Python I/O
|
||||
|
||||
Downloading locally avoids issues with the file_read tool on
|
||||
the Modal VM and matches how Harbor handles verification.
|
||||
|
||||
TB2 test scripts (test.sh) typically:
|
||||
1. Install pytest via uv/pip
|
||||
2. Run pytest against the test files in /tests/
|
||||
3. Write results to /logs/verifier/reward.txt
|
||||
|
||||
Args:
|
||||
item: The TB2 task dict (contains tests_tar, test_sh)
|
||||
ctx: ToolContext scoped to this task's sandbox
|
||||
task_name: For logging
|
||||
|
||||
Returns:
|
||||
1.0 if tests pass, 0.0 otherwise
|
||||
"""
|
||||
tests_tar = item.get("tests_tar", "")
|
||||
test_sh = item.get("test_sh", "")
|
||||
|
||||
if not test_sh:
|
||||
logger.warning("Task %s: no test_sh content, reward=0", task_name)
|
||||
return 0.0
|
||||
|
||||
# Create required directories in the sandbox
|
||||
ctx.terminal("mkdir -p /tests /logs/verifier")
|
||||
|
||||
# Upload test files into the sandbox (binary-safe via base64)
|
||||
if tests_tar:
|
||||
tests_temp = Path(tempfile.mkdtemp(prefix=f"tb2-tests-{task_name}-"))
|
||||
try:
|
||||
_extract_base64_tar(tests_tar, tests_temp)
|
||||
ctx.upload_dir(str(tests_temp), "/tests")
|
||||
except Exception as e:
|
||||
logger.warning("Task %s: failed to upload test files: %s", task_name, e)
|
||||
finally:
|
||||
shutil.rmtree(tests_temp, ignore_errors=True)
|
||||
|
||||
# Write the test runner script (test.sh)
|
||||
ctx.write_file("/tests/test.sh", test_sh)
|
||||
ctx.terminal("chmod +x /tests/test.sh")
|
||||
|
||||
# Execute the test suite
|
||||
logger.info(
|
||||
"Task %s: running test suite (timeout=%ds)",
|
||||
task_name, self.config.test_timeout,
|
||||
)
|
||||
test_result = ctx.terminal(
|
||||
"bash /tests/test.sh",
|
||||
timeout=self.config.test_timeout,
|
||||
)
|
||||
|
||||
exit_code = test_result.get("exit_code", -1)
|
||||
output = test_result.get("output", "")
|
||||
|
||||
# Download the verifier output directory locally, then read reward.txt
|
||||
# with native Python I/O. This avoids issues with file_read on the
|
||||
# Modal VM and matches Harbor's verification pattern.
|
||||
reward = 0.0
|
||||
local_verifier_dir = Path(tempfile.mkdtemp(prefix=f"tb2-verifier-{task_name}-"))
|
||||
try:
|
||||
ctx.download_dir("/logs/verifier", str(local_verifier_dir))
|
||||
|
||||
reward_file = local_verifier_dir / "reward.txt"
|
||||
if reward_file.exists() and reward_file.stat().st_size > 0:
|
||||
content = reward_file.read_text().strip()
|
||||
if content == "1":
|
||||
reward = 1.0
|
||||
elif content == "0":
|
||||
reward = 0.0
|
||||
else:
|
||||
# Unexpected content -- try parsing as float
|
||||
try:
|
||||
reward = float(content)
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
"Task %s: reward.txt content unexpected (%r), "
|
||||
"falling back to exit_code=%d",
|
||||
task_name, content, exit_code,
|
||||
)
|
||||
reward = 1.0 if exit_code == 0 else 0.0
|
||||
else:
|
||||
# reward.txt not written -- fall back to exit code
|
||||
logger.warning(
|
||||
"Task %s: reward.txt not found after download, "
|
||||
"falling back to exit_code=%d",
|
||||
task_name, exit_code,
|
||||
)
|
||||
reward = 1.0 if exit_code == 0 else 0.0
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Task %s: failed to download verifier dir: %s, "
|
||||
"falling back to exit_code=%d",
|
||||
task_name, e, exit_code,
|
||||
)
|
||||
reward = 1.0 if exit_code == 0 else 0.0
|
||||
finally:
|
||||
shutil.rmtree(local_verifier_dir, ignore_errors=True)
|
||||
|
||||
# Log test output for debugging failures
|
||||
if reward == 0.0:
|
||||
output_preview = output[-500:] if output else "(no output)"
|
||||
logger.info(
|
||||
"Task %s: FAIL (exit_code=%d)\n%s",
|
||||
task_name, exit_code, output_preview,
|
||||
)
|
||||
|
||||
return reward
|
||||
|
||||
# =========================================================================
|
||||
# Evaluate -- main entry point for the eval subcommand
|
||||
# =========================================================================
|
||||
|
||||
async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
||||
"""
|
||||
Wrap rollout_and_score_eval with a per-task wall-clock timeout.
|
||||
|
||||
If the task exceeds task_timeout seconds, it's automatically scored
|
||||
as FAIL. This prevents any single task from hanging indefinitely.
|
||||
"""
|
||||
task_name = item.get("task_name", "unknown")
|
||||
category = item.get("category", "unknown")
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
self.rollout_and_score_eval(item),
|
||||
timeout=self.config.task_timeout,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
from tqdm import tqdm
|
||||
elapsed = self.config.task_timeout
|
||||
tqdm.write(f" [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
|
||||
logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
|
||||
out = {
|
||||
"passed": False, "reward": 0.0,
|
||||
"task_name": task_name, "category": category,
|
||||
"error": f"timeout ({elapsed}s)",
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
async def evaluate(self, *args, **kwargs) -> None:
|
||||
"""
|
||||
Run Terminal-Bench 2.0 evaluation over all tasks.
|
||||
|
||||
This is the main entry point when invoked via:
|
||||
python environments/terminalbench2_env.py evaluate
|
||||
|
||||
Runs all tasks through rollout_and_score_eval() via asyncio.gather()
|
||||
(same pattern as GPQA and other Atropos eval envs). Each task is
|
||||
wrapped with a wall-clock timeout so hung tasks auto-fail.
|
||||
|
||||
Suppresses noisy Modal/terminal output (HERMES_QUIET) so the tqdm
|
||||
bar stays visible.
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
# Route all logging through tqdm.write() so the progress bar stays
|
||||
# pinned at the bottom while log lines scroll above it.
|
||||
from tqdm import tqdm
|
||||
|
||||
class _TqdmHandler(logging.Handler):
|
||||
def emit(self, record):
|
||||
try:
|
||||
tqdm.write(self.format(record))
|
||||
except Exception:
|
||||
self.handleError(record)
|
||||
|
||||
handler = _TqdmHandler()
|
||||
handler.setFormatter(logging.Formatter(
|
||||
"%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
))
|
||||
root = logging.getLogger()
|
||||
root.handlers = [handler] # Replace any existing handlers
|
||||
root.setLevel(logging.INFO)
|
||||
|
||||
# Silence noisy third-party loggers that flood the output
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING) # Every HTTP request
|
||||
logging.getLogger("openai").setLevel(logging.WARNING) # OpenAI client retries
|
||||
logging.getLogger("rex-deploy").setLevel(logging.WARNING) # Swerex deployment
|
||||
logging.getLogger("rex_image_builder").setLevel(logging.WARNING) # Image builds
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print("Starting Terminal-Bench 2.0 Evaluation")
|
||||
print(f"{'='*60}")
|
||||
print(f" Dataset: {self.config.dataset_name}")
|
||||
print(f" Total tasks: {len(self.all_eval_items)}")
|
||||
print(f" Max agent turns: {self.config.max_agent_turns}")
|
||||
print(f" Task timeout: {self.config.task_timeout}s")
|
||||
print(f" Terminal backend: {self.config.terminal_backend}")
|
||||
print(f" Tool thread pool: {self.config.tool_pool_size}")
|
||||
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
|
||||
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
|
||||
print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# Semaphore to limit concurrent Modal sandbox creations.
|
||||
# Without this, all 86 tasks fire simultaneously, each creating a Modal
|
||||
# sandbox via asyncio.run() inside a thread pool worker. Modal's blocking
|
||||
# calls (App.lookup, etc.) deadlock when too many are created at once.
|
||||
semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
|
||||
|
||||
async def _eval_with_semaphore(item):
|
||||
async with semaphore:
|
||||
return await self._eval_with_timeout(item)
|
||||
|
||||
# Fire all tasks with wall-clock timeout, track live accuracy on the bar
|
||||
total_tasks = len(self.all_eval_items)
|
||||
eval_tasks = [
|
||||
asyncio.ensure_future(_eval_with_semaphore(item))
|
||||
for item in self.all_eval_items
|
||||
]
|
||||
|
||||
results = []
|
||||
passed_count = 0
|
||||
pbar = tqdm(total=total_tasks, desc="Evaluating TB2", dynamic_ncols=True)
|
||||
try:
|
||||
for coro in asyncio.as_completed(eval_tasks):
|
||||
result = await coro
|
||||
results.append(result)
|
||||
if result and result.get("passed"):
|
||||
passed_count += 1
|
||||
done = len(results)
|
||||
pct = (passed_count / done * 100) if done else 0
|
||||
pbar.set_postfix_str(f"pass={passed_count}/{done} ({pct:.1f}%)")
|
||||
pbar.update(1)
|
||||
except (KeyboardInterrupt, asyncio.CancelledError):
|
||||
pbar.close()
|
||||
print(f"\n\nInterrupted! Cleaning up {len(eval_tasks)} tasks...")
|
||||
# Cancel all pending tasks
|
||||
for task in eval_tasks:
|
||||
task.cancel()
|
||||
# Let cancellations propagate (finally blocks run cleanup_vm)
|
||||
await asyncio.gather(*eval_tasks, return_exceptions=True)
|
||||
# Belt-and-suspenders: clean up any remaining sandboxes
|
||||
from tools.terminal_tool import cleanup_all_environments
|
||||
cleanup_all_environments()
|
||||
print("All sandboxes cleaned up.")
|
||||
return
|
||||
finally:
|
||||
pbar.close()
|
||||
|
||||
end_time = time.time()
|
||||
|
||||
# Filter out None results (shouldn't happen, but be safe)
|
||||
valid_results = [r for r in results if r is not None]
|
||||
|
||||
if not valid_results:
|
||||
print("Warning: No valid evaluation results obtained")
|
||||
return
|
||||
|
||||
# ---- Compute metrics ----
|
||||
total = len(valid_results)
|
||||
passed = sum(1 for r in valid_results if r.get("passed"))
|
||||
overall_pass_rate = passed / total if total > 0 else 0.0
|
||||
|
||||
# Per-category breakdown
|
||||
cat_results: Dict[str, List[Dict]] = defaultdict(list)
|
||||
for r in valid_results:
|
||||
cat_results[r.get("category", "unknown")].append(r)
|
||||
|
||||
# Build metrics dict
|
||||
eval_metrics = {
|
||||
"eval/pass_rate": overall_pass_rate,
|
||||
"eval/total_tasks": total,
|
||||
"eval/passed_tasks": passed,
|
||||
"eval/evaluation_time_seconds": end_time - start_time,
|
||||
}
|
||||
|
||||
# Per-category metrics
|
||||
for category, cat_items in sorted(cat_results.items()):
|
||||
cat_passed = sum(1 for r in cat_items if r.get("passed"))
|
||||
cat_total = len(cat_items)
|
||||
cat_pass_rate = cat_passed / cat_total if cat_total > 0 else 0.0
|
||||
cat_key = category.replace(" ", "_").replace("-", "_").lower()
|
||||
eval_metrics[f"eval/pass_rate_{cat_key}"] = cat_pass_rate
|
||||
|
||||
# Store metrics for wandb_log
|
||||
self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]
|
||||
|
||||
# ---- Print summary ----
|
||||
print(f"\n{'='*60}")
|
||||
print("Terminal-Bench 2.0 Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(f"Overall Pass Rate: {overall_pass_rate:.4f} ({passed}/{total})")
|
||||
print(f"Evaluation Time: {end_time - start_time:.1f} seconds")
|
||||
|
||||
print("\nCategory Breakdown:")
|
||||
for category, cat_items in sorted(cat_results.items()):
|
||||
cat_passed = sum(1 for r in cat_items if r.get("passed"))
|
||||
cat_total = len(cat_items)
|
||||
cat_rate = cat_passed / cat_total if cat_total > 0 else 0.0
|
||||
print(f" {category}: {cat_rate:.1%} ({cat_passed}/{cat_total})")
|
||||
|
||||
# Print individual task results
|
||||
print("\nTask Results:")
|
||||
for r in sorted(valid_results, key=lambda x: x.get("task_name", "")):
|
||||
status = "PASS" if r.get("passed") else "FAIL"
|
||||
turns = r.get("turns_used", "?")
|
||||
error = r.get("error", "")
|
||||
extra = f" (error: {error})" if error else ""
|
||||
print(f" [{status}] {r['task_name']} (turns={turns}){extra}")
|
||||
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# Build sample records for evaluate_log (includes full conversations)
|
||||
samples = [
|
||||
{
|
||||
"task_name": r.get("task_name"),
|
||||
"category": r.get("category"),
|
||||
"passed": r.get("passed"),
|
||||
"reward": r.get("reward"),
|
||||
"turns_used": r.get("turns_used"),
|
||||
"error": r.get("error"),
|
||||
"messages": r.get("messages"),
|
||||
}
|
||||
for r in valid_results
|
||||
]
|
||||
|
||||
# Log evaluation results
|
||||
try:
|
||||
await self.evaluate_log(
|
||||
metrics=eval_metrics,
|
||||
samples=samples,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
generation_parameters={
|
||||
"temperature": self.config.agent_temperature,
|
||||
"max_tokens": self.config.max_token_length,
|
||||
"max_agent_turns": self.config.max_agent_turns,
|
||||
"terminal_backend": self.config.terminal_backend,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error logging evaluation results: {e}")
|
||||
|
||||
# Close streaming file
|
||||
if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
|
||||
self._streaming_file.close()
|
||||
print(f" Live results saved to: {self._streaming_path}")
|
||||
|
||||
# Kill all remaining sandboxes. Timed-out tasks leave orphaned thread
|
||||
# pool workers still executing commands -- cleanup_all stops them.
|
||||
from tools.terminal_tool import cleanup_all_environments
|
||||
print("\nCleaning up all sandboxes...")
|
||||
cleanup_all_environments()
|
||||
|
||||
# Shut down the tool thread pool so orphaned workers from timed-out
|
||||
# tasks are killed immediately instead of retrying against dead
|
||||
# sandboxes and spamming the console with TimeoutError warnings.
|
||||
from environments.agent_loop import _tool_executor
|
||||
_tool_executor.shutdown(wait=False, cancel_futures=True)
|
||||
print("Done.")
|
||||
|
||||
# =========================================================================
|
||||
# Wandb logging
|
||||
# =========================================================================
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
|
||||
"""Log TB2-specific metrics to wandb."""
|
||||
if wandb_metrics is None:
|
||||
wandb_metrics = {}
|
||||
|
||||
# Add stored eval metrics
|
||||
for metric_name, metric_value in self.eval_metrics:
|
||||
wandb_metrics[metric_name] = metric_value
|
||||
self.eval_metrics = []
|
||||
|
||||
await super().wandb_log(wandb_metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
TerminalBench2EvalEnv.cli()
|
||||
|
||||
@@ -229,6 +229,12 @@ class HermesAgentBaseEnv(BaseEnv):
        from environments.agent_loop import resize_tool_pool
        resize_tool_pool(config.tool_pool_size)

        # Set tool_parser on the ServerManager so ManagedServer uses it
        # for bidirectional tool call translation (raw text ↔ OpenAI tool_calls).
        if hasattr(self.server, 'tool_parser'):
            self.server.tool_parser = config.tool_call_parser
            print(f"🔧 Tool parser: {config.tool_call_parser}")

        # Current group's resolved tools (set in collect_trajectories)
        self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None

@@ -466,22 +472,14 @@ class HermesAgentBaseEnv(BaseEnv):
        # Run the agent loop
        result: AgentResult
        if self._use_managed_server():
            # Phase 2: ManagedServer with parser -- exact tokens + logprobs
            # Load the tool call parser from registry based on config
            from environments.tool_call_parsers import get_parser
            try:
                tc_parser = get_parser(self.config.tool_call_parser)
            except KeyError:
                logger.warning(
                    "Tool call parser '%s' not found, falling back to 'hermes'",
                    self.config.tool_call_parser,
                )
                tc_parser = get_parser("hermes")

            # Phase 2: ManagedServer with ToolCallTranslator -- exact tokens + logprobs
            # tool_parser is set on ServerManager in __init__ and passed through
            # to ManagedServer, which uses ToolCallTranslator for bidirectional
            # translation between raw text and OpenAI tool_calls.
            try:
                async with self.server.managed_server(
                    tokenizer=self.tokenizer,
                    tool_call_parser=tc_parser,
                    preserve_think_blocks=bool(self.config.thinking_mode),
                ) as managed:
                    agent = HermesAgentLoop(
                        server=managed,
@@ -114,11 +114,27 @@ def _patch_swerex_modal():
            self._worker = _AsyncWorker()
            self._worker.start()

            # Pre-build a modal.Image with pip fix for Modal's legacy image builder.
            # Modal requires `python -m pip` to work during image build, but some
            # task images (e.g., TBLite's broken-python) have intentionally broken pip.
            # Fix: remove stale pip dist-info and reinstall via ensurepip before Modal
            # tries to use it. This is a no-op for images where pip already works.
            import modal as _modal
            image_spec = self.config.image
            if isinstance(image_spec, str):
                image_spec = _modal.Image.from_registry(
                    image_spec,
                    setup_dockerfile_commands=[
                        "RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
                        "python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
                    ],
                )

            # Create AND start the deployment entirely on the worker's loop/thread
            # so all gRPC channels and async state are bound to that loop
            async def _create_and_start():
                deployment = ModalDeployment(
                    image=self.config.image,
                    image=image_spec,
                    startup_timeout=self.config.startup_timeout,
                    runtime_timeout=self.config.runtime_timeout,
                    deployment_timeout=self.config.deployment_timeout,
@@ -61,7 +61,7 @@ def build_channel_directory(adapters: Dict[Any, Any]) -> Dict[str, Any]:
            logger.warning("Channel directory: failed to build %s: %s", platform.value, e)

    # Telegram, WhatsApp & Signal can't enumerate chats -- pull from session history
    for plat_name in ("telegram", "whatsapp", "signal"):
    for plat_name in ("telegram", "whatsapp", "signal", "email"):
        if plat_name not in platforms:
            platforms[plat_name] = _build_from_sessions(plat_name)
@@ -28,6 +28,7 @@ class Platform(Enum):
    SLACK = "slack"
    SIGNAL = "signal"
    HOMEASSISTANT = "homeassistant"
    EMAIL = "email"


@dataclass

@@ -167,6 +168,9 @@ class GatewayConfig:
            # Signal uses extra dict for config (http_url + account)
            elif platform == Platform.SIGNAL and config.extra.get("http_url"):
                connected.append(platform)
            # Email uses extra dict for config (address + imap_host + smtp_host)
            elif platform == Platform.EMAIL and config.extra.get("address"):
                connected.append(platform)
        return connected

    def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]:

@@ -420,6 +424,28 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
    if hass_url:
        config.platforms[Platform.HOMEASSISTANT].extra["url"] = hass_url

    # Email
    email_addr = os.getenv("EMAIL_ADDRESS")
    email_pwd = os.getenv("EMAIL_PASSWORD")
    email_imap = os.getenv("EMAIL_IMAP_HOST")
    email_smtp = os.getenv("EMAIL_SMTP_HOST")
    if all([email_addr, email_pwd, email_imap, email_smtp]):
        if Platform.EMAIL not in config.platforms:
            config.platforms[Platform.EMAIL] = PlatformConfig()
        config.platforms[Platform.EMAIL].enabled = True
        config.platforms[Platform.EMAIL].extra.update({
            "address": email_addr,
            "imap_host": email_imap,
            "smtp_host": email_smtp,
        })
        email_home = os.getenv("EMAIL_HOME_ADDRESS")
        if email_home:
            config.platforms[Platform.EMAIL].home_channel = HomeChannel(
                platform=Platform.EMAIL,
                chat_id=email_home,
                name=os.getenv("EMAIL_HOME_ADDRESS_NAME", "Home"),
            )

    # Session settings
    idle_minutes = os.getenv("SESSION_IDLE_MINUTES")
    if idle_minutes:
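Note which variables are strictly required: the `all([...])` check above enables the email platform only when all four of EMAIL_ADDRESS, EMAIL_PASSWORD, EMAIL_IMAP_HOST, and EMAIL_SMTP_HOST are set; everything else has defaults or is optional. A minimal .env sketch (hostnames and addresses are placeholders):

```bash
# Minimal .env needed for the email platform to be auto-enabled
EMAIL_ADDRESS=hermes@example.com
EMAIL_PASSWORD=app-password-here
EMAIL_IMAP_HOST=imap.example.com
EMAIL_SMTP_HOST=smtp.example.com

# Optional: route home-channel messages to a specific address
EMAIL_HOME_ADDRESS=you@example.com
```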
533  gateway/platforms/email.py (new file)
@@ -0,0 +1,533 @@
"""
Email platform adapter for the Hermes gateway.

Allows users to interact with Hermes by sending emails.
Uses IMAP to receive and SMTP to send messages.

Environment variables:
    EMAIL_IMAP_HOST — IMAP server host (e.g., imap.gmail.com)
    EMAIL_IMAP_PORT — IMAP server port (default: 993)
    EMAIL_SMTP_HOST — SMTP server host (e.g., smtp.gmail.com)
    EMAIL_SMTP_PORT — SMTP server port (default: 587)
    EMAIL_ADDRESS — Email address for the agent
    EMAIL_PASSWORD — Email password or app-specific password
    EMAIL_POLL_INTERVAL — Seconds between mailbox checks (default: 15)
    EMAIL_ALLOWED_USERS — Comma-separated list of allowed sender addresses
"""

import asyncio
import email as email_lib
import imaplib
import logging
import os
import re
import smtplib
import uuid
from datetime import datetime
from email.header import decode_header
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from pathlib import Path
from typing import Any, Dict, List, Optional

from gateway.platforms.base import (
    BasePlatformAdapter,
    MessageEvent,
    MessageType,
    SendResult,
    cache_document_from_bytes,
    cache_image_from_bytes,
)
from gateway.config import Platform, PlatformConfig

logger = logging.getLogger(__name__)

# Gmail-safe max length per email body
MAX_MESSAGE_LENGTH = 50_000

# Supported image extensions for inline detection
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp"}


def check_email_requirements() -> bool:
    """Check if email platform dependencies are available."""
    addr = os.getenv("EMAIL_ADDRESS")
    pwd = os.getenv("EMAIL_PASSWORD")
    imap = os.getenv("EMAIL_IMAP_HOST")
    smtp = os.getenv("EMAIL_SMTP_HOST")
    if not all([addr, pwd, imap, smtp]):
        return False
    return True


def _decode_header_value(raw: str) -> str:
    """Decode an RFC 2047 encoded email header into a plain string."""
    parts = decode_header(raw)
    decoded = []
    for part, charset in parts:
        if isinstance(part, bytes):
            decoded.append(part.decode(charset or "utf-8", errors="replace"))
        else:
            decoded.append(part)
    return " ".join(decoded)


def _extract_text_body(msg: email_lib.message.Message) -> str:
    """Extract the plain-text body from a potentially multipart email."""
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            disposition = str(part.get("Content-Disposition", ""))
            # Skip attachments
            if "attachment" in disposition:
                continue
            if content_type == "text/plain":
                payload = part.get_payload(decode=True)
                if payload:
                    charset = part.get_content_charset() or "utf-8"
                    return payload.decode(charset, errors="replace")
        # Fallback: try text/html and strip tags
        for part in msg.walk():
            content_type = part.get_content_type()
            disposition = str(part.get("Content-Disposition", ""))
            if "attachment" in disposition:
                continue
            if content_type == "text/html":
                payload = part.get_payload(decode=True)
                if payload:
                    charset = part.get_content_charset() or "utf-8"
                    html = payload.decode(charset, errors="replace")
                    return _strip_html(html)
        return ""
    else:
        payload = msg.get_payload(decode=True)
        if payload:
            charset = msg.get_content_charset() or "utf-8"
            text = payload.decode(charset, errors="replace")
            if msg.get_content_type() == "text/html":
                return _strip_html(text)
            return text
        return ""


def _strip_html(html: str) -> str:
    """Naive HTML tag stripper for fallback text extraction."""
    text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
    text = re.sub(r"<p[^>]*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"</p>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"&nbsp;", " ", text)
    text = re.sub(r"&amp;", "&", text)
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _extract_email_address(raw: str) -> str:
    """Extract bare email address from 'Name <addr>' format."""
    match = re.search(r"<([^>]+)>", raw)
    if match:
        return match.group(1).strip().lower()
    return raw.strip().lower()


def _extract_attachments(msg: email_lib.message.Message) -> List[Dict[str, Any]]:
    """Extract attachment metadata and cache files locally."""
    attachments = []
    if not msg.is_multipart():
        return attachments

    for part in msg.walk():
        disposition = str(part.get("Content-Disposition", ""))
        if "attachment" not in disposition and "inline" not in disposition:
            continue
        # Skip text/plain and text/html body parts
        content_type = part.get_content_type()
        if content_type in ("text/plain", "text/html") and "attachment" not in disposition:
            continue

        filename = part.get_filename()
        if filename:
            filename = _decode_header_value(filename)
        else:
            ext = part.get_content_subtype() or "bin"
            filename = f"attachment.{ext}"

        payload = part.get_payload(decode=True)
        if not payload:
            continue

        ext = Path(filename).suffix.lower()
        if ext in _IMAGE_EXTS:
            cached_path = cache_image_from_bytes(payload, ext)
            attachments.append({
                "path": cached_path,
                "filename": filename,
                "type": "image",
                "media_type": content_type,
            })
        else:
            cached_path = cache_document_from_bytes(payload, filename)
            attachments.append({
                "path": cached_path,
                "filename": filename,
                "type": "document",
                "media_type": content_type,
            })

    return attachments


class EmailAdapter(BasePlatformAdapter):
    """Email gateway adapter using IMAP (receive) and SMTP (send)."""

    def __init__(self, config: PlatformConfig):
        super().__init__(config, Platform.EMAIL)

        self._address = os.getenv("EMAIL_ADDRESS", "")
        self._password = os.getenv("EMAIL_PASSWORD", "")
        self._imap_host = os.getenv("EMAIL_IMAP_HOST", "")
        self._imap_port = int(os.getenv("EMAIL_IMAP_PORT", "993"))
        self._smtp_host = os.getenv("EMAIL_SMTP_HOST", "")
        self._smtp_port = int(os.getenv("EMAIL_SMTP_PORT", "587"))
        self._poll_interval = int(os.getenv("EMAIL_POLL_INTERVAL", "15"))

        # Track message IDs we've already processed to avoid duplicates
        self._seen_uids: set = set()
        self._poll_task: Optional[asyncio.Task] = None

        # Map chat_id (sender email) -> last subject + message-id for threading
        self._thread_context: Dict[str, Dict[str, str]] = {}

        logger.info("[Email] Adapter initialized for %s", self._address)

    async def connect(self) -> bool:
        """Connect to the IMAP server and start polling for new messages."""
        try:
            # Test IMAP connection
            imap = imaplib.IMAP4_SSL(self._imap_host, self._imap_port)
            imap.login(self._address, self._password)
            # Mark all existing messages as seen so we only process new ones
            imap.select("INBOX")
            status, data = imap.search(None, "ALL")
            if status == "OK" and data[0]:
                for uid in data[0].split():
                    self._seen_uids.add(uid)
            imap.logout()
            logger.info("[Email] IMAP connection test passed. %d existing messages skipped.", len(self._seen_uids))
        except Exception as e:
            logger.error("[Email] IMAP connection failed: %s", e)
            return False

        try:
            # Test SMTP connection
            smtp = smtplib.SMTP(self._smtp_host, self._smtp_port)
            smtp.starttls()
            smtp.login(self._address, self._password)
            smtp.quit()
            logger.info("[Email] SMTP connection test passed.")
        except Exception as e:
            logger.error("[Email] SMTP connection failed: %s", e)
            return False

        self._running = True
        self._poll_task = asyncio.create_task(self._poll_loop())
        print(f"[Email] Connected as {self._address}")
        return True

    async def disconnect(self) -> None:
        """Stop polling and disconnect."""
        self._running = False
        if self._poll_task:
            self._poll_task.cancel()
            try:
                await self._poll_task
            except asyncio.CancelledError:
                pass
            self._poll_task = None
        logger.info("[Email] Disconnected.")

    async def _poll_loop(self) -> None:
        """Poll IMAP for new messages at regular intervals."""
        while self._running:
            try:
                await self._check_inbox()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("[Email] Poll error: %s", e)
            await asyncio.sleep(self._poll_interval)

    async def _check_inbox(self) -> None:
        """Check INBOX for unseen messages and dispatch them."""
        # Run IMAP operations in a thread to avoid blocking the event loop
        loop = asyncio.get_running_loop()
        messages = await loop.run_in_executor(None, self._fetch_new_messages)
        for msg_data in messages:
            await self._dispatch_message(msg_data)

    def _fetch_new_messages(self) -> List[Dict[str, Any]]:
        """Fetch new (unseen) messages from IMAP. Runs in executor thread."""
        results = []
        try:
            imap = imaplib.IMAP4_SSL(self._imap_host, self._imap_port)
            imap.login(self._address, self._password)
            imap.select("INBOX")

            status, data = imap.search(None, "UNSEEN")
            if status != "OK" or not data[0]:
                imap.logout()
                return results

            for uid in data[0].split():
                if uid in self._seen_uids:
                    continue
                self._seen_uids.add(uid)

                status, msg_data = imap.fetch(uid, "(RFC822)")
                if status != "OK":
                    continue

                raw_email = msg_data[0][1]
                msg = email_lib.message_from_bytes(raw_email)

                sender_raw = msg.get("From", "")
                sender_addr = _extract_email_address(sender_raw)
                sender_name = _decode_header_value(sender_raw)
                # Remove email from name if present
                if "<" in sender_name:
                    sender_name = sender_name.split("<")[0].strip().strip('"')

                subject = _decode_header_value(msg.get("Subject", "(no subject)"))
                message_id = msg.get("Message-ID", "")
                in_reply_to = msg.get("In-Reply-To", "")
                body = _extract_text_body(msg)
                attachments = _extract_attachments(msg)

                results.append({
                    "uid": uid,
                    "sender_addr": sender_addr,
"sender_name": sender_name,
|
||||
"subject": subject,
|
||||
"message_id": message_id,
|
||||
"in_reply_to": in_reply_to,
|
||||
"body": body,
|
||||
"attachments": attachments,
|
||||
"date": msg.get("Date", ""),
|
||||
})
|
||||
|
||||
imap.logout()
|
||||
except Exception as e:
|
||||
logger.error("[Email] IMAP fetch error: %s", e)
|
||||
return results
|
||||
|
||||
async def _dispatch_message(self, msg_data: Dict[str, Any]) -> None:
|
||||
"""Convert a fetched email into a MessageEvent and dispatch it."""
|
||||
sender_addr = msg_data["sender_addr"]
|
||||
|
||||
# Skip self-messages
|
||||
if sender_addr == self._address.lower():
|
||||
return
|
||||
|
||||
subject = msg_data["subject"]
|
||||
body = msg_data["body"].strip()
|
||||
attachments = msg_data["attachments"]
|
||||
|
||||
# Build message text: include subject as context
|
||||
text = body
|
||||
if subject and not subject.startswith("Re:"):
|
||||
text = f"[Subject: {subject}]\n\n{body}"
|
||||
|
||||
# Determine message type and media
|
||||
media_urls = []
|
||||
media_types = []
|
||||
msg_type = MessageType.TEXT
|
||||
|
||||
for att in attachments:
|
||||
media_urls.append(att["path"])
|
||||
media_types.append(att["media_type"])
|
||||
if att["type"] == "image":
|
||||
msg_type = MessageType.PHOTO
|
||||
|
||||
# Store thread context for reply threading
|
||||
self._thread_context[sender_addr] = {
|
||||
"subject": subject,
|
||||
"message_id": msg_data["message_id"],
|
||||
}
|
||||
|
||||
source = self.build_source(
|
||||
chat_id=sender_addr,
|
||||
chat_name=msg_data["sender_name"] or sender_addr,
|
||||
chat_type="dm",
|
||||
user_id=sender_addr,
|
||||
user_name=msg_data["sender_name"] or sender_addr,
|
||||
)
|
||||
|
||||
event = MessageEvent(
|
||||
text=text or "(empty email)",
|
||||
message_type=msg_type,
|
||||
source=source,
|
||||
message_id=msg_data["message_id"],
|
||||
media_urls=media_urls,
|
||||
media_types=media_types,
|
||||
reply_to_message_id=msg_data["in_reply_to"] or None,
|
||||
)
|
||||
|
||||
logger.info("[Email] New message from %s: %s", sender_addr, subject)
|
||||
await self.handle_message(event)
|
||||
|
||||
async def send(
|
||||
self,
|
||||
chat_id: str,
|
||||
content: str,
|
||||
reply_to: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> SendResult:
|
||||
"""Send an email reply to the given address."""
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
message_id = await loop.run_in_executor(
|
||||
None, self._send_email, chat_id, content, reply_to
|
||||
)
|
||||
return SendResult(success=True, message_id=message_id)
|
||||
except Exception as e:
|
||||
logger.error("[Email] Send failed to %s: %s", chat_id, e)
|
||||
return SendResult(success=False, error=str(e))
|
||||
|
||||
def _send_email(
|
||||
self,
|
||||
to_addr: str,
|
||||
body: str,
|
||||
reply_to_msg_id: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Send an email via SMTP. Runs in executor thread."""
|
||||
msg = MIMEMultipart()
|
||||
msg["From"] = self._address
|
||||
msg["To"] = to_addr
|
||||
|
||||
# Thread context for reply
|
||||
ctx = self._thread_context.get(to_addr, {})
|
||||
subject = ctx.get("subject", "Hermes Agent")
|
||||
if not subject.startswith("Re:"):
|
||||
subject = f"Re: {subject}"
|
||||
msg["Subject"] = subject
|
||||
|
||||
# Threading headers
|
||||
original_msg_id = reply_to_msg_id or ctx.get("message_id")
|
||||
if original_msg_id:
|
||||
msg["In-Reply-To"] = original_msg_id
|
||||
msg["References"] = original_msg_id
|
||||
|
||||
msg_id = f"<hermes-{uuid.uuid4().hex[:12]}@{self._address.split('@')[1]}>"
|
||||
msg["Message-ID"] = msg_id
|
||||
|
||||
msg.attach(MIMEText(body, "plain", "utf-8"))
|
||||
|
||||
smtp = smtplib.SMTP(self._smtp_host, self._smtp_port)
|
||||
smtp.starttls()
|
||||
smtp.login(self._address, self._password)
|
||||
smtp.send_message(msg)
|
||||
smtp.quit()
|
||||
|
||||
logger.info("[Email] Sent reply to %s (subject: %s)", to_addr, subject)
|
||||
return msg_id
|
||||
|
||||
async def send_typing(self, chat_id: str) -> None:
|
||||
"""Email has no typing indicator — no-op."""
|
||||
pass
|
||||
|
||||
async def send_image(
|
||||
self,
|
||||
chat_id: str,
|
||||
image_url: str,
|
||||
caption: Optional[str] = None,
|
||||
reply_to: Optional[str] = None,
|
||||
) -> SendResult:
|
||||
"""Send an image URL as part of an email body."""
|
||||
text = caption or ""
|
||||
text += f"\n\nImage: {image_url}"
|
||||
return await self.send(chat_id, text.strip(), reply_to)
|
||||
|
||||
async def send_document(
|
||||
self,
|
||||
chat_id: str,
|
||||
file_path: str,
|
||||
caption: Optional[str] = None,
|
||||
file_name: Optional[str] = None,
|
||||
reply_to: Optional[str] = None,
|
||||
) -> SendResult:
|
||||
"""Send a file as an email attachment."""
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
message_id = await loop.run_in_executor(
|
||||
None,
|
||||
self._send_email_with_attachment,
|
||||
chat_id,
|
||||
caption or "",
|
||||
file_path,
|
||||
file_name,
|
||||
)
|
||||
return SendResult(success=True, message_id=message_id)
|
||||
except Exception as e:
|
||||
logger.error("[Email] Send document failed: %s", e)
|
||||
return SendResult(success=False, error=str(e))
|
||||
|
||||
def _send_email_with_attachment(
|
||||
self,
|
||||
to_addr: str,
|
||||
body: str,
|
||||
file_path: str,
|
||||
file_name: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Send an email with a file attachment via SMTP."""
|
||||
msg = MIMEMultipart()
|
||||
msg["From"] = self._address
|
||||
msg["To"] = to_addr
|
||||
|
||||
ctx = self._thread_context.get(to_addr, {})
|
||||
subject = ctx.get("subject", "Hermes Agent")
|
||||
if not subject.startswith("Re:"):
|
||||
subject = f"Re: {subject}"
|
||||
msg["Subject"] = subject
|
||||
|
||||
original_msg_id = ctx.get("message_id")
|
||||
if original_msg_id:
|
||||
msg["In-Reply-To"] = original_msg_id
|
||||
msg["References"] = original_msg_id
|
||||
|
||||
msg_id = f"<hermes-{uuid.uuid4().hex[:12]}@{self._address.split('@')[1]}>"
|
||||
msg["Message-ID"] = msg_id
|
||||
|
||||
if body:
|
||||
msg.attach(MIMEText(body, "plain", "utf-8"))
|
||||
|
||||
# Attach file
|
||||
p = Path(file_path)
|
||||
fname = file_name or p.name
|
||||
with open(p, "rb") as f:
|
||||
part = MIMEBase("application", "octet-stream")
|
||||
part.set_payload(f.read())
|
||||
encoders.encode_base64(part)
|
||||
part.add_header("Content-Disposition", f"attachment; filename={fname}")
|
||||
msg.attach(part)
|
||||
|
||||
smtp = smtplib.SMTP(self._smtp_host, self._smtp_port)
|
||||
smtp.starttls()
|
||||
smtp.login(self._address, self._password)
|
||||
smtp.send_message(msg)
|
||||
smtp.quit()
|
||||
|
||||
return msg_id
|
||||
|
||||
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
|
||||
"""Return basic info about the email chat."""
|
||||
ctx = self._thread_context.get(chat_id, {})
|
||||
return {
|
||||
"name": chat_id,
|
||||
"type": "dm",
|
||||
"chat_id": chat_id,
|
||||
"subject": ctx.get("subject", ""),
|
||||
}
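To make the reply-threading behaviour above concrete: for an incoming email from alice@example.com with subject "Server down" and Message-ID <abc123@mail.example.com> (hypothetical values, assuming the agent address is hermes@gmail.com), the adapter's reply would carry headers roughly like these, so mail clients group it into the same conversation:

Subject: Re: Server down
In-Reply-To: <abc123@mail.example.com>
References: <abc123@mail.example.com>
Message-ID: <hermes-4f2a91c3d8e0@gmail.com>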
|
||||
@@ -672,6 +672,13 @@ class GatewayRunner:
|
||||
return None
|
||||
return HomeAssistantAdapter(config)
|
||||
|
||||
elif platform == Platform.EMAIL:
|
||||
from gateway.platforms.email import EmailAdapter, check_email_requirements
|
||||
if not check_email_requirements():
|
||||
logger.warning("Email: EMAIL_ADDRESS, EMAIL_PASSWORD, EMAIL_IMAP_HOST, or EMAIL_SMTP_HOST not set")
|
||||
return None
|
||||
return EmailAdapter(config)
|
||||
|
||||
return None
|
||||
|
||||
def _is_user_authorized(self, source: SessionSource) -> bool:
|
||||
@@ -701,6 +708,7 @@ class GatewayRunner:
|
||||
Platform.WHATSAPP: "WHATSAPP_ALLOWED_USERS",
|
||||
Platform.SLACK: "SLACK_ALLOWED_USERS",
|
||||
Platform.SIGNAL: "SIGNAL_ALLOWED_USERS",
|
||||
Platform.EMAIL: "EMAIL_ALLOWED_USERS",
|
||||
}
|
||||
platform_allow_all_map = {
|
||||
Platform.TELEGRAM: "TELEGRAM_ALLOW_ALL_USERS",
|
||||
@@ -708,6 +716,7 @@ class GatewayRunner:
|
||||
Platform.WHATSAPP: "WHATSAPP_ALLOW_ALL_USERS",
|
||||
Platform.SLACK: "SLACK_ALLOW_ALL_USERS",
|
||||
Platform.SIGNAL: "SIGNAL_ALLOW_ALL_USERS",
|
||||
Platform.EMAIL: "EMAIL_ALLOW_ALL_USERS",
|
||||
}
|
||||
|
||||
# Per-platform allow-all flag (e.g., DISCORD_ALLOW_ALL_USERS=true)
|
||||
@@ -2014,6 +2023,7 @@ class GatewayRunner:
|
||||
Platform.SLACK: "hermes-slack",
|
||||
Platform.SIGNAL: "hermes-signal",
|
||||
Platform.HOMEASSISTANT: "hermes-homeassistant",
|
||||
Platform.EMAIL: "hermes-email",
|
||||
}
|
||||
platform_toolsets_config = {}
|
||||
try:
|
||||
@@ -2034,6 +2044,7 @@ class GatewayRunner:
|
||||
Platform.SLACK: "slack",
|
||||
Platform.SIGNAL: "signal",
|
||||
Platform.HOMEASSISTANT: "homeassistant",
|
||||
Platform.EMAIL: "email",
|
||||
}.get(source.platform, "telegram")
|
||||
|
||||
config_toolsets = platform_toolsets_config.get(platform_config_key)
|
||||
@@ -2826,6 +2837,7 @@ class GatewayRunner:
|
||||
Platform.SLACK: "hermes-slack",
|
||||
Platform.SIGNAL: "hermes-signal",
|
||||
Platform.HOMEASSISTANT: "hermes-homeassistant",
|
||||
Platform.EMAIL: "hermes-email",
|
||||
}
|
||||
|
||||
# Try to load platform_toolsets from config
|
||||
@@ -2849,6 +2861,7 @@ class GatewayRunner:
|
||||
Platform.SLACK: "slack",
|
||||
Platform.SIGNAL: "signal",
|
||||
Platform.HOMEASSISTANT: "homeassistant",
|
||||
Platform.EMAIL: "email",
|
||||
}.get(source.platform, "telegram")
|
||||
|
||||
# Use config override if present (list of toolsets), otherwise hardcoded default
|
||||
|
||||
@@ -35,6 +35,7 @@ COMMANDS_BY_CATEGORY = {
|
||||
"/prompt": "View/set custom system prompt",
|
||||
"/personality": "Set a predefined personality",
|
||||
"/verbose": "Cycle tool progress display: off → new → all → verbose",
|
||||
"/reasoning": "Manage reasoning effort and display (usage: /reasoning [level|show|hide])",
|
||||
"/skin": "Show or change the display skin/theme",
|
||||
},
|
||||
"Tools & Skills": {
|
||||
|
||||
@@ -143,6 +143,7 @@ DEFAULT_CONFIG = {
|
||||
"personality": "kawaii",
|
||||
"resume_display": "full",
|
||||
"bell_on_complete": False,
|
||||
"show_reasoning": False,
|
||||
"skin": "default",
|
||||
},
|
||||
|
||||
@@ -182,7 +183,16 @@ DEFAULT_CONFIG = {
|
||||
"memory_char_limit": 2200, # ~800 tokens at 2.75 chars/token
|
||||
"user_char_limit": 1375, # ~500 tokens at 2.75 chars/token
|
||||
},
|
||||
|
||||
|
||||
# Subagent delegation — override the provider:model used by delegate_task
|
||||
# so child agents can run on a different (cheaper/faster) provider and model.
|
||||
# Uses the same runtime provider resolution as CLI/gateway startup, so all
|
||||
# configured providers (OpenRouter, Nous, Z.ai, Kimi, etc.) are supported.
|
||||
"delegation": {
|
||||
"model": "", # e.g. "google/gemini-3-flash-preview" (empty = inherit parent model)
|
||||
"provider": "", # e.g. "openrouter" (empty = inherit parent provider + credentials)
|
||||
},
|
||||
|
||||
# Ephemeral prefill messages file — JSON list of {role, content} dicts
|
||||
# injected at the start of every API call for few-shot priming.
|
||||
# Never saved to sessions, logs, or trajectories.
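As a hedged illustration of the expected file shape (the field values here are invented, and the on-disk path is whatever this config option points at), the prefill file is a plain JSON list of {role, content} dicts such as:

[
  {"role": "user", "content": "Example few-shot question"},
  {"role": "assistant", "content": "Example few-shot answer"}
]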
|
||||
@@ -1025,6 +1035,14 @@ def show_config():
|
||||
print(f" Max turns: {config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])}")
|
||||
print(f" Toolsets: {', '.join(config.get('toolsets', ['all']))}")
|
||||
|
||||
# Display
|
||||
print()
|
||||
print(color("◆ Display", Colors.CYAN, Colors.BOLD))
|
||||
display = config.get('display', {})
|
||||
print(f" Personality: {display.get('personality', 'kawaii')}")
|
||||
print(f" Reasoning: {'on' if display.get('show_reasoning', False) else 'off'}")
|
||||
print(f" Bell: {'on' if display.get('bell_on_complete', False) else 'off'}")
|
||||
|
||||
# Terminal
|
||||
print()
|
||||
print(color("◆ Terminal", Colors.CYAN, Colors.BOLD))
|
||||
|
||||
@@ -518,6 +518,32 @@ _PLATFORMS = [
|
||||
"emoji": "📡",
|
||||
"token_var": "SIGNAL_HTTP_URL",
|
||||
},
|
||||
{
|
||||
"key": "email",
|
||||
"label": "Email",
|
||||
"emoji": "📧",
|
||||
"token_var": "EMAIL_ADDRESS",
|
||||
"setup_instructions": [
|
||||
"1. Use a dedicated email account for your Hermes agent",
|
||||
"2. For Gmail: enable 2FA, then create an App Password at",
|
||||
" https://myaccount.google.com/apppasswords",
|
||||
"3. For other providers: use your email password or app-specific password",
|
||||
"4. IMAP must be enabled on your email account",
|
||||
],
|
||||
"vars": [
|
||||
{"name": "EMAIL_ADDRESS", "prompt": "Email address", "password": False,
|
||||
"help": "The email address Hermes will use (e.g., hermes@gmail.com)."},
|
||||
{"name": "EMAIL_PASSWORD", "prompt": "Email password (or app password)", "password": True,
|
||||
"help": "For Gmail, use an App Password (not your regular password)."},
|
||||
{"name": "EMAIL_IMAP_HOST", "prompt": "IMAP host", "password": False,
|
||||
"help": "e.g., imap.gmail.com for Gmail, outlook.office365.com for Outlook."},
|
||||
{"name": "EMAIL_SMTP_HOST", "prompt": "SMTP host", "password": False,
|
||||
"help": "e.g., smtp.gmail.com for Gmail, smtp.office365.com for Outlook."},
|
||||
{"name": "EMAIL_ALLOWED_USERS", "prompt": "Allowed sender emails (comma-separated)", "password": False,
|
||||
"is_allowlist": True,
|
||||
"help": "Only emails from these addresses will be processed."},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -543,6 +569,15 @@ def _platform_status(platform: dict) -> str:
|
||||
if val or account:
|
||||
return "partially configured"
|
||||
return "not configured"
|
||||
if platform.get("key") == "email":
|
||||
pwd = get_env_value("EMAIL_PASSWORD")
|
||||
imap = get_env_value("EMAIL_IMAP_HOST")
|
||||
smtp = get_env_value("EMAIL_SMTP_HOST")
|
||||
if all([val, pwd, imap, smtp]):
|
||||
return "configured"
|
||||
if any([val, pwd, imap, smtp]):
|
||||
return "partially configured"
|
||||
return "not configured"
|
||||
if val:
|
||||
return "configured"
|
||||
return "not configured"
|
||||
|
||||
@@ -292,12 +292,41 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
|
||||
tool_status = []
|
||||
|
||||
# OpenRouter (required for vision, moa)
|
||||
# Vision — works with OpenRouter, Nous OAuth, Codex OAuth, or OpenAI endpoint
|
||||
_has_vision = False
|
||||
_vision_via = None
|
||||
if get_env_value('OPENROUTER_API_KEY'):
|
||||
_has_vision, _vision_via = True, "OpenRouter/Gemini"
|
||||
else:
|
||||
try:
|
||||
_vauth_path = Path(os.path.expanduser("~/.hermes/auth.json"))
|
||||
if _vauth_path.is_file():
|
||||
import json as _vjson
|
||||
_vauth = _vjson.loads(_vauth_path.read_text())
|
||||
if _vauth.get("active_provider") == "nous":
|
||||
_np = _vauth.get("providers", {}).get("nous", {})
|
||||
if _np.get("agent_key") or _np.get("access_token"):
|
||||
_has_vision, _vision_via = True, "Nous Portal/Gemini"
|
||||
elif _vauth.get("active_provider") == "openai-codex":
|
||||
_cp = _vauth.get("providers", {}).get("openai-codex", {})
|
||||
if _cp.get("tokens", {}).get("access_token"):
|
||||
_has_vision, _vision_via = True, "OpenAI Codex"
|
||||
except Exception:
|
||||
pass
|
||||
if not _has_vision:
|
||||
_oai_base = get_env_value('OPENAI_BASE_URL') or ""
|
||||
if get_env_value('OPENAI_API_KEY') and "api.openai.com" in _oai_base.lower():
|
||||
_has_vision, _vision_via = True, "OpenAI"
|
||||
|
||||
if _has_vision:
|
||||
tool_status.append(("Vision (image analysis)", True, None))
|
||||
else:
|
||||
tool_status.append(("Vision (image analysis)", False, "run 'hermes setup' to configure"))
|
||||
|
||||
# Mixture of Agents — requires OpenRouter specifically (calls multiple models)
|
||||
if get_env_value('OPENROUTER_API_KEY'):
|
||||
tool_status.append(("Mixture of Agents", True, None))
|
||||
else:
|
||||
tool_status.append(("Vision (image analysis)", False, "OPENROUTER_API_KEY"))
|
||||
tool_status.append(("Mixture of Agents", False, "OPENROUTER_API_KEY"))
|
||||
|
||||
# Firecrawl (web tools)
|
||||
@@ -889,22 +918,104 @@ def setup_model_provider(config: dict):
|
||||
|
||||
# else: provider_idx == 9 (Keep current) — only shown when a provider already exists
|
||||
|
||||
# ── OpenRouter API Key for tools (if not already set) ──
|
||||
# Tools (vision, web, MoA) use OpenRouter independently of the main provider.
|
||||
# Prompt for OpenRouter key if not set and a non-OpenRouter provider was chosen.
|
||||
if selected_provider in ("nous", "nous-api", "openai-codex", "custom", "zai", "kimi-coding", "minimax", "minimax-cn") and not get_env_value("OPENROUTER_API_KEY"):
|
||||
print()
|
||||
print_header("OpenRouter API Key (for tools)")
|
||||
print_info("Tools like vision analysis, web search, and MoA use OpenRouter")
|
||||
print_info("independently of your main inference provider.")
|
||||
print_info("Get your API key at: https://openrouter.ai/keys")
|
||||
# ── Vision & Image Analysis Setup ──
|
||||
# Vision requires a multimodal-capable provider. Check whether the user's
|
||||
# chosen provider already covers it — if so, skip the prompt entirely.
|
||||
_vision_needs_setup = True
|
||||
|
||||
api_key = prompt(" OpenRouter API key (optional, press Enter to skip)", password=True)
|
||||
if api_key:
|
||||
save_env_value("OPENROUTER_API_KEY", api_key)
|
||||
print_success("OpenRouter API key saved (for tools)")
|
||||
if selected_provider == "openrouter":
|
||||
# OpenRouter → Gemini for vision, already configured
|
||||
_vision_needs_setup = False
|
||||
elif selected_provider == "nous":
|
||||
# Nous Portal OAuth → Gemini via Nous, already configured
|
||||
_vision_needs_setup = False
|
||||
elif selected_provider == "openai-codex":
|
||||
# Codex OAuth → gpt-5.3-codex supports vision
|
||||
_vision_needs_setup = False
|
||||
elif selected_provider == "custom":
|
||||
_custom_base = (get_env_value("OPENAI_BASE_URL") or "").lower()
|
||||
if "api.openai.com" in _custom_base:
|
||||
# Direct OpenAI endpoint — show vision model picker
|
||||
print()
|
||||
print_header("Vision Model")
|
||||
print_info("Your OpenAI endpoint supports vision. Pick a model for image analysis:")
|
||||
_oai_vision_models = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"]
|
||||
_vm_choices = _oai_vision_models + [f"Keep default (gpt-4o-mini)"]
|
||||
_vm_idx = prompt_choice("Select vision model:", _vm_choices, len(_vm_choices) - 1)
|
||||
if _vm_idx < len(_oai_vision_models):
|
||||
save_env_value("AUXILIARY_VISION_MODEL", _oai_vision_models[_vm_idx])
|
||||
print_success(f"Vision model set to {_oai_vision_models[_vm_idx]}")
|
||||
_vision_needs_setup = False
|
||||
|
||||
# Even for providers without native vision, check if existing credentials
|
||||
# from a previous setup already cover it (e.g. user had OpenRouter before
|
||||
# switching to z.ai)
|
||||
if _vision_needs_setup:
|
||||
if get_env_value("OPENROUTER_API_KEY"):
|
||||
_vision_needs_setup = False
|
||||
else:
|
||||
print_info("Skipped - some tools (vision, web scraping) won't work without this")
|
||||
# Check for Nous Portal OAuth in auth.json
|
||||
try:
|
||||
_auth_path = Path.home() / ".hermes" / "auth.json"
|
||||
if _auth_path.is_file():
|
||||
import json as _json
|
||||
_auth_data = _json.loads(_auth_path.read_text())
|
||||
if _auth_data.get("active_provider") == "nous":
|
||||
_nous_p = _auth_data.get("providers", {}).get("nous", {})
|
||||
if _nous_p.get("agent_key") or _nous_p.get("access_token"):
|
||||
_vision_needs_setup = False
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if _vision_needs_setup:
|
||||
_prov_names = {
|
||||
"nous-api": "Nous Portal API key",
|
||||
"zai": "Z.AI / GLM",
|
||||
"kimi-coding": "Kimi / Moonshot",
|
||||
"minimax": "MiniMax",
|
||||
"minimax-cn": "MiniMax CN",
|
||||
"custom": "your custom endpoint",
|
||||
}
|
||||
_prov_display = _prov_names.get(selected_provider, selected_provider or "your provider")
|
||||
|
||||
print()
|
||||
print_header("Vision & Image Analysis (optional)")
|
||||
print_info(f"Vision requires a multimodal-capable provider. {_prov_display}")
|
||||
print_info("doesn't natively support it. Choose how to enable vision,")
|
||||
print_info("or skip to configure later.")
|
||||
print()
|
||||
|
||||
_vision_choices = [
|
||||
"OpenRouter — uses Gemini (free tier at openrouter.ai/keys)",
|
||||
"OpenAI — enter API key & choose a vision model",
|
||||
"Skip for now",
|
||||
]
|
||||
_vision_idx = prompt_choice("Configure vision:", _vision_choices, 2)
|
||||
|
||||
if _vision_idx == 0: # OpenRouter
|
||||
_or_key = prompt(" OpenRouter API key", password=True)
|
||||
if _or_key:
|
||||
save_env_value("OPENROUTER_API_KEY", _or_key)
|
||||
print_success("OpenRouter key saved — vision will use Gemini")
|
||||
else:
|
||||
print_info("Skipped — vision won't be available")
|
||||
elif _vision_idx == 1: # OpenAI
|
||||
_oai_key = prompt(" OpenAI API key", password=True)
|
||||
if _oai_key:
|
||||
save_env_value("OPENAI_API_KEY", _oai_key)
|
||||
save_env_value("OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
_oai_vision_models = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"]
|
||||
_vm_choices = _oai_vision_models + ["Use default (gpt-4o-mini)"]
|
||||
_vm_idx = prompt_choice("Select vision model:", _vm_choices, 0)
|
||||
if _vm_idx < len(_oai_vision_models):
|
||||
save_env_value("AUXILIARY_VISION_MODEL", _oai_vision_models[_vm_idx])
|
||||
print_success(f"Vision configured with OpenAI ({_oai_vision_models[_vm_idx]})")
|
||||
else:
|
||||
print_success("Vision configured with OpenAI (gpt-4o-mini)")
|
||||
else:
|
||||
print_info("Skipped — vision won't be available")
|
||||
else:
|
||||
print_info("Skipped — add later with 'hermes config set OPENROUTER_API_KEY ...'")
|
||||
|
||||
# ── Model Selection (adapts based on provider) ──
|
||||
if selected_provider != "custom": # Custom already prompted for model name
|
||||
|
||||
@@ -22,6 +22,8 @@ PLATFORMS = {
|
||||
"discord": "💬 Discord",
|
||||
"slack": "💼 Slack",
|
||||
"whatsapp": "📱 WhatsApp",
|
||||
"signal": "📡 Signal",
|
||||
"email": "📧 Email",
|
||||
}
|
||||
|
||||
# ─── Config Helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -208,6 +208,7 @@ def show_status(args):
|
||||
"WhatsApp": ("WHATSAPP_ENABLED", None),
|
||||
"Signal": ("SIGNAL_HTTP_URL", "SIGNAL_HOME_CHANNEL"),
|
||||
"Slack": ("SLACK_BOT_TOKEN", None),
|
||||
"Email": ("EMAIL_ADDRESS", "EMAIL_HOME_ADDRESS"),
|
||||
}
|
||||
|
||||
for name, (token_var, home_var) in platforms.items():
|
||||
|
||||
@@ -108,6 +108,8 @@ PLATFORMS = {
|
||||
"discord": {"label": "💬 Discord", "default_toolset": "hermes-discord"},
|
||||
"slack": {"label": "💼 Slack", "default_toolset": "hermes-slack"},
|
||||
"whatsapp": {"label": "📱 WhatsApp", "default_toolset": "hermes-whatsapp"},
|
||||
"signal": {"label": "📡 Signal", "default_toolset": "hermes-signal"},
|
||||
"email": {"label": "📧 Email", "default_toolset": "hermes-email"},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ dependencies = [

[project.optional-dependencies]
modal = ["swe-rex[modal]>=1.4.0"]
daytona = ["daytona>=0.148.0"]
dev = ["pytest", "pytest-asyncio", "mcp>=1.2.0"]
dev = ["pytest", "pytest-asyncio", "pytest-xdist", "mcp>=1.2.0"]
messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"]
cron = ["croniter"]
slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"]

@@ -84,4 +84,4 @@ testpaths = ["tests"]
markers = [
    "integration: marks tests requiring external services (API keys, Modal, etc.)",
]
addopts = "-m 'not integration'"
addopts = "-m 'not integration' -n auto"
run_agent.py (30 changed lines)
@@ -173,6 +173,7 @@ class AIAgent:
|
||||
session_id: str = None,
|
||||
tool_progress_callback: callable = None,
|
||||
thinking_callback: callable = None,
|
||||
reasoning_callback: callable = None,
|
||||
clarify_callback: callable = None,
|
||||
step_callback: callable = None,
|
||||
max_tokens: int = None,
|
||||
@@ -260,6 +261,7 @@ class AIAgent:
|
||||
|
||||
self.tool_progress_callback = tool_progress_callback
|
||||
self.thinking_callback = thinking_callback
|
||||
self.reasoning_callback = reasoning_callback
|
||||
self.clarify_callback = clarify_callback
|
||||
self.step_callback = step_callback
|
||||
self._last_reported_tool = None # Track for "new tool" mode
|
||||
@@ -1779,6 +1781,7 @@ class AIAgent:
|
||||
allowed_keys = {
|
||||
"model", "instructions", "input", "tools", "store",
|
||||
"reasoning", "include", "max_output_tokens", "temperature",
|
||||
"tool_choice", "parallel_tool_calls", "prompt_cache_key",
|
||||
}
|
||||
normalized: Dict[str, Any] = {
|
||||
"model": model,
|
||||
@@ -1804,6 +1807,12 @@ class AIAgent:
|
||||
if isinstance(temperature, (int, float)):
|
||||
normalized["temperature"] = float(temperature)
|
||||
|
||||
# Pass through tool_choice, parallel_tool_calls, prompt_cache_key
|
||||
for passthrough_key in ("tool_choice", "parallel_tool_calls", "prompt_cache_key"):
|
||||
val = api_kwargs.get(passthrough_key)
|
||||
if val is not None:
|
||||
normalized[passthrough_key] = val
|
||||
|
||||
if allow_stream:
|
||||
stream = api_kwargs.get("stream")
|
||||
if stream is not None and stream is not True:
|
||||
@@ -2420,6 +2429,12 @@ class AIAgent:
|
||||
preview = reasoning_text[:100] + "..." if len(reasoning_text) > 100 else reasoning_text
|
||||
logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {preview}")
|
||||
|
||||
if reasoning_text and self.reasoning_callback:
|
||||
try:
|
||||
self.reasoning_callback(reasoning_text)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
msg = {
|
||||
"role": "assistant",
|
||||
"content": assistant_message.content or "",
|
||||
@@ -3454,7 +3469,7 @@ class AIAgent:
|
||||
|
||||
api_start_time = time.time()
|
||||
retry_count = 0
|
||||
max_retries = 6 # Increased to allow longer backoff periods
|
||||
max_retries = 3
|
||||
compression_attempts = 0
|
||||
max_compression_attempts = 3
|
||||
codex_auth_retry_attempted = False
|
||||
@@ -3924,8 +3939,11 @@ class AIAgent:
|
||||
# These indicate a problem with the request itself (bad model ID,
|
||||
# invalid API key, forbidden, etc.) and will never succeed on retry.
|
||||
# Note: 413 and context-length errors are excluded — handled above.
|
||||
# Also catch local validation errors (ValueError, TypeError) — these
|
||||
# are programming bugs, not transient failures.
|
||||
is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
|
||||
is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
|
||||
is_client_error = (is_client_status_error or any(phrase in error_msg for phrase in [
|
||||
is_client_error = (is_local_validation_error or is_client_status_error or any(phrase in error_msg for phrase in [
|
||||
'error code: 401', 'error code: 403',
|
||||
'error code: 404', 'error code: 422',
|
||||
'is not a valid model', 'invalid model', 'model not found',
|
||||
@@ -4470,9 +4488,17 @@ class AIAgent:
|
||||
if final_response and not interrupted:
|
||||
self._honcho_sync(original_user_message, final_response)
|
||||
|
||||
# Extract reasoning from the last assistant message (if any)
|
||||
last_reasoning = None
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "assistant" and msg.get("reasoning"):
|
||||
last_reasoning = msg["reasoning"]
|
||||
break
|
||||
|
||||
# Build result with interrupt info if applicable
|
||||
result = {
|
||||
"final_response": final_response,
|
||||
"last_reasoning": last_reasoning,
|
||||
"messages": messages,
|
||||
"api_calls": api_call_count,
|
||||
"completed": completed,
|
||||
|
||||
@@ -115,7 +115,7 @@ A config for this would look like:
|
||||
|
||||
Reference: Pre-Tokenized Dataset Documentation.
|
||||
|
||||
We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
|
||||
We recommend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
|
||||
|
||||
In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.
|
||||
|
||||
@@ -583,7 +583,7 @@ A config for this would look like:
|
||||
|
||||
Reference: Pre-Tokenized Dataset Documentation.
|
||||
|
||||
We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
|
||||
We recommend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
|
||||
|
||||
In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.
|
||||
|
||||
@@ -796,7 +796,7 @@ A config for this would look like:
|
||||
|
||||
Reference: Pre-Tokenized Dataset Documentation.
|
||||
|
||||
We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
|
||||
We recommend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
|
||||
|
||||
In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.
|
||||
|
||||
|
||||
@@ -1387,7 +1387,7 @@ trainer = SFTTrainer(
|
||||
For **advanced installation instructions** or if you see weird errors during installations:
|
||||
|
||||
1. Install `torch` and `triton`. Go to <https://pytorch.org> to install it. For example `pip install torch torchvision torchaudio triton`
|
||||
2. Confirm if CUDA is installated correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
|
||||
2. Confirm if CUDA is installed correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
|
||||
3. Install `xformers` manually. You can try installing `vllm` and seeing if `vllm` succeeds. Check if `xformers` succeeded with `python -m xformers.info` Go to <https://github.com/facebookresearch/xformers>. Another option is to install `flash-attn` for Ampere GPUs.
|
||||
4. Double check that your versions of Python, CUDA, CUDNN, `torch`, `triton`, and `xformers` are compatible with one another. The [PyTorch Compatibility Matrix](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix) may be useful.
|
||||
5. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`
|
||||
@@ -1824,7 +1824,7 @@ For LLMs, datasets are collections of data that can be used to train our models.
|
||||
[datasets-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide)
|
||||
{% endcontent-ref %}
|
||||
|
||||
For most of our notebook examples, we utilize the [Alpaca dataset](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset) however other notebooks like Vision will use different datasets which may need images in the answer ouput as well.
|
||||
For most of our notebook examples, we utilize the [Alpaca dataset](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset) however other notebooks like Vision will use different datasets which may need images in the answer output as well.
|
||||
|
||||
## 4. Understand Training Hyperparameters
|
||||
|
||||
@@ -13280,7 +13280,7 @@ if __name__ == '__main__':
|
||||
## :detective: Extra Findings & Tips
|
||||
|
||||
1. We find using lower KV cache quantization (4bit) seems to degrade generation quality via empirical tests - more tests need to be done, but we suggest using `q8_0` cache quantization. The goal of quantization is to support longer context lengths since the KV cache uses quite a bit of memory.
|
||||
2. We found the `down_proj` in this model to be extremely sensitive to quantitation. We had to redo some of our dyanmic quants which used 2bits for `down_proj` and now we use 3bits as the minimum for all these matrices.
|
||||
2. We found the `down_proj` in this model to be extremely sensitive to quantitation. We had to redo some of our dynamic quants which used 2bits for `down_proj` and now we use 3bits as the minimum for all these matrices.
|
||||
3. Using `llama.cpp` 's Flash Attention backend does result in somewhat faster decoding speeds. Use `-DGGML_CUDA_FA_ALL_QUANTS=ON` when compiling. Note it's also best to set your CUDA architecture as found in <https://developer.nvidia.com/cuda-gpus> to reduce compilation times, then set it via `-DCMAKE_CUDA_ARCHITECTURES="80"` 
|
||||
4. Using a `min_p=0.01`is probably enough. `llama.cpp`defaults to 0.1, which is probably not necessary. Since a temperature of 0.3 is used anyways, we most likely will very unlikely sample low probability tokens, so removing very unlikely tokens is a good idea. DeepSeek recommends 0.0 temperature for coding tasks.
|
||||
|
||||
@@ -16682,7 +16682,7 @@ Advanced flags which might be useful if you see breaking finetunes, or you want
|
||||
|
||||
<table><thead><tr><th width="397.4666748046875">Environment variable</th><th>Purpose</th><th data-hidden></th></tr></thead><tbody><tr><td><code>os.environ["UNSLOTH_RETURN_LOGITS"] = "1"</code></td><td>Forcibly returns logits - useful for evaluation if logits are needed.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"</code></td><td>Disables auto compiler. Could be useful to debug incorrect finetune results.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"</code></td><td>Disables fast generation for generic models.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_ENABLE_LOGGING"] = "1"</code></td><td>Enables auto compiler logging - useful to see which functions are compiled or not.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FORCE_FLOAT32"] = "1"</code></td><td>On float16 machines, use float32 and not float16 mixed precision. Useful for Gemma 3.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_STUDIO_DISABLED"] = "1"</code></td><td>Disables extra features.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DEBUG"] = "1"</code></td><td>Turns on extremely verbose <code>torch.compile</code>logs.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_MAXIMUM"] = "0"</code></td><td>Enables maximum <code>torch.compile</code>optimizations - not recommended.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_IGNORE_ERRORS"] = "1"</code></td><td>Can turn this off to enable fullgraph parsing.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FULLGRAPH"] = "0"</code></td><td>Enable <code>torch.compile</code> fullgraph mode</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_AUTO_UPDATES"] = "1"</code></td><td>Forces no updates to <code>unsloth-zoo</code></td><td></td></tr></tbody></table>
|
||||
|
||||
Another possiblity is maybe the model uploads we uploaded are corrupted, but unlikely. Try the following:
|
||||
Another possibility is maybe the model uploads we uploaded are corrupted, but unlikely. Try the following:
|
||||
|
||||
```python
|
||||
model, tokenizer = FastVisionModel.from_pretrained(
|
||||
|
||||
@@ -855,7 +855,7 @@ To run Unsloth directly on Windows:
|
||||
For **advanced installation instructions** or if you see weird errors during installations:
|
||||
|
||||
1. Install `torch` and `triton`. Go to <https://pytorch.org> to install it. For example `pip install torch torchvision torchaudio triton`
|
||||
2. Confirm if CUDA is installated correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
|
||||
2. Confirm if CUDA is installed correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
|
||||
3. Install `xformers` manually. You can try installing `vllm` and seeing if `vllm` succeeds. Check if `xformers` succeeded with `python -m xformers.info` Go to <https://github.com/facebookresearch/xformers>. Another option is to install `flash-attn` for Ampere GPUs.
|
||||
4. Double check that your versions of Python, CUDA, CUDNN, `torch`, `triton`, and `xformers` are compatible with one another. The [PyTorch Compatibility Matrix](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix) may be useful.
|
||||
5. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`
|
||||
@@ -2994,7 +2994,7 @@ if __name__ == '__main__':
|
||||
## :detective: Extra Findings & Tips
|
||||
|
||||
1. We find using lower KV cache quantization (4bit) seems to degrade generation quality via empirical tests - more tests need to be done, but we suggest using `q8_0` cache quantization. The goal of quantization is to support longer context lengths since the KV cache uses quite a bit of memory.
|
||||
2. We found the `down_proj` in this model to be extremely sensitive to quantitation. We had to redo some of our dyanmic quants which used 2bits for `down_proj` and now we use 3bits as the minimum for all these matrices.
|
||||
2. We found the `down_proj` in this model to be extremely sensitive to quantitation. We had to redo some of our dynamic quants which used 2bits for `down_proj` and now we use 3bits as the minimum for all these matrices.
|
||||
3. Using `llama.cpp` 's Flash Attention backend does result in somewhat faster decoding speeds. Use `-DGGML_CUDA_FA_ALL_QUANTS=ON` when compiling. Note it's also best to set your CUDA architecture as found in <https://developer.nvidia.com/cuda-gpus> to reduce compilation times, then set it via `-DCMAKE_CUDA_ARCHITECTURES="80"` 
|
||||
4. Using a `min_p=0.01`is probably enough. `llama.cpp`defaults to 0.1, which is probably not necessary. Since a temperature of 0.3 is used anyways, we most likely will very unlikely sample low probability tokens, so removing very unlikely tokens is a good idea. DeepSeek recommends 0.0 temperature for coding tasks.
|
||||
|
||||
@@ -3509,7 +3509,7 @@ Advanced flags which might be useful if you see breaking finetunes, or you want
|
||||
|
||||
<table><thead><tr><th width="397.4666748046875">Environment variable</th><th>Purpose</th><th data-hidden></th></tr></thead><tbody><tr><td><code>os.environ["UNSLOTH_RETURN_LOGITS"] = "1"</code></td><td>Forcibly returns logits - useful for evaluation if logits are needed.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"</code></td><td>Disables auto compiler. Could be useful to debug incorrect finetune results.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"</code></td><td>Disables fast generation for generic models.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_ENABLE_LOGGING"] = "1"</code></td><td>Enables auto compiler logging - useful to see which functions are compiled or not.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FORCE_FLOAT32"] = "1"</code></td><td>On float16 machines, use float32 and not float16 mixed precision. Useful for Gemma 3.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_STUDIO_DISABLED"] = "1"</code></td><td>Disables extra features.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DEBUG"] = "1"</code></td><td>Turns on extremely verbose <code>torch.compile</code>logs.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_MAXIMUM"] = "0"</code></td><td>Enables maximum <code>torch.compile</code>optimizations - not recommended.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_IGNORE_ERRORS"] = "1"</code></td><td>Can turn this off to enable fullgraph parsing.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FULLGRAPH"] = "0"</code></td><td>Enable <code>torch.compile</code> fullgraph mode</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_AUTO_UPDATES"] = "1"</code></td><td>Forces no updates to <code>unsloth-zoo</code></td><td></td></tr></tbody></table>
|
||||
|
||||
Another possiblity is maybe the model uploads we uploaded are corrupted, but unlikely. Try the following:
|
||||
Another possibility is maybe the model uploads we uploaded are corrupted, but unlikely. Try the following:
|
||||
|
||||
**Examples:**
|
||||
|
||||
@@ -9120,7 +9120,7 @@ For LLMs, datasets are collections of data that can be used to train our models.
|
||||
[datasets-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide)
|
||||
{% endcontent-ref %}
|
||||
|
||||
For most of our notebook examples, we utilize the [Alpaca dataset](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset) however other notebooks like Vision will use different datasets which may need images in the answer ouput as well.
|
||||
For most of our notebook examples, we utilize the [Alpaca dataset](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset) however other notebooks like Vision will use different datasets which may need images in the answer output as well.
|
||||
|
||||
## 4. Understand Training Hyperparameters
|
||||
|
||||
|
||||
@@ -115,70 +115,6 @@ class TestCompress:
|
||||
assert result[-2]["content"] == msgs[-2]["content"]
|
||||
|
||||
|
||||
class TestContentToText:
|
||||
"""Test _content_to_text handles all content types without crashing."""
|
||||
|
||||
def test_string_passthrough(self, compressor):
|
||||
assert compressor._content_to_text("hello") == "hello"
|
||||
|
||||
def test_none_returns_empty(self, compressor):
|
||||
assert compressor._content_to_text(None) == ""
|
||||
|
||||
def test_multimodal_text_parts(self, compressor):
|
||||
content = [
|
||||
{"type": "text", "text": "describe this image"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
|
||||
]
|
||||
result = compressor._content_to_text(content)
|
||||
assert "describe this image" in result
|
||||
assert "[image]" in result
|
||||
|
||||
def test_multimodal_mixed_types(self, compressor):
|
||||
content = [
|
||||
{"type": "text", "text": "first part"},
|
||||
{"type": "audio", "audio": {"data": "..."}},
|
||||
{"type": "text", "text": "second part"},
|
||||
]
|
||||
result = compressor._content_to_text(content)
|
||||
assert "first part" in result
|
||||
assert "[audio]" in result
|
||||
assert "second part" in result
|
||||
|
||||
def test_dict_content_json_serialized(self, compressor):
|
||||
content = {"key": "value"}
|
||||
result = compressor._content_to_text(content)
|
||||
assert "key" in result
|
||||
assert "value" in result
|
||||
|
||||
def test_multimodal_in_generate_summary(self):
|
||||
"""Multimodal user messages should not crash _generate_summary."""
|
||||
mock_client = MagicMock()
|
||||
mock_response = MagicMock()
|
||||
mock_response.choices = [MagicMock()]
|
||||
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: image was discussed"
|
||||
mock_client.chat.completions.create.return_value = mock_response
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
|
||||
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "What is in this image?"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
|
||||
]},
|
||||
{"role": "assistant", "content": "I see a cat."},
|
||||
{"role": "user", "content": "thanks"},
|
||||
]
|
||||
|
||||
summary = c._generate_summary(messages)
|
||||
assert isinstance(summary, str)
|
||||
# The prompt sent to the model should contain the text, not raw list
|
||||
prompt = mock_client.chat.completions.create.call_args.kwargs["messages"][0]["content"]
|
||||
assert "What is in this image?" in prompt
|
||||
assert "[image]" in prompt
|
||||
|
||||
|
||||
class TestGenerateSummaryNoneContent:
|
||||
"""Regression: content=None (from tool-call-only assistant messages) must not crash."""
|
||||
|
||||
|
||||
tests/gateway/test_email.py (new file, 1034 lines) — diff suppressed because it is too large
@@ -11,7 +11,7 @@ EXPECTED_COMMANDS = {
    "/help", "/tools", "/toolsets", "/model", "/provider", "/prompt",
    "/personality", "/clear", "/history", "/new", "/reset", "/retry",
    "/undo", "/save", "/config", "/cron", "/skills", "/platforms",
    "/verbose", "/compress", "/title", "/usage", "/insights", "/paste",
    "/verbose", "/reasoning", "/compress", "/title", "/usage", "/insights", "/paste",
    "/reload-mcp", "/rollback", "/background", "/skin", "/quit",
}
tests/test_agent_loop.py (new file, 486 lines)
@@ -0,0 +1,486 @@
|
||||
"""
|
||||
Tests for environments/agent_loop.py — HermesAgentLoop.
|
||||
|
||||
Tests the multi-turn agent engine using mocked servers, without needing
|
||||
real API keys or running servers.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
# Ensure repo root is importable
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
try:
|
||||
from environments.agent_loop import (
|
||||
AgentResult,
|
||||
HermesAgentLoop,
|
||||
ToolError,
|
||||
_extract_reasoning_from_message,
|
||||
resize_tool_pool,
|
||||
)
|
||||
except ImportError:
|
||||
pytest.skip("atroposlib not installed", allow_module_level=True)
|
||||
|
||||
|
||||
# ─── Mock server infrastructure ─────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockFunction:
|
||||
name: str
|
||||
arguments: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockToolCall:
|
||||
id: str
|
||||
function: MockFunction
|
||||
type: str = "function"
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockMessage:
|
||||
content: Optional[str]
|
||||
role: str = "assistant"
|
||||
tool_calls: Optional[List[MockToolCall]] = None
|
||||
reasoning_content: Optional[str] = None
|
||||
reasoning: Optional[str] = None
|
||||
reasoning_details: Optional[list] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockChoice:
|
||||
message: MockMessage
|
||||
finish_reason: str = "stop"
|
||||
index: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockChatCompletion:
|
||||
choices: List[MockChoice]
|
||||
id: str = "chatcmpl-mock"
|
||||
model: str = "mock-model"
|
||||
|
||||
|
||||
class MockServer:
|
||||
"""
|
||||
Mock server that returns pre-configured responses in sequence.
|
||||
Mimics the chat_completion() interface.
|
||||
"""
|
||||
|
||||
def __init__(self, responses: List[MockChatCompletion]):
|
||||
self.responses = responses
|
||||
self.call_count = 0
|
||||
self.call_history: List[Dict[str, Any]] = []
|
||||
|
||||
async def chat_completion(self, **kwargs) -> MockChatCompletion:
|
||||
self.call_history.append(kwargs)
|
||||
if self.call_count >= len(self.responses):
|
||||
# Return a simple text response if we run out
|
||||
return MockChatCompletion(
|
||||
choices=[MockChoice(message=MockMessage(content="Done."))]
|
||||
)
|
||||
resp = self.responses[self.call_count]
|
||||
self.call_count += 1
|
||||
return resp
|
||||
|
||||
|
||||
def make_text_response(content: str) -> MockChatCompletion:
|
||||
"""Create a simple text-only response (no tool calls)."""
|
||||
return MockChatCompletion(
|
||||
choices=[MockChoice(message=MockMessage(content=content))]
|
||||
)
|
||||
|
||||
|
||||
def make_tool_response(
|
||||
tool_name: str,
|
||||
arguments: dict,
|
||||
content: str = "",
|
||||
tool_call_id: str = "call_001",
|
||||
) -> MockChatCompletion:
|
||||
"""Create a response with a single tool call."""
|
||||
return MockChatCompletion(
|
||||
choices=[
|
||||
MockChoice(
|
||||
message=MockMessage(
|
||||
content=content,
|
||||
tool_calls=[
|
||||
MockToolCall(
|
||||
id=tool_call_id,
|
||||
function=MockFunction(
|
||||
name=tool_name,
|
||||
arguments=json.dumps(arguments),
|
||||
),
|
||||
)
|
||||
],
|
||||
),
|
||||
finish_reason="tool_calls",
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# ─── Tests ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestAgentResult:
|
||||
def test_defaults(self):
|
||||
result = AgentResult(messages=[])
|
||||
assert result.messages == []
|
||||
assert result.managed_state is None
|
||||
assert result.turns_used == 0
|
||||
assert result.finished_naturally is False
|
||||
assert result.reasoning_per_turn == []
|
||||
assert result.tool_errors == []
|
||||
|
||||
|
||||
class TestExtractReasoning:
|
||||
def test_reasoning_content_field(self):
|
||||
msg = MockMessage(content="hello", reasoning_content="I think...")
|
||||
assert _extract_reasoning_from_message(msg) == "I think..."
|
||||
|
||||
def test_reasoning_field(self):
|
||||
msg = MockMessage(content="hello", reasoning="Let me consider...")
|
||||
assert _extract_reasoning_from_message(msg) == "Let me consider..."
|
||||
|
||||
def test_reasoning_details(self):
|
||||
detail = MagicMock()
|
||||
detail.text = "Detail reasoning"
|
||||
msg = MockMessage(content="hello", reasoning_details=[detail])
|
||||
assert _extract_reasoning_from_message(msg) == "Detail reasoning"
|
||||
|
||||
def test_reasoning_details_dict_format(self):
|
||||
msg = MockMessage(
|
||||
content="hello",
|
||||
reasoning_details=[{"text": "Dict reasoning"}],
|
||||
)
|
||||
assert _extract_reasoning_from_message(msg) == "Dict reasoning"
|
||||
|
||||
def test_no_reasoning(self):
|
||||
msg = MockMessage(content="hello")
|
||||
assert _extract_reasoning_from_message(msg) is None
|
||||
|
||||
def test_reasoning_content_takes_priority(self):
|
||||
msg = MockMessage(
|
||||
content="hello",
|
||||
reasoning_content="First",
|
||||
reasoning="Second",
|
||||
)
|
||||
assert _extract_reasoning_from_message(msg) == "First"
|
||||
|
||||
|
||||
class TestHermesAgentLoop:
|
||||
"""Test the agent loop with mock servers."""
|
||||
|
||||
@pytest.fixture
|
||||
def basic_tools(self):
|
||||
"""Minimal tool schema for testing."""
|
||||
return [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "terminal",
|
||||
"description": "Run a command",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"command": {
|
||||
"type": "string",
|
||||
"description": "Command to run",
|
||||
}
|
||||
},
|
||||
"required": ["command"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "read_file",
|
||||
"description": "Read a file",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {"type": "string"},
|
||||
},
|
||||
"required": ["path"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
@pytest.fixture
|
||||
def valid_names(self):
|
||||
return {"terminal", "read_file", "todo"}
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_text_response(self, basic_tools, valid_names):
|
||||
"""Model responds with text only, no tool calls."""
|
||||
server = MockServer([make_text_response("Hello! How can I help?")])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally is True
|
||||
assert result.turns_used == 1
|
||||
assert len(result.messages) >= 2 # user + assistant
|
||||
assert result.messages[-1]["role"] == "assistant"
|
||||
assert result.messages[-1]["content"] == "Hello! How can I help?"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_call_then_text(self, basic_tools, valid_names):
|
||||
"""Model calls a tool, then responds with text."""
|
||||
server = MockServer([
|
||||
make_tool_response("todo", {"todos": [{"id": "1", "content": "test", "status": "pending"}]}),
|
||||
make_text_response("I created a todo for you."),
|
||||
])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Create a todo"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally is True
|
||||
assert result.turns_used == 2
|
||||
# Should have: user, assistant (tool_call), tool (result), assistant (text)
|
||||
roles = [m["role"] for m in result.messages]
|
||||
assert roles == ["user", "assistant", "tool", "assistant"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_max_turns_reached(self, basic_tools, valid_names):
|
||||
"""Model keeps calling tools until max_turns is hit."""
|
||||
# Create responses that always call a tool
|
||||
responses = [
|
||||
make_tool_response("todo", {"todos": [{"id": str(i), "content": f"task {i}", "status": "pending"}]}, tool_call_id=f"call_{i}")
|
||||
for i in range(10)
|
||||
]
|
||||
server = MockServer(responses)
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=3,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Keep going"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally is False
|
||||
assert result.turns_used == 3
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unknown_tool_name(self, basic_tools, valid_names):
|
||||
"""Model calls a tool not in valid_tool_names."""
|
||||
server = MockServer([
|
||||
make_tool_response("nonexistent_tool", {"arg": "val"}),
|
||||
make_text_response("OK, that didn't work."),
|
||||
])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Call something weird"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Should record a tool error
|
||||
assert len(result.tool_errors) >= 1
|
||||
assert result.tool_errors[0].tool_name == "nonexistent_tool"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_response(self, basic_tools, valid_names):
|
||||
"""Server returns empty response."""
|
||||
server = MockServer([MockChatCompletion(choices=[])])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally is False
|
||||
assert result.turns_used == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_api_error_handling(self, basic_tools, valid_names):
|
||||
"""Server raises an exception."""
|
||||
|
||||
class FailingServer:
|
||||
async def chat_completion(self, **kwargs):
|
||||
raise ConnectionError("Server unreachable")
|
||||
|
||||
agent = HermesAgentLoop(
|
||||
server=FailingServer(),
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally is False
|
||||
assert result.turns_used == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tools_passed_to_server(self, basic_tools, valid_names):
|
||||
"""Verify tools are passed in the chat_completion kwargs."""
|
||||
server = MockServer([make_text_response("OK")])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
await agent.run(messages)
|
||||
|
||||
assert len(server.call_history) == 1
|
||||
assert "tools" in server.call_history[0]
|
||||
assert server.call_history[0]["tools"] == basic_tools
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extra_body_forwarded(self, basic_tools, valid_names):
|
||||
"""extra_body should be forwarded to server."""
|
||||
extra = {"provider": {"ignore": ["DeepInfra"]}}
|
||||
server = MockServer([make_text_response("OK")])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
extra_body=extra,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
await agent.run(messages)
|
||||
|
||||
assert server.call_history[0].get("extra_body") == extra
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_managed_state_returned(self, basic_tools, valid_names):
|
||||
"""If server has get_state(), result should include managed_state."""
|
||||
server = MockServer([make_text_response("OK")])
|
||||
server.get_state = lambda: {"nodes": [{"test": True}]}
|
||||
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.managed_state is not None
|
||||
assert "nodes" in result.managed_state
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_managed_state_without_get_state(self, basic_tools, valid_names):
|
||||
"""Regular server without get_state() should return None managed_state."""
|
||||
server = MockServer([make_text_response("OK")])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.managed_state is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_memory_tool_blocked(self, basic_tools):
|
||||
"""Memory tool should return error in RL environments."""
|
||||
valid = {"terminal", "read_file", "todo", "memory"}
|
||||
server = MockServer([
|
||||
make_tool_response("memory", {"action": "add", "target": "user", "content": "test"}),
|
||||
make_text_response("Done"),
|
||||
])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Remember this"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Find the tool response
|
||||
tool_msgs = [m for m in result.messages if m["role"] == "tool"]
|
||||
assert len(tool_msgs) >= 1
|
||||
tool_result = json.loads(tool_msgs[0]["content"])
|
||||
assert "error" in tool_result
|
||||
assert "not available" in tool_result["error"].lower()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_session_search_blocked(self, basic_tools):
|
||||
"""session_search should return error in RL environments."""
|
||||
valid = {"terminal", "read_file", "todo", "session_search"}
|
||||
server = MockServer([
|
||||
make_tool_response("session_search", {"query": "test"}),
|
||||
make_text_response("Done"),
|
||||
])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "Search sessions"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
tool_msgs = [m for m in result.messages if m["role"] == "tool"]
|
||||
assert len(tool_msgs) >= 1
|
||||
tool_result = json.loads(tool_msgs[0]["content"])
|
||||
assert "error" in tool_result
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_reasoning_content_preserved(self, basic_tools, valid_names):
|
||||
"""Reasoning content should be extracted and preserved."""
|
||||
resp = MockChatCompletion(
|
||||
choices=[
|
||||
MockChoice(
|
||||
message=MockMessage(
|
||||
content="The answer is 42.",
|
||||
reasoning_content="Let me think about this step by step...",
|
||||
)
|
||||
)
|
||||
]
|
||||
)
|
||||
server = MockServer([resp])
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=basic_tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=10,
|
||||
)
|
||||
messages = [{"role": "user", "content": "What is the meaning of life?"}]
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert len(result.reasoning_per_turn) == 1
|
||||
assert result.reasoning_per_turn[0] == "Let me think about this step by step..."
|
||||
|
||||
|
||||
class TestResizeToolPool:
|
||||
def test_resize_works(self):
|
||||
"""resize_tool_pool should not raise."""
|
||||
resize_tool_pool(16) # Small pool for testing
|
||||
resize_tool_pool(128) # Restore default
|
||||
550 tests/test_agent_loop_tool_calling.py (new file)
@@ -0,0 +1,550 @@
"""Integration tests for HermesAgentLoop tool calling.

Tests the full agent loop with real LLM calls via OpenRouter.
Uses stepfun/step-3.5-flash:free by default (zero cost), falls back
to anthropic/claude-sonnet-4 if the free model is unavailable.

These tests verify:
1. Single tool call: model calls a tool, gets result, responds
2. Multi-tool call: model calls multiple tools in one turn
3. Multi-turn: model calls tools across multiple turns
4. Unknown tool rejection: model calling a non-existent tool gets an error
5. Max turns: loop stops when max_turns is reached
6. No tools: model responds without calling any tools
7. Tool error handling: tool execution errors are captured

Run:
    pytest tests/test_agent_loop_tool_calling.py -v
    pytest tests/test_agent_loop_tool_calling.py -v -k "single"  # run one test
"""

import asyncio
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Set
from unittest.mock import patch

import pytest

# Ensure repo root is importable
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from atroposlib.envs.server_handling.openai_server import OpenAIServer  # noqa: F401
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)


# =========================================================================
|
||||
# Test infrastructure
|
||||
# =========================================================================
|
||||
|
||||
# Models to try, in order of preference (free first)
|
||||
_MODELS = [
|
||||
"stepfun/step-3.5-flash:free",
|
||||
"google/gemini-2.0-flash-001",
|
||||
"anthropic/claude-sonnet-4",
|
||||
]
|
||||
|
||||
def _get_api_key():
|
||||
key = os.getenv("OPENROUTER_API_KEY", "")
|
||||
if not key:
|
||||
pytest.skip("OPENROUTER_API_KEY not set")
|
||||
return key
|
||||
|
||||
|
||||
def _make_server(model: str = None):
|
||||
"""Create an OpenAI server for testing."""
|
||||
from atroposlib.envs.server_handling.openai_server import OpenAIServer
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
|
||||
config = APIServerConfig(
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name=model or _MODELS[0],
|
||||
server_type="openai",
|
||||
api_key=_get_api_key(),
|
||||
health_check=False,
|
||||
)
|
||||
return OpenAIServer(config)
|
||||
|
||||
|
||||
async def _try_models(test_fn):
|
||||
"""Try running a test with each model until one works."""
|
||||
last_error = None
|
||||
for model in _MODELS:
|
||||
try:
|
||||
server = _make_server(model)
|
||||
return await test_fn(server, model)
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
if "rate" in str(e).lower() or "limit" in str(e).lower():
|
||||
continue # Rate limited, try next model
|
||||
raise # Real error
|
||||
pytest.skip(f"All models failed. Last error: {last_error}")
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Fake tools for testing
|
||||
# =========================================================================
|
||||
|
||||
# Simple calculator tool
|
||||
CALC_TOOL = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "calculate",
|
||||
"description": "Calculate a math expression. Returns the numeric result.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"expression": {
|
||||
"type": "string",
|
||||
"description": "Math expression to evaluate, e.g. '2 + 3'"
|
||||
}
|
||||
},
|
||||
"required": ["expression"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Weather lookup tool
|
||||
WEATHER_TOOL = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather for a city. Returns temperature and conditions.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "City name, e.g. 'Tokyo'"
|
||||
}
|
||||
},
|
||||
"required": ["city"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Lookup tool (always succeeds)
|
||||
LOOKUP_TOOL = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "lookup",
|
||||
"description": "Look up a fact. Returns a short answer string.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "What to look up"
|
||||
}
|
||||
},
|
||||
"required": ["query"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Error tool (always fails)
|
||||
ERROR_TOOL = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "failing_tool",
|
||||
"description": "A tool that always fails with an error.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"input": {"type": "string"}
|
||||
},
|
||||
"required": ["input"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str:
|
||||
"""Handle fake tool calls for testing."""
|
||||
if tool_name == "calculate":
|
||||
expr = args.get("expression", "0")
|
||||
try:
|
||||
# Safe eval for simple math
|
||||
result = eval(expr, {"__builtins__": {}}, {})
|
||||
return json.dumps({"result": result})
|
||||
except Exception as e:
|
||||
return json.dumps({"error": str(e)})
|
||||
|
||||
elif tool_name == "get_weather":
|
||||
city = args.get("city", "Unknown")
|
||||
# Return canned weather
|
||||
return json.dumps({
|
||||
"city": city,
|
||||
"temperature": 22,
|
||||
"conditions": "sunny",
|
||||
"humidity": 45,
|
||||
})
|
||||
|
||||
elif tool_name == "lookup":
|
||||
query = args.get("query", "")
|
||||
return json.dumps({"answer": f"The answer to '{query}' is 42."})
|
||||
|
||||
elif tool_name == "failing_tool":
|
||||
raise RuntimeError("This tool always fails!")
|
||||
|
||||
return json.dumps({"error": f"Unknown tool: {tool_name}"})
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests
|
||||
# =========================================================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_tool_call():
|
||||
"""Model should call a single tool, get the result, and respond."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[WEATHER_TOOL],
|
||||
valid_tool_names={"get_weather"},
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert isinstance(result, AgentResult)
|
||||
assert result.turns_used >= 2, f"Expected at least 2 turns (tool call + response), got {result.turns_used}"
|
||||
|
||||
# Verify a tool call happened
|
||||
tool_calls_found = False
|
||||
for msg in result.messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
if tc["function"]["name"] == "get_weather":
|
||||
tool_calls_found = True
|
||||
args = json.loads(tc["function"]["arguments"])
|
||||
assert "city" in args
|
||||
assert tool_calls_found, "Model should have called get_weather"
|
||||
|
||||
# Verify tool result is in conversation
|
||||
tool_results = [m for m in result.messages if m.get("role") == "tool"]
|
||||
assert len(tool_results) >= 1, "Should have at least one tool result"
|
||||
|
||||
# Verify the final response references the weather
|
||||
final_msg = result.messages[-1]
|
||||
assert final_msg["role"] == "assistant"
|
||||
assert final_msg["content"], "Final response should have content"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_tool_single_turn():
|
||||
"""Model should call multiple tools in a single turn."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[WEATHER_TOOL, CALC_TOOL],
|
||||
valid_tool_names={"get_weather", "calculate"},
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": (
|
||||
"I need two things at once: "
|
||||
"1) What's the weather in Paris? Use get_weather. "
|
||||
"2) What is 15 * 7? Use calculate. "
|
||||
"Call BOTH tools in a single response."
|
||||
)},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Count distinct tools called
|
||||
tools_called = set()
|
||||
for msg in result.messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
tools_called.add(tc["function"]["name"])
|
||||
|
||||
# At minimum, both tools should have been called (maybe in different turns)
|
||||
assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
|
||||
assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_turn_conversation():
|
||||
"""Agent should handle multiple turns of tool calls."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[LOOKUP_TOOL, CALC_TOOL],
|
||||
valid_tool_names={"lookup", "calculate"},
|
||||
max_turns=10,
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": (
|
||||
"First, use the lookup tool to look up 'meaning of life'. "
|
||||
"Then use calculate to compute 6 * 7. "
|
||||
"Do these in separate tool calls, one at a time."
|
||||
)},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Should have used both tools
|
||||
tools_called = set()
|
||||
for msg in result.messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
tools_called.add(tc["function"]["name"])
|
||||
|
||||
assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}"
|
||||
assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"
|
||||
|
||||
# Should finish naturally
|
||||
assert result.finished_naturally, "Should finish naturally after answering"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unknown_tool_rejected():
|
||||
"""If the model calls a tool not in valid_tool_names, it gets an error."""
|
||||
|
||||
async def _run(server, model):
|
||||
# Only allow "calculate" but give schema for both
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[CALC_TOOL, WEATHER_TOOL],
|
||||
valid_tool_names={"calculate"}, # weather NOT allowed
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What's the weather in London? Use get_weather."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Check if get_weather was called and rejected
|
||||
if result.tool_errors:
|
||||
weather_errors = [e for e in result.tool_errors if e.tool_name == "get_weather"]
|
||||
assert len(weather_errors) > 0, "get_weather should have been rejected"
|
||||
assert "Unknown tool" in weather_errors[0].error
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_max_turns_limit():
|
||||
"""Agent should stop after max_turns even if model keeps calling tools."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[LOOKUP_TOOL],
|
||||
valid_tool_names={"lookup"},
|
||||
max_turns=2, # Very low limit
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": (
|
||||
"Keep looking up facts. Look up 'fact 1', then 'fact 2', "
|
||||
"then 'fact 3', then 'fact 4'. Do them one at a time."
|
||||
)},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}"
|
||||
assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_tools_direct_response():
|
||||
"""When no tools are useful, model should respond directly."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[WEATHER_TOOL],
|
||||
valid_tool_names={"get_weather"},
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=200,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally, "Should finish naturally with a direct response"
|
||||
assert result.turns_used == 1, f"Should take exactly 1 turn for a direct answer, took {result.turns_used}"
|
||||
|
||||
final = result.messages[-1]
|
||||
assert final["role"] == "assistant"
|
||||
assert final["content"], "Should have text content"
|
||||
assert "4" in final["content"], "Should contain the answer '4'"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_error_handling():
|
||||
"""Tool execution errors should be captured and reported to the model."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[ERROR_TOOL],
|
||||
valid_tool_names={"failing_tool"},
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "Please call the failing_tool with input 'test'."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# The tool error should be recorded
|
||||
assert len(result.tool_errors) >= 1, "Should have at least one tool error"
|
||||
assert "RuntimeError" in result.tool_errors[0].error or "always fails" in result.tool_errors[0].error
|
||||
|
||||
# The error should be in the conversation as a tool result
|
||||
tool_results = [m for m in result.messages if m.get("role") == "tool"]
|
||||
assert len(tool_results) >= 1
|
||||
error_result = json.loads(tool_results[0]["content"])
|
||||
assert "error" in error_result
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_agent_result_structure():
|
||||
"""Verify the AgentResult has all expected fields populated."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[CALC_TOOL],
|
||||
valid_tool_names={"calculate"},
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=300,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What is 3 + 4? Use the calculate tool."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Structural checks
|
||||
assert isinstance(result, AgentResult)
|
||||
assert isinstance(result.messages, list)
|
||||
assert len(result.messages) >= 3, "Should have user + assistant(tool) + tool_result + assistant(final)"
|
||||
assert isinstance(result.turns_used, int)
|
||||
assert result.turns_used > 0
|
||||
assert isinstance(result.finished_naturally, bool)
|
||||
assert isinstance(result.tool_errors, list)
|
||||
assert isinstance(result.reasoning_per_turn, list)
|
||||
|
||||
# Messages should follow OpenAI format
|
||||
for msg in result.messages:
|
||||
assert "role" in msg, f"Message missing 'role': {msg}"
|
||||
assert msg["role"] in ("system", "user", "assistant", "tool"), f"Invalid role: {msg['role']}"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_conversation_history_preserved():
|
||||
"""The full conversation history should be in result.messages."""
|
||||
|
||||
async def _run(server, model):
|
||||
agent = HermesAgentLoop(
|
||||
server=server,
|
||||
tool_schemas=[WEATHER_TOOL],
|
||||
valid_tool_names={"get_weather"},
|
||||
max_turns=5,
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful weather assistant."},
|
||||
{"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# System message should be preserved
|
||||
assert result.messages[0]["role"] == "system"
|
||||
assert "weather assistant" in result.messages[0]["content"]
|
||||
|
||||
# User message should be preserved
|
||||
assert result.messages[1]["role"] == "user"
|
||||
assert "Berlin" in result.messages[1]["content"]
|
||||
|
||||
# Should have assistant + tool + assistant sequence
|
||||
roles = [m["role"] for m in result.messages]
|
||||
assert "tool" in roles, "Should have tool results in conversation"
|
||||
|
||||
return result
|
||||
|
||||
await _try_models(_run)
|
||||
359 tests/test_agent_loop_vllm.py (new file)
@@ -0,0 +1,359 @@
"""Integration tests for HermesAgentLoop with a local vLLM server.

Tests the full Phase 2 flow: ManagedServer + tool calling with a real
vLLM backend, producing actual token IDs and logprobs for RL training.

Requires a running vLLM server. Start one from the atropos directory:

    python -m example_trainer.vllm_api_server \
        --model Qwen/Qwen3-4B-Thinking-2507 \
        --port 9001 \
        --gpu-memory-utilization 0.8 \
        --max-model-len=32000

Tests are automatically skipped if the server is not reachable.

Run:
    pytest tests/test_agent_loop_vllm.py -v
    pytest tests/test_agent_loop_vllm.py -v -k "single"
"""

import asyncio
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict
from unittest.mock import patch

import pytest
import requests

# Ensure repo root is importable
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)


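# Illustrative sketch, never invoked by the suite: the Phase 2 flow described
# in the module docstring, condensed from the tests below. _make_server_manager,
# _get_tokenizer and WEATHER_TOOL are the helpers/constants defined later in
# this module.
async def _example_phase2_flow():
    sm = _make_server_manager()
    tokenizer = _get_tokenizer()
    async with sm.managed_server(tokenizer=tokenizer) as managed:
        agent = HermesAgentLoop(
            server=managed,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
        )
        result = await agent.run([{"role": "user", "content": "Weather in Tokyo?"}])
        state = managed.get_state()  # {"nodes": [...]}: token IDs + logprobs per node
        return result, state
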
# =========================================================================
|
||||
# Configuration
|
||||
# =========================================================================
|
||||
|
||||
VLLM_HOST = "localhost"
|
||||
VLLM_PORT = 9001
|
||||
VLLM_BASE_URL = f"http://{VLLM_HOST}:{VLLM_PORT}"
|
||||
VLLM_MODEL = "Qwen/Qwen3-4B-Thinking-2507"
|
||||
|
||||
|
||||
def _vllm_is_running() -> bool:
|
||||
"""Check if the vLLM server is reachable."""
|
||||
try:
|
||||
r = requests.get(f"{VLLM_BASE_URL}/health", timeout=3)
|
||||
return r.status_code == 200
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# Skip all tests in this module if vLLM is not running
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not _vllm_is_running(),
|
||||
reason=(
|
||||
f"vLLM server not reachable at {VLLM_BASE_URL}. "
|
||||
"Start it with: python -m example_trainer.vllm_api_server "
|
||||
f"--model {VLLM_MODEL} --port {VLLM_PORT} "
|
||||
"--gpu-memory-utilization 0.8 --max-model-len=32000"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Server setup
|
||||
# =========================================================================
|
||||
|
||||
def _make_server_manager():
|
||||
"""Create a ServerManager pointing to the local vLLM server."""
|
||||
from atroposlib.envs.server_handling.server_manager import (
|
||||
ServerManager,
|
||||
APIServerConfig,
|
||||
)
|
||||
|
||||
config = APIServerConfig(
|
||||
base_url=VLLM_BASE_URL,
|
||||
model_name=VLLM_MODEL,
|
||||
server_type="vllm",
|
||||
health_check=False,
|
||||
)
|
||||
sm = ServerManager([config], tool_parser="hermes")
|
||||
sm.servers[0].server_healthy = True
|
||||
return sm
|
||||
|
||||
|
||||
def _get_tokenizer():
|
||||
"""Load the tokenizer for the model."""
|
||||
from transformers import AutoTokenizer
|
||||
return AutoTokenizer.from_pretrained(VLLM_MODEL)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Fake tools
|
||||
# =========================================================================
|
||||
|
||||
WEATHER_TOOL = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather for a city. Returns temperature and conditions.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "City name, e.g. 'Tokyo'",
|
||||
}
|
||||
},
|
||||
"required": ["city"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
CALC_TOOL = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "calculate",
|
||||
"description": "Calculate a math expression. Returns the numeric result.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"expression": {
|
||||
"type": "string",
|
||||
"description": "Math expression, e.g. '2 + 3'",
|
||||
}
|
||||
},
|
||||
"required": ["expression"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str:
|
||||
"""Handle fake tool calls for testing."""
|
||||
if tool_name == "get_weather":
|
||||
city = args.get("city", "Unknown")
|
||||
return json.dumps({
|
||||
"city": city,
|
||||
"temperature": 22,
|
||||
"conditions": "sunny",
|
||||
"humidity": 45,
|
||||
})
|
||||
elif tool_name == "calculate":
|
||||
expr = args.get("expression", "0")
|
||||
try:
|
||||
result = eval(expr, {"__builtins__": {}}, {})
|
||||
return json.dumps({"result": result})
|
||||
except Exception as e:
|
||||
return json.dumps({"error": str(e)})
|
||||
return json.dumps({"error": f"Unknown tool: {tool_name}"})
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests
|
||||
# =========================================================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vllm_single_tool_call():
|
||||
"""vLLM model calls a tool, gets result, responds — full Phase 2 flow."""
|
||||
sm = _make_server_manager()
|
||||
tokenizer = _get_tokenizer()
|
||||
|
||||
async with sm.managed_server(tokenizer=tokenizer) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=[WEATHER_TOOL],
|
||||
valid_tool_names={"get_weather"},
|
||||
max_turns=5,
|
||||
temperature=0.6,
|
||||
max_tokens=1000,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert isinstance(result, AgentResult)
|
||||
assert result.turns_used >= 2, f"Expected at least 2 turns, got {result.turns_used}"
|
||||
|
||||
# Verify tool call happened
|
||||
tool_calls_found = False
|
||||
for msg in result.messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
if tc["function"]["name"] == "get_weather":
|
||||
tool_calls_found = True
|
||||
args = json.loads(tc["function"]["arguments"])
|
||||
assert "city" in args
|
||||
assert tool_calls_found, "Model should have called get_weather"
|
||||
|
||||
# Verify tool results in conversation
|
||||
tool_results = [m for m in result.messages if m.get("role") == "tool"]
|
||||
assert len(tool_results) >= 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vllm_multi_tool_calls():
|
||||
"""vLLM model calls multiple tools across turns."""
|
||||
sm = _make_server_manager()
|
||||
tokenizer = _get_tokenizer()
|
||||
|
||||
async with sm.managed_server(tokenizer=tokenizer) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=[WEATHER_TOOL, CALC_TOOL],
|
||||
valid_tool_names={"get_weather", "calculate"},
|
||||
max_turns=10,
|
||||
temperature=0.6,
|
||||
max_tokens=1000,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": (
|
||||
"I need two things: "
|
||||
"1) What's the weather in Paris? Use get_weather. "
|
||||
"2) What is 15 * 7? Use calculate."
|
||||
)},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Both tools should be called
|
||||
tools_called = set()
|
||||
for msg in result.messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
tools_called.add(tc["function"]["name"])
|
||||
|
||||
assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
|
||||
assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vllm_managed_server_produces_nodes():
|
||||
"""ManagedServer should produce SequenceNodes with tokens and logprobs."""
|
||||
sm = _make_server_manager()
|
||||
tokenizer = _get_tokenizer()
|
||||
|
||||
async with sm.managed_server(tokenizer=tokenizer) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=[WEATHER_TOOL],
|
||||
valid_tool_names={"get_weather"},
|
||||
max_turns=5,
|
||||
temperature=0.6,
|
||||
max_tokens=1000,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Get the managed state — should have SequenceNodes
|
||||
state = managed.get_state()
|
||||
|
||||
assert state is not None, "ManagedServer should return state"
|
||||
nodes = state.get("nodes", [])
|
||||
assert len(nodes) >= 1, f"Should have at least 1 node, got {len(nodes)}"
|
||||
|
||||
node = nodes[0]
|
||||
assert hasattr(node, "tokens"), "Node should have tokens"
|
||||
assert hasattr(node, "logprobs"), "Node should have logprobs"
|
||||
assert len(node.tokens) > 0, "Tokens should not be empty"
|
||||
assert len(node.logprobs) > 0, "Logprobs should not be empty"
|
||||
assert len(node.tokens) == len(node.logprobs), (
|
||||
f"Tokens ({len(node.tokens)}) and logprobs ({len(node.logprobs)}) should have same length"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vllm_no_tools_direct_response():
|
||||
"""vLLM model should respond directly when no tools are needed."""
|
||||
sm = _make_server_manager()
|
||||
tokenizer = _get_tokenizer()
|
||||
|
||||
async with sm.managed_server(tokenizer=tokenizer) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=[WEATHER_TOOL],
|
||||
valid_tool_names={"get_weather"},
|
||||
max_turns=5,
|
||||
temperature=0.6,
|
||||
max_tokens=500,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What is 2 + 2? Answer directly, no tools."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
assert result.finished_naturally, "Should finish naturally"
|
||||
assert result.turns_used == 1, f"Should take 1 turn, took {result.turns_used}"
|
||||
|
||||
final = result.messages[-1]
|
||||
assert final["role"] == "assistant"
|
||||
assert final["content"], "Should have content"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vllm_thinking_content_extracted():
|
||||
"""Qwen3-Thinking model should produce reasoning content."""
|
||||
sm = _make_server_manager()
|
||||
tokenizer = _get_tokenizer()
|
||||
|
||||
async with sm.managed_server(
|
||||
tokenizer=tokenizer,
|
||||
preserve_think_blocks=True,
|
||||
) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=[CALC_TOOL],
|
||||
valid_tool_names={"calculate"},
|
||||
max_turns=5,
|
||||
temperature=0.6,
|
||||
max_tokens=1000,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "What is 123 * 456? Use the calculate tool."},
|
||||
]
|
||||
|
||||
with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Qwen3-Thinking should generate <think> blocks
|
||||
# Check if any content contains thinking markers
|
||||
has_thinking = False
|
||||
for msg in result.messages:
|
||||
content = msg.get("content", "") or ""
|
||||
if "<think>" in content or "</think>" in content:
|
||||
has_thinking = True
|
||||
break
|
||||
|
||||
# Also check reasoning_per_turn
|
||||
has_reasoning = any(r for r in result.reasoning_per_turn if r)
|
||||
|
||||
# At least one of these should be true for a thinking model
|
||||
assert has_thinking or has_reasoning, (
|
||||
"Qwen3-Thinking should produce <think> blocks or reasoning content"
|
||||
)
|
||||
178 tests/test_managed_server_tool_support.py (new file)
@@ -0,0 +1,178 @@
"""
Tests for ManagedServer tool_call_parser integration.

Validates that:
1. ManagedServer accepts tool_call_parser parameter (tool_call_support branch)
2. ServerManager.managed_server() passes tool_call_parser through
3. The parser's parse() output is correctly attached to ChatCompletion responses
4. hermes-agent's tool_call_parsers are compatible with ManagedServer's expectations

These tests verify the contract between hermes-agent's environments/ code
and atroposlib's ManagedServer. They detect API incompatibilities early.
"""

import inspect
import sys
from pathlib import Path

import pytest

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

try:
    import atroposlib  # noqa: F401
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)


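# Illustrative sketch, not part of the test suite: the parse() contract the
# classes below exercise, condensed from their assertions. Nothing here is
# called by the tests.
def _example_parse_contract():
    from environments.tool_call_parsers import get_parser

    parser = get_parser("hermes")
    text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>'
    content, tool_calls = parser.parse(text)
    # content is a str (or None when only tool calls are present);
    # tool_calls is None when the text contains no tool calls, otherwise each
    # entry exposes .id and .function.name / .function.arguments, which is
    # what ManagedServer reads when attaching parsed calls to responses.
    return content, tool_calls
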
class TestManagedServerAPI:
|
||||
"""Test that ManagedServer's API matches what hermes-agent expects."""
|
||||
|
||||
def test_managed_server_init_signature(self):
|
||||
"""ManagedServer should accept tool_call_parser parameter."""
|
||||
from atroposlib.envs.server_handling.managed_server import ManagedServer
|
||||
|
||||
sig = inspect.signature(ManagedServer.__init__)
|
||||
params = list(sig.parameters.keys())
|
||||
|
||||
# Core params that must exist
|
||||
assert "self" in params
|
||||
assert "server" in params
|
||||
assert "tokenizer" in params
|
||||
assert "track_tree" in params
|
||||
|
||||
# tool_call_parser — required for tool_call_support branch
|
||||
# If this fails, atroposlib hasn't been updated to tool_call_support
|
||||
has_tool_parser = "tool_call_parser" in params
|
||||
if not has_tool_parser:
|
||||
pytest.skip(
|
||||
"ManagedServer does not have tool_call_parser param — "
|
||||
"baseline atroposlib (pre tool_call_support branch)"
|
||||
)
|
||||
|
||||
def test_server_manager_managed_server_signature(self):
|
||||
"""ServerManager.managed_server() should accept tool_call_parser."""
|
||||
from atroposlib.envs.server_handling.server_manager import ServerManager
|
||||
|
||||
sig = inspect.signature(ServerManager.managed_server)
|
||||
params = list(sig.parameters.keys())
|
||||
|
||||
assert "self" in params
|
||||
assert "tokenizer" in params
|
||||
|
||||
has_tool_parser = "tool_call_parser" in params
|
||||
if not has_tool_parser:
|
||||
pytest.skip(
|
||||
"ServerManager.managed_server() does not have tool_call_parser param — "
|
||||
"baseline atroposlib (pre tool_call_support branch)"
|
||||
)
|
||||
|
||||
def test_managed_server_chat_template_kwargs(self):
|
||||
"""ManagedServer should have CHAT_TEMPLATE_KWARGS for forwarding tools/thinking."""
|
||||
from atroposlib.envs.server_handling.managed_server import ManagedServer
|
||||
|
||||
if not hasattr(ManagedServer, "CHAT_TEMPLATE_KWARGS"):
|
||||
pytest.skip(
|
||||
"ManagedServer does not have CHAT_TEMPLATE_KWARGS — "
|
||||
"baseline atroposlib (pre tool_call_support branch)"
|
||||
)
|
||||
|
||||
kwargs = ManagedServer.CHAT_TEMPLATE_KWARGS
|
||||
assert "tools" in kwargs, "tools must be in CHAT_TEMPLATE_KWARGS"
|
||||
|
||||
def test_no_get_logprobs_method(self):
|
||||
"""get_logprobs should be removed in tool_call_support branch."""
|
||||
from atroposlib.envs.server_handling.managed_server import ManagedServer
|
||||
|
||||
# In baseline, get_logprobs exists. In tool_call_support, it's removed.
|
||||
# We just note the state — not a hard fail either way.
|
||||
has_get_logprobs = hasattr(ManagedServer, "get_logprobs")
|
||||
if has_get_logprobs:
|
||||
pytest.skip(
|
||||
"ManagedServer still has get_logprobs — baseline atroposlib"
|
||||
)
|
||||
|
||||
|
||||
class TestParserCompatibility:
|
||||
"""Test that hermes-agent's parsers match ManagedServer's expectations."""
|
||||
|
||||
def test_parser_parse_returns_correct_format(self):
|
||||
"""
|
||||
ManagedServer expects parser.parse(text) -> (content, tool_calls)
|
||||
where tool_calls is a list of objects with .id, .function.name, .function.arguments
|
||||
"""
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>'
|
||||
content, tool_calls = parser.parse(text)
|
||||
|
||||
assert tool_calls is not None
|
||||
assert len(tool_calls) == 1
|
||||
|
||||
tc = tool_calls[0]
|
||||
# ManagedServer accesses these attrs directly
|
||||
assert hasattr(tc, "id")
|
||||
assert hasattr(tc, "function")
|
||||
assert hasattr(tc.function, "name")
|
||||
assert hasattr(tc.function, "arguments")
|
||||
|
||||
def test_parser_no_tools_returns_none(self):
|
||||
"""ManagedServer checks `if parsed_tool_calls:` — None should be falsy."""
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
content, tool_calls = parser.parse("Just text, no tools")
|
||||
assert tool_calls is None
|
||||
|
||||
def test_parser_content_is_string_or_none(self):
|
||||
"""ManagedServer uses `parsed_content or ""` — must be str or None."""
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
|
||||
# With tool calls
|
||||
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>'
|
||||
content, _ = parser.parse(text)
|
||||
assert content is None or isinstance(content, str)
|
||||
|
||||
# Without tool calls
|
||||
content2, _ = parser.parse("Just text")
|
||||
assert isinstance(content2, str)
|
||||
|
||||
|
||||
class TestBaseEnvCompatibility:
|
||||
"""Test that hermes_base_env.py's managed_server() call matches the API."""
|
||||
|
||||
def test_hermes_base_env_managed_server_call_pattern(self):
|
||||
"""
|
||||
Verify that hermes_base_env.py passes tool_call_parser to managed_server().
|
||||
This is a source-level check — the actual managed_server() call must match.
|
||||
"""
|
||||
import ast
|
||||
|
||||
base_env_path = Path(__file__).parent.parent / "environments" / "hermes_base_env.py"
|
||||
source = base_env_path.read_text()
|
||||
tree = ast.parse(source)
|
||||
|
||||
# Find the managed_server() call
|
||||
found_tool_call_parser_kwarg = False
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.Call):
|
||||
# Look for self.server.managed_server(...)
|
||||
if isinstance(node.func, ast.Attribute) and node.func.attr == "managed_server":
|
||||
for kw in node.keywords:
|
||||
if kw.arg == "tool_call_parser":
|
||||
found_tool_call_parser_kwarg = True
|
||||
|
||||
assert found_tool_call_parser_kwarg, (
|
||||
"hermes_base_env.py should pass tool_call_parser= to managed_server()"
|
||||
)
|
||||
|
||||
def test_hermes_base_env_uses_get_parser(self):
|
||||
"""Verify hermes_base_env imports and uses get_parser from tool_call_parsers."""
|
||||
base_env_path = Path(__file__).parent.parent / "environments" / "hermes_base_env.py"
|
||||
source = base_env_path.read_text()
|
||||
|
||||
assert "from environments.tool_call_parsers import get_parser" in source
|
||||
assert "get_parser(" in source
|
||||
422 tests/test_reasoning_command.py (new file)
@@ -0,0 +1,422 @@
"""Tests for the combined /reasoning command.

Covers both reasoning effort level management and reasoning display toggle,
plus the reasoning extraction and display pipeline from run_agent through CLI.

Combines functionality from:
- PR #789 (Aum08Desai): reasoning effort level management
- PR #790 (0xbyt4): reasoning display toggle and rendering
"""

import unittest
from types import SimpleNamespace
from unittest.mock import MagicMock, patch


# ---------------------------------------------------------------------------
|
||||
# Effort level parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestParseReasoningConfig(unittest.TestCase):
|
||||
"""Verify _parse_reasoning_config handles all effort levels."""
|
||||
|
||||
def _parse(self, effort):
|
||||
from cli import _parse_reasoning_config
|
||||
return _parse_reasoning_config(effort)
|
||||
|
||||
def test_none_disables(self):
|
||||
result = self._parse("none")
|
||||
self.assertEqual(result, {"enabled": False})
|
||||
|
||||
def test_valid_levels(self):
|
||||
for level in ("low", "medium", "high", "xhigh", "minimal"):
|
||||
result = self._parse(level)
|
||||
self.assertIsNotNone(result)
|
||||
self.assertTrue(result.get("enabled"))
|
||||
self.assertEqual(result["effort"], level)
|
||||
|
||||
def test_empty_returns_none(self):
|
||||
self.assertIsNone(self._parse(""))
|
||||
self.assertIsNone(self._parse(" "))
|
||||
|
||||
def test_unknown_returns_none(self):
|
||||
self.assertIsNone(self._parse("ultra"))
|
||||
self.assertIsNone(self._parse("turbo"))
|
||||
|
||||
def test_case_insensitive(self):
|
||||
result = self._parse("HIGH")
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual(result["effort"], "high")
|
||||
|
||||
|
||||
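# Illustrative sketch, not the shipped cli._parse_reasoning_config: behavior
# consistent with the assertions above (valid levels, "none", empty input,
# unknown input, case-insensitivity). The level set comes from test_valid_levels.
_EFFORT_LEVELS = {"minimal", "low", "medium", "high", "xhigh"}

def _parse_reasoning_config_sketch(effort):
    effort = (effort or "").strip().lower()
    if not effort:
        return None                      # empty / whitespace: nothing to change
    if effort == "none":
        return {"enabled": False}        # disable reasoning entirely
    if effort in _EFFORT_LEVELS:
        return {"enabled": True, "effort": effort}
    return None                          # unknown level: rejected
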
# ---------------------------------------------------------------------------
|
||||
# /reasoning command handler (combined effort + display)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHandleReasoningCommand(unittest.TestCase):
|
||||
"""Test the combined _handle_reasoning_command method."""
|
||||
|
||||
def _make_cli(self, reasoning_config=None, show_reasoning=False):
|
||||
"""Create a minimal CLI stub with the reasoning attributes."""
|
||||
stub = SimpleNamespace(
|
||||
reasoning_config=reasoning_config,
|
||||
show_reasoning=show_reasoning,
|
||||
agent=MagicMock(),
|
||||
)
|
||||
return stub
|
||||
|
||||
def test_show_enables_display(self):
|
||||
stub = self._make_cli(show_reasoning=False)
|
||||
# Simulate /reasoning show
|
||||
arg = "show"
|
||||
if arg in ("show", "on"):
|
||||
stub.show_reasoning = True
|
||||
stub.agent.reasoning_callback = lambda x: None
|
||||
self.assertTrue(stub.show_reasoning)
|
||||
|
||||
def test_hide_disables_display(self):
|
||||
stub = self._make_cli(show_reasoning=True)
|
||||
# Simulate /reasoning hide
|
||||
arg = "hide"
|
||||
if arg in ("hide", "off"):
|
||||
stub.show_reasoning = False
|
||||
stub.agent.reasoning_callback = None
|
||||
self.assertFalse(stub.show_reasoning)
|
||||
self.assertIsNone(stub.agent.reasoning_callback)
|
||||
|
||||
def test_on_enables_display(self):
|
||||
stub = self._make_cli(show_reasoning=False)
|
||||
arg = "on"
|
||||
if arg in ("show", "on"):
|
||||
stub.show_reasoning = True
|
||||
self.assertTrue(stub.show_reasoning)
|
||||
|
||||
def test_off_disables_display(self):
|
||||
stub = self._make_cli(show_reasoning=True)
|
||||
arg = "off"
|
||||
if arg in ("hide", "off"):
|
||||
stub.show_reasoning = False
|
||||
self.assertFalse(stub.show_reasoning)
|
||||
|
||||
def test_effort_level_sets_config(self):
|
||||
"""Setting an effort level should update reasoning_config."""
|
||||
from cli import _parse_reasoning_config
|
||||
stub = self._make_cli()
|
||||
arg = "high"
|
||||
parsed = _parse_reasoning_config(arg)
|
||||
stub.reasoning_config = parsed
|
||||
self.assertEqual(stub.reasoning_config, {"enabled": True, "effort": "high"})
|
||||
|
||||
def test_effort_none_disables_reasoning(self):
|
||||
from cli import _parse_reasoning_config
|
||||
stub = self._make_cli()
|
||||
parsed = _parse_reasoning_config("none")
|
||||
stub.reasoning_config = parsed
|
||||
self.assertEqual(stub.reasoning_config, {"enabled": False})
|
||||
|
||||
def test_invalid_argument_rejected(self):
|
||||
"""Invalid arguments should be rejected (parsed returns None)."""
|
||||
from cli import _parse_reasoning_config
|
||||
parsed = _parse_reasoning_config("turbo")
|
||||
self.assertIsNone(parsed)
|
||||
|
||||
def test_no_args_shows_status(self):
|
||||
"""With no args, should show current state (no crash)."""
|
||||
stub = self._make_cli(reasoning_config=None, show_reasoning=False)
|
||||
rc = stub.reasoning_config
|
||||
if rc is None:
|
||||
level = "medium (default)"
|
||||
elif rc.get("enabled") is False:
|
||||
level = "none (disabled)"
|
||||
else:
|
||||
level = rc.get("effort", "medium")
|
||||
display_state = "on" if stub.show_reasoning else "off"
|
||||
self.assertEqual(level, "medium (default)")
|
||||
self.assertEqual(display_state, "off")
|
||||
|
||||
def test_status_with_disabled_reasoning(self):
|
||||
stub = self._make_cli(reasoning_config={"enabled": False}, show_reasoning=True)
|
||||
rc = stub.reasoning_config
|
||||
if rc is None:
|
||||
level = "medium (default)"
|
||||
elif rc.get("enabled") is False:
|
||||
level = "none (disabled)"
|
||||
else:
|
||||
level = rc.get("effort", "medium")
|
||||
self.assertEqual(level, "none (disabled)")
|
||||
|
||||
def test_status_with_explicit_level(self):
|
||||
stub = self._make_cli(
|
||||
reasoning_config={"enabled": True, "effort": "xhigh"},
|
||||
show_reasoning=True,
|
||||
)
|
||||
rc = stub.reasoning_config
|
||||
level = rc.get("effort", "medium")
|
||||
self.assertEqual(level, "xhigh")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reasoning extraction and result dict
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLastReasoningInResult(unittest.TestCase):
|
||||
"""Verify reasoning extraction from the messages list."""
|
||||
|
||||
def _build_messages(self, reasoning=None):
|
||||
return [
|
||||
{"role": "user", "content": "hello"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hi there!",
|
||||
"reasoning": reasoning,
|
||||
"finish_reason": "stop",
|
||||
},
|
||||
]
|
||||
|
||||
def test_reasoning_present(self):
|
||||
messages = self._build_messages(reasoning="Let me think...")
|
||||
last_reasoning = None
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "assistant" and msg.get("reasoning"):
|
||||
last_reasoning = msg["reasoning"]
|
||||
break
|
||||
self.assertEqual(last_reasoning, "Let me think...")
|
||||
|
||||
def test_reasoning_none(self):
|
||||
messages = self._build_messages(reasoning=None)
|
||||
last_reasoning = None
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "assistant" and msg.get("reasoning"):
|
||||
last_reasoning = msg["reasoning"]
|
||||
break
|
||||
self.assertIsNone(last_reasoning)
|
||||
|
||||
def test_picks_last_assistant(self):
|
||||
messages = [
|
||||
{"role": "user", "content": "hello"},
|
||||
{"role": "assistant", "content": "...", "reasoning": "first thought"},
|
||||
{"role": "tool", "content": "result"},
|
||||
{"role": "assistant", "content": "done!", "reasoning": "final thought"},
|
||||
]
|
||||
last_reasoning = None
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "assistant" and msg.get("reasoning"):
|
||||
last_reasoning = msg["reasoning"]
|
||||
break
|
||||
self.assertEqual(last_reasoning, "final thought")
|
||||
|
||||
def test_empty_reasoning_treated_as_none(self):
|
||||
messages = self._build_messages(reasoning="")
|
||||
last_reasoning = None
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "assistant" and msg.get("reasoning"):
|
||||
last_reasoning = msg["reasoning"]
|
||||
break
|
||||
self.assertIsNone(last_reasoning)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reasoning display collapse
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestReasoningCollapse(unittest.TestCase):
|
||||
"""Verify long reasoning is collapsed to 10 lines in the box."""
|
||||
|
||||
def test_short_reasoning_not_collapsed(self):
|
||||
reasoning = "\n".join(f"Line {i}" for i in range(5))
|
||||
lines = reasoning.strip().splitlines()
|
||||
self.assertLessEqual(len(lines), 10)
|
||||
|
||||
def test_long_reasoning_collapsed(self):
|
||||
reasoning = "\n".join(f"Line {i}" for i in range(25))
|
||||
lines = reasoning.strip().splitlines()
|
||||
self.assertTrue(len(lines) > 10)
|
||||
if len(lines) > 10:
|
||||
display = "\n".join(lines[:10])
|
||||
display += f"\n ... ({len(lines) - 10} more lines)"
|
||||
display_lines = display.splitlines()
|
||||
self.assertEqual(len(display_lines), 11)
|
||||
self.assertIn("15 more lines", display_lines[-1])
|
||||
|
||||
def test_exactly_10_lines_not_collapsed(self):
|
||||
reasoning = "\n".join(f"Line {i}" for i in range(10))
|
||||
lines = reasoning.strip().splitlines()
|
||||
self.assertEqual(len(lines), 10)
|
||||
self.assertFalse(len(lines) > 10)
|
||||
|
||||
def test_intermediate_callback_collapses_to_5(self):
|
||||
"""_on_reasoning shows max 5 lines."""
|
||||
reasoning = "\n".join(f"Step {i}" for i in range(12))
|
||||
lines = reasoning.strip().splitlines()
|
||||
if len(lines) > 5:
|
||||
preview = "\n".join(lines[:5])
|
||||
preview += f"\n ... ({len(lines) - 5} more lines)"
|
||||
else:
|
||||
preview = reasoning.strip()
|
||||
preview_lines = preview.splitlines()
|
||||
self.assertEqual(len(preview_lines), 6)
|
||||
self.assertIn("7 more lines", preview_lines[-1])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reasoning callback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestReasoningCallback(unittest.TestCase):
|
||||
"""Verify reasoning_callback invocation."""
|
||||
|
||||
def test_callback_invoked_with_reasoning(self):
|
||||
captured = []
|
||||
agent = MagicMock()
|
||||
agent.reasoning_callback = lambda t: captured.append(t)
|
||||
agent._extract_reasoning = MagicMock(return_value="deep thought")
|
||||
|
||||
reasoning_text = agent._extract_reasoning(MagicMock())
|
||||
if reasoning_text and agent.reasoning_callback:
|
||||
agent.reasoning_callback(reasoning_text)
|
||||
self.assertEqual(captured, ["deep thought"])
|
||||
|
||||
def test_callback_not_invoked_without_reasoning(self):
|
||||
captured = []
|
||||
agent = MagicMock()
|
||||
agent.reasoning_callback = lambda t: captured.append(t)
|
||||
agent._extract_reasoning = MagicMock(return_value=None)
|
||||
|
||||
reasoning_text = agent._extract_reasoning(MagicMock())
|
||||
if reasoning_text and agent.reasoning_callback:
|
||||
agent.reasoning_callback(reasoning_text)
|
||||
self.assertEqual(captured, [])
|
||||
|
||||
def test_callback_none_does_not_crash(self):
|
||||
reasoning_text = "some thought"
|
||||
callback = None
|
||||
if reasoning_text and callback:
|
||||
callback(reasoning_text)
|
||||
# No exception = pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Real provider format extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestExtractReasoningFormats(unittest.TestCase):
|
||||
"""Test _extract_reasoning with real provider response formats."""
|
||||
|
||||
def _get_extractor(self):
|
||||
from run_agent import AIAgent
|
||||
return AIAgent._extract_reasoning
|
||||
|
||||
def test_openrouter_reasoning_details(self):
|
||||
extract = self._get_extractor()
|
||||
msg = SimpleNamespace(
|
||||
reasoning=None,
|
||||
reasoning_content=None,
|
||||
reasoning_details=[
|
||||
{"type": "reasoning.summary", "summary": "Analyzing Python lists."},
|
||||
],
|
||||
)
|
||||
result = extract(None, msg)
|
||||
self.assertIn("Python lists", result)
|
||||
|
||||
def test_deepseek_reasoning_field(self):
|
||||
extract = self._get_extractor()
|
||||
msg = SimpleNamespace(
|
||||
reasoning="Solving step by step.\nx + y = 8.",
|
||||
reasoning_content=None,
|
||||
)
|
||||
result = extract(None, msg)
|
||||
self.assertIn("x + y = 8", result)
|
||||
|
||||
def test_moonshot_reasoning_content(self):
|
||||
extract = self._get_extractor()
|
||||
msg = SimpleNamespace(
|
||||
reasoning_content="Explaining async/await.",
|
||||
)
|
||||
result = extract(None, msg)
|
||||
self.assertIn("async/await", result)
|
||||
|
||||
def test_no_reasoning_returns_none(self):
|
||||
extract = self._get_extractor()
|
||||
msg = SimpleNamespace(content="Hello!")
|
||||
result = extract(None, msg)
|
||||
self.assertIsNone(result)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config defaults
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestConfigDefault(unittest.TestCase):
|
||||
"""Verify config default for show_reasoning."""
|
||||
|
||||
def test_default_config_has_show_reasoning(self):
|
||||
from hermes_cli.config import DEFAULT_CONFIG
|
||||
display = DEFAULT_CONFIG.get("display", {})
|
||||
self.assertIn("show_reasoning", display)
|
||||
self.assertFalse(display["show_reasoning"])
|
||||
|
||||
|
||||
class TestCommandRegistered(unittest.TestCase):
|
||||
"""Verify /reasoning is in the COMMANDS dict."""
|
||||
|
||||
def test_reasoning_in_commands(self):
|
||||
from hermes_cli.commands import COMMANDS
|
||||
self.assertIn("/reasoning", COMMANDS)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEndToEndPipeline(unittest.TestCase):
|
||||
"""Simulate the full pipeline: extraction -> result dict -> display."""
|
||||
|
||||
def test_openrouter_claude_pipeline(self):
|
||||
from run_agent import AIAgent
|
||||
|
||||
api_message = SimpleNamespace(
|
||||
role="assistant",
|
||||
content="Lists support append().",
|
||||
tool_calls=None,
|
||||
reasoning=None,
|
||||
reasoning_content=None,
|
||||
reasoning_details=[
|
||||
{"type": "reasoning.summary", "summary": "Python list methods."},
|
||||
],
|
||||
)
|
||||
|
||||
reasoning = AIAgent._extract_reasoning(None, api_message)
|
||||
self.assertIsNotNone(reasoning)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "How do I add items?"},
|
||||
{"role": "assistant", "content": api_message.content, "reasoning": reasoning},
|
||||
]
|
||||
|
||||
last_reasoning = None
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "assistant" and msg.get("reasoning"):
|
||||
last_reasoning = msg["reasoning"]
|
||||
break
|
||||
|
||||
result = {
|
||||
"final_response": api_message.content,
|
||||
"last_reasoning": last_reasoning,
|
||||
}
|
||||
|
||||
self.assertIn("last_reasoning", result)
|
||||
self.assertIn("Python list methods", result["last_reasoning"])
|
||||
|
||||
def test_no_reasoning_model_pipeline(self):
|
||||
from run_agent import AIAgent
|
||||
|
||||
api_message = SimpleNamespace(content="Paris.", tool_calls=None)
|
||||
reasoning = AIAgent._extract_reasoning(None, api_message)
|
||||
self.assertIsNone(reasoning)
|
||||
|
||||
result = {"final_response": api_message.content, "last_reasoning": reasoning}
|
||||
self.assertIsNone(result["last_reasoning"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
159
tests/test_tool_call_parsers.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
Tests for environments/tool_call_parsers/ — client-side tool call parsers.
|
||||
|
||||
These parsers extract structured tool_calls from raw model output text.
|
||||
Used in Phase 2 (VLLM/generate) where the server returns raw tokens.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Ensure repo root is importable
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
try:
|
||||
from environments.tool_call_parsers import (
|
||||
ParseResult,
|
||||
ToolCallParser,
|
||||
get_parser,
|
||||
list_parsers,
|
||||
)
|
||||
except ImportError:
|
||||
pytest.skip("atroposlib not installed", allow_module_level=True)
|
||||
|
||||
|
||||
# ─── Registry tests ─────────────────────────────────────────────────────
|
||||
|
||||
class TestParserRegistry:
|
||||
def test_list_parsers_returns_nonempty(self):
|
||||
parsers = list_parsers()
|
||||
assert len(parsers) > 0
|
||||
|
||||
def test_hermes_parser_registered(self):
|
||||
parsers = list_parsers()
|
||||
assert "hermes" in parsers
|
||||
|
||||
def test_get_parser_returns_instance(self):
|
||||
parser = get_parser("hermes")
|
||||
assert isinstance(parser, ToolCallParser)
|
||||
|
||||
def test_get_parser_unknown_raises(self):
|
||||
with pytest.raises(KeyError):
|
||||
get_parser("nonexistent_parser_xyz")
|
||||
|
||||
def test_all_registered_parsers_instantiate(self):
|
||||
"""Every registered parser should be importable and instantiable."""
|
||||
for name in list_parsers():
|
||||
parser = get_parser(name)
|
||||
assert isinstance(parser, ToolCallParser)
|
||||
assert hasattr(parser, "parse")
|
||||
|
||||
|
||||
# ─── Hermes parser tests ────────────────────────────────────────────────
|
||||
|
||||
class TestHermesParser:
|
||||
@pytest.fixture
|
||||
def parser(self):
|
||||
return get_parser("hermes")
|
||||
|
||||
def test_no_tool_call(self, parser):
|
||||
text = "Hello, I can help you with that."
|
||||
content, tool_calls = parser.parse(text)
|
||||
assert content == text
|
||||
assert tool_calls is None
|
||||
|
||||
def test_single_tool_call(self, parser):
|
||||
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls -la"}}</tool_call>'
|
||||
content, tool_calls = parser.parse(text)
|
||||
assert tool_calls is not None
|
||||
assert len(tool_calls) == 1
|
||||
assert tool_calls[0].function.name == "terminal"
|
||||
args = json.loads(tool_calls[0].function.arguments)
|
||||
assert args["command"] == "ls -la"
|
||||
|
||||
def test_tool_call_with_surrounding_text(self, parser):
|
||||
text = 'Let me check that for you.\n<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>'
|
||||
content, tool_calls = parser.parse(text)
|
||||
assert tool_calls is not None
|
||||
assert len(tool_calls) == 1
|
||||
assert tool_calls[0].function.name == "terminal"
|
||||
# Content should have the surrounding text
|
||||
if content is not None:
|
||||
assert "check that" in content or content.strip() != ""
|
||||
|
||||
def test_multiple_tool_calls(self, parser):
|
||||
text = (
|
||||
'<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>\n'
|
||||
'<tool_call>{"name": "read_file", "arguments": {"path": "test.py"}}</tool_call>'
|
||||
)
|
||||
content, tool_calls = parser.parse(text)
|
||||
assert tool_calls is not None
|
||||
assert len(tool_calls) == 2
|
||||
names = {tc.function.name for tc in tool_calls}
|
||||
assert "terminal" in names
|
||||
assert "read_file" in names
|
||||
|
||||
def test_tool_call_ids_are_unique(self, parser):
|
||||
text = (
|
||||
'<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>\n'
|
||||
'<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>'
|
||||
)
|
||||
_, tool_calls = parser.parse(text)
|
||||
assert tool_calls is not None
|
||||
ids = [tc.id for tc in tool_calls]
|
||||
assert len(ids) == len(set(ids)), "Tool call IDs must be unique"
|
||||
|
||||
def test_empty_string(self, parser):
|
||||
content, tool_calls = parser.parse("")
|
||||
assert tool_calls is None
|
||||
|
||||
def test_malformed_json_in_tool_call(self, parser):
|
||||
text = '<tool_call>not valid json</tool_call>'
|
||||
content, tool_calls = parser.parse(text)
|
||||
# Should either return None tool_calls or handle gracefully
|
||||
# (implementation may vary — some parsers return error tool calls)
|
||||
|
||||
def test_truncated_tool_call(self, parser):
|
||||
"""Test handling of unclosed tool_call tag (model truncated mid-generation)."""
|
||||
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls -la"}'
|
||||
content, tool_calls = parser.parse(text)
|
||||
# Parser should handle truncated output gracefully
|
||||
# Either parse it successfully or return None
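For reference, the wire format these tests exercise is a JSON payload wrapped in `<tool_call>...</tool_call>` tags. A stripped-down extraction sketch of that format follows (names and regex are illustrative; the real parser in environments/tool_call_parsers additionally assigns unique ids, returns ChatCompletionMessageToolCall objects, and is more tolerant of truncated output):

```python
import json
import re

_TOOL_CALL_RE = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>", re.DOTALL)

def extract_tool_calls(text: str):
    """Return (content_without_tags, tool_calls_or_None) using plain dicts, a simplified view."""
    calls = []
    for raw in _TOOL_CALL_RE.findall(text):
        try:
            payload = json.loads(raw)
            calls.append({"name": payload["name"], "arguments": payload.get("arguments", {})})
        except (json.JSONDecodeError, KeyError):
            continue  # skip malformed JSON rather than raising
    content = _TOOL_CALL_RE.sub("", text).strip()
    return content, (calls or None)
```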
|
||||
|
||||
|
||||
# ─── Parse result contract tests (applies to ALL parsers) ───────────────
|
||||
|
||||
class TestParseResultContract:
|
||||
"""Ensure all parsers conform to the ParseResult contract."""
|
||||
|
||||
@pytest.fixture(params=["hermes"]) # Add more as needed
|
||||
def parser(self, request):
|
||||
return get_parser(request.param)
|
||||
|
||||
def test_returns_tuple_of_two(self, parser):
|
||||
result = parser.parse("hello world")
|
||||
assert isinstance(result, tuple)
|
||||
assert len(result) == 2
|
||||
|
||||
def test_no_tools_returns_none_tool_calls(self, parser):
|
||||
content, tool_calls = parser.parse("Just plain text, no tools.")
|
||||
assert tool_calls is None
|
||||
assert content is not None
|
||||
|
||||
def test_tool_calls_are_proper_objects(self, parser):
|
||||
"""When tool calls are found, they should be ChatCompletionMessageToolCall objects."""
|
||||
# Use hermes format since that's universal
|
||||
text = '<tool_call>{"name": "terminal", "arguments": {"command": "echo hi"}}</tool_call>'
|
||||
content, tool_calls = parser.parse(text)
|
||||
if tool_calls is not None:
|
||||
for tc in tool_calls:
|
||||
assert hasattr(tc, "id")
|
||||
assert hasattr(tc, "function")
|
||||
assert hasattr(tc.function, "name")
|
||||
assert hasattr(tc.function, "arguments")
|
||||
assert tc.id is not None
|
||||
assert isinstance(tc.function.name, str)
|
||||
assert isinstance(tc.function.arguments, str)
|
||||
@@ -23,6 +23,7 @@ from tools.delegate_tool import (
|
||||
delegate_task,
|
||||
_build_child_system_prompt,
|
||||
_strip_blocked_tools,
|
||||
_resolve_delegation_credentials,
|
||||
)
|
||||
|
||||
|
||||
@@ -255,5 +256,287 @@ class TestBlockedTools(unittest.TestCase):
|
||||
self.assertEqual(MAX_DEPTH, 2)
|
||||
|
||||
|
||||
class TestDelegationCredentialResolution(unittest.TestCase):
|
||||
"""Tests for provider:model credential resolution in delegation config."""
|
||||
|
||||
def test_no_provider_returns_none_credentials(self):
|
||||
"""When delegation.provider is empty, all credentials are None (inherit parent)."""
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"model": "", "provider": ""}
|
||||
creds = _resolve_delegation_credentials(cfg, parent)
|
||||
self.assertIsNone(creds["provider"])
|
||||
self.assertIsNone(creds["base_url"])
|
||||
self.assertIsNone(creds["api_key"])
|
||||
self.assertIsNone(creds["api_mode"])
|
||||
self.assertIsNone(creds["model"])
|
||||
|
||||
def test_model_only_no_provider(self):
|
||||
"""When only model is set (no provider), model is returned but credentials are None."""
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"model": "google/gemini-3-flash-preview", "provider": ""}
|
||||
creds = _resolve_delegation_credentials(cfg, parent)
|
||||
self.assertEqual(creds["model"], "google/gemini-3-flash-preview")
|
||||
self.assertIsNone(creds["provider"])
|
||||
self.assertIsNone(creds["base_url"])
|
||||
self.assertIsNone(creds["api_key"])
|
||||
|
||||
@patch("hermes_cli.runtime_provider.resolve_runtime_provider")
|
||||
def test_provider_resolves_full_credentials(self, mock_resolve):
|
||||
"""When delegation.provider is set, full credentials are resolved."""
|
||||
mock_resolve.return_value = {
|
||||
"provider": "openrouter",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"api_key": "sk-or-test-key",
|
||||
"api_mode": "chat_completions",
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"model": "google/gemini-3-flash-preview", "provider": "openrouter"}
|
||||
creds = _resolve_delegation_credentials(cfg, parent)
|
||||
self.assertEqual(creds["model"], "google/gemini-3-flash-preview")
|
||||
self.assertEqual(creds["provider"], "openrouter")
|
||||
self.assertEqual(creds["base_url"], "https://openrouter.ai/api/v1")
|
||||
self.assertEqual(creds["api_key"], "sk-or-test-key")
|
||||
self.assertEqual(creds["api_mode"], "chat_completions")
|
||||
mock_resolve.assert_called_once_with(requested="openrouter")
|
||||
|
||||
@patch("hermes_cli.runtime_provider.resolve_runtime_provider")
|
||||
def test_nous_provider_resolves_nous_credentials(self, mock_resolve):
|
||||
"""Nous provider resolves Nous Portal base_url and api_key."""
|
||||
mock_resolve.return_value = {
|
||||
"provider": "nous",
|
||||
"base_url": "https://inference-api.nousresearch.com/v1",
|
||||
"api_key": "nous-agent-key-xyz",
|
||||
"api_mode": "chat_completions",
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"model": "hermes-3-llama-3.1-8b", "provider": "nous"}
|
||||
creds = _resolve_delegation_credentials(cfg, parent)
|
||||
self.assertEqual(creds["provider"], "nous")
|
||||
self.assertEqual(creds["base_url"], "https://inference-api.nousresearch.com/v1")
|
||||
self.assertEqual(creds["api_key"], "nous-agent-key-xyz")
|
||||
mock_resolve.assert_called_once_with(requested="nous")
|
||||
|
||||
@patch("hermes_cli.runtime_provider.resolve_runtime_provider")
|
||||
def test_provider_resolution_failure_raises_valueerror(self, mock_resolve):
|
||||
"""When provider resolution fails, ValueError is raised with helpful message."""
|
||||
mock_resolve.side_effect = RuntimeError("OPENROUTER_API_KEY not set")
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"model": "some-model", "provider": "openrouter"}
|
||||
with self.assertRaises(ValueError) as ctx:
|
||||
_resolve_delegation_credentials(cfg, parent)
|
||||
self.assertIn("openrouter", str(ctx.exception).lower())
|
||||
self.assertIn("Cannot resolve", str(ctx.exception))
|
||||
|
||||
@patch("hermes_cli.runtime_provider.resolve_runtime_provider")
|
||||
def test_provider_resolves_but_no_api_key_raises(self, mock_resolve):
|
||||
"""When provider resolves but has no API key, ValueError is raised."""
|
||||
mock_resolve.return_value = {
|
||||
"provider": "openrouter",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"api_key": "",
|
||||
"api_mode": "chat_completions",
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"model": "some-model", "provider": "openrouter"}
|
||||
with self.assertRaises(ValueError) as ctx:
|
||||
_resolve_delegation_credentials(cfg, parent)
|
||||
self.assertIn("no API key", str(ctx.exception))
|
||||
|
||||
def test_missing_config_keys_inherit_parent(self):
|
||||
"""When config dict has no model/provider keys at all, inherits parent."""
|
||||
parent = _make_mock_parent(depth=0)
|
||||
cfg = {"max_iterations": 45}
|
||||
creds = _resolve_delegation_credentials(cfg, parent)
|
||||
self.assertIsNone(creds["model"])
|
||||
self.assertIsNone(creds["provider"])
|
||||
|
||||
|
||||
class TestDelegationProviderIntegration(unittest.TestCase):
|
||||
"""Integration tests: delegation config → _run_single_child → AIAgent construction."""
|
||||
|
||||
@patch("tools.delegate_tool._load_config")
|
||||
@patch("tools.delegate_tool._resolve_delegation_credentials")
|
||||
def test_config_provider_credentials_reach_child_agent(self, mock_creds, mock_cfg):
|
||||
"""When delegation.provider is configured, child agent gets resolved credentials."""
|
||||
mock_cfg.return_value = {
|
||||
"max_iterations": 45,
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"provider": "openrouter",
|
||||
}
|
||||
mock_creds.return_value = {
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"provider": "openrouter",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"api_key": "sk-or-delegation-key",
|
||||
"api_mode": "chat_completions",
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
|
||||
with patch("run_agent.AIAgent") as MockAgent:
|
||||
mock_child = MagicMock()
|
||||
mock_child.run_conversation.return_value = {
|
||||
"final_response": "done", "completed": True, "api_calls": 1
|
||||
}
|
||||
MockAgent.return_value = mock_child
|
||||
|
||||
delegate_task(goal="Test provider routing", parent_agent=parent)
|
||||
|
||||
_, kwargs = MockAgent.call_args
|
||||
self.assertEqual(kwargs["model"], "google/gemini-3-flash-preview")
|
||||
self.assertEqual(kwargs["provider"], "openrouter")
|
||||
self.assertEqual(kwargs["base_url"], "https://openrouter.ai/api/v1")
|
||||
self.assertEqual(kwargs["api_key"], "sk-or-delegation-key")
|
||||
self.assertEqual(kwargs["api_mode"], "chat_completions")
|
||||
|
||||
@patch("tools.delegate_tool._load_config")
|
||||
@patch("tools.delegate_tool._resolve_delegation_credentials")
|
||||
def test_cross_provider_delegation(self, mock_creds, mock_cfg):
|
||||
"""Parent on Nous, subagent on OpenRouter — full credential switch."""
|
||||
mock_cfg.return_value = {
|
||||
"max_iterations": 45,
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"provider": "openrouter",
|
||||
}
|
||||
mock_creds.return_value = {
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"provider": "openrouter",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"api_key": "sk-or-key",
|
||||
"api_mode": "chat_completions",
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
parent.provider = "nous"
|
||||
parent.base_url = "https://inference-api.nousresearch.com/v1"
|
||||
parent.api_key = "nous-key-abc"
|
||||
|
||||
with patch("run_agent.AIAgent") as MockAgent:
|
||||
mock_child = MagicMock()
|
||||
mock_child.run_conversation.return_value = {
|
||||
"final_response": "done", "completed": True, "api_calls": 1
|
||||
}
|
||||
MockAgent.return_value = mock_child
|
||||
|
||||
delegate_task(goal="Cross-provider test", parent_agent=parent)
|
||||
|
||||
_, kwargs = MockAgent.call_args
|
||||
# Child should use OpenRouter, NOT Nous
|
||||
self.assertEqual(kwargs["provider"], "openrouter")
|
||||
self.assertEqual(kwargs["base_url"], "https://openrouter.ai/api/v1")
|
||||
self.assertEqual(kwargs["api_key"], "sk-or-key")
|
||||
self.assertNotEqual(kwargs["base_url"], parent.base_url)
|
||||
self.assertNotEqual(kwargs["api_key"], parent.api_key)
|
||||
|
||||
@patch("tools.delegate_tool._load_config")
|
||||
@patch("tools.delegate_tool._resolve_delegation_credentials")
|
||||
def test_empty_config_inherits_parent(self, mock_creds, mock_cfg):
|
||||
"""When delegation config is empty, child inherits parent credentials."""
|
||||
mock_cfg.return_value = {"max_iterations": 45, "model": "", "provider": ""}
|
||||
mock_creds.return_value = {
|
||||
"model": None,
|
||||
"provider": None,
|
||||
"base_url": None,
|
||||
"api_key": None,
|
||||
"api_mode": None,
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
|
||||
with patch("run_agent.AIAgent") as MockAgent:
|
||||
mock_child = MagicMock()
|
||||
mock_child.run_conversation.return_value = {
|
||||
"final_response": "done", "completed": True, "api_calls": 1
|
||||
}
|
||||
MockAgent.return_value = mock_child
|
||||
|
||||
delegate_task(goal="Test inherit", parent_agent=parent)
|
||||
|
||||
_, kwargs = MockAgent.call_args
|
||||
self.assertEqual(kwargs["model"], parent.model)
|
||||
self.assertEqual(kwargs["provider"], parent.provider)
|
||||
self.assertEqual(kwargs["base_url"], parent.base_url)
|
||||
|
||||
@patch("tools.delegate_tool._load_config")
|
||||
@patch("tools.delegate_tool._resolve_delegation_credentials")
|
||||
def test_credential_error_returns_json_error(self, mock_creds, mock_cfg):
|
||||
"""When credential resolution fails, delegate_task returns a JSON error."""
|
||||
mock_cfg.return_value = {"model": "bad-model", "provider": "nonexistent"}
|
||||
mock_creds.side_effect = ValueError(
|
||||
"Cannot resolve delegation provider 'nonexistent': Unknown provider"
|
||||
)
|
||||
parent = _make_mock_parent(depth=0)
|
||||
|
||||
result = json.loads(delegate_task(goal="Should fail", parent_agent=parent))
|
||||
self.assertIn("error", result)
|
||||
self.assertIn("Cannot resolve", result["error"])
|
||||
self.assertIn("nonexistent", result["error"])
|
||||
|
||||
@patch("tools.delegate_tool._load_config")
|
||||
@patch("tools.delegate_tool._resolve_delegation_credentials")
|
||||
def test_batch_mode_all_children_get_credentials(self, mock_creds, mock_cfg):
|
||||
"""In batch mode, all children receive the resolved credentials."""
|
||||
mock_cfg.return_value = {
|
||||
"max_iterations": 45,
|
||||
"model": "meta-llama/llama-4-scout",
|
||||
"provider": "openrouter",
|
||||
}
|
||||
mock_creds.return_value = {
|
||||
"model": "meta-llama/llama-4-scout",
|
||||
"provider": "openrouter",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"api_key": "sk-or-batch",
|
||||
"api_mode": "chat_completions",
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
|
||||
with patch("tools.delegate_tool._run_single_child") as mock_run:
|
||||
mock_run.return_value = {
|
||||
"task_index": 0, "status": "completed",
|
||||
"summary": "Done", "api_calls": 1, "duration_seconds": 1.0
|
||||
}
|
||||
|
||||
tasks = [{"goal": "Task A"}, {"goal": "Task B"}]
|
||||
delegate_task(tasks=tasks, parent_agent=parent)
|
||||
|
||||
for call in mock_run.call_args_list:
|
||||
self.assertEqual(call.kwargs.get("model"), "meta-llama/llama-4-scout")
|
||||
self.assertEqual(call.kwargs.get("override_provider"), "openrouter")
|
||||
self.assertEqual(call.kwargs.get("override_base_url"), "https://openrouter.ai/api/v1")
|
||||
self.assertEqual(call.kwargs.get("override_api_key"), "sk-or-batch")
|
||||
self.assertEqual(call.kwargs.get("override_api_mode"), "chat_completions")
|
||||
|
||||
@patch("tools.delegate_tool._load_config")
|
||||
@patch("tools.delegate_tool._resolve_delegation_credentials")
|
||||
def test_model_only_no_provider_inherits_parent_credentials(self, mock_creds, mock_cfg):
|
||||
"""Setting only model (no provider) changes model but keeps parent credentials."""
|
||||
mock_cfg.return_value = {
|
||||
"max_iterations": 45,
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"provider": "",
|
||||
}
|
||||
mock_creds.return_value = {
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"provider": None,
|
||||
"base_url": None,
|
||||
"api_key": None,
|
||||
"api_mode": None,
|
||||
}
|
||||
parent = _make_mock_parent(depth=0)
|
||||
|
||||
with patch("run_agent.AIAgent") as MockAgent:
|
||||
mock_child = MagicMock()
|
||||
mock_child.run_conversation.return_value = {
|
||||
"final_response": "done", "completed": True, "api_calls": 1
|
||||
}
|
||||
MockAgent.return_value = mock_child
|
||||
|
||||
delegate_task(goal="Model only test", parent_agent=parent)
|
||||
|
||||
_, kwargs = MockAgent.call_args
|
||||
# Model should be overridden
|
||||
self.assertEqual(kwargs["model"], "google/gemini-3-flash-preview")
|
||||
# But provider/base_url/api_key should inherit from parent
|
||||
self.assertEqual(kwargs["provider"], parent.provider)
|
||||
self.assertEqual(kwargs["base_url"], parent.base_url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -2049,6 +2049,65 @@ class TestSamplingErrors:
|
||||
assert "No LLM provider" in result.message
|
||||
assert handler.metrics["errors"] == 1
|
||||
|
||||
def test_empty_choices_returns_error(self):
|
||||
"""LLM returning choices=[] is handled gracefully, not IndexError."""
|
||||
handler = SamplingHandler("ec", {})
|
||||
fake_client = MagicMock()
|
||||
fake_client.chat.completions.create.return_value = SimpleNamespace(
|
||||
choices=[],
|
||||
model="test-model",
|
||||
usage=SimpleNamespace(total_tokens=0),
|
||||
)
|
||||
|
||||
with patch(
|
||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||
return_value=(fake_client, "default-model"),
|
||||
):
|
||||
result = asyncio.run(handler(None, _make_sampling_params()))
|
||||
|
||||
assert isinstance(result, ErrorData)
|
||||
assert "empty response" in result.message.lower()
|
||||
assert handler.metrics["errors"] == 1
|
||||
|
||||
def test_none_choices_returns_error(self):
|
||||
"""LLM returning choices=None is handled gracefully, not TypeError."""
|
||||
handler = SamplingHandler("nc", {})
|
||||
fake_client = MagicMock()
|
||||
fake_client.chat.completions.create.return_value = SimpleNamespace(
|
||||
choices=None,
|
||||
model="test-model",
|
||||
usage=SimpleNamespace(total_tokens=0),
|
||||
)
|
||||
|
||||
with patch(
|
||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||
return_value=(fake_client, "default-model"),
|
||||
):
|
||||
result = asyncio.run(handler(None, _make_sampling_params()))
|
||||
|
||||
assert isinstance(result, ErrorData)
|
||||
assert "empty response" in result.message.lower()
|
||||
assert handler.metrics["errors"] == 1
|
||||
|
||||
def test_missing_choices_attr_returns_error(self):
|
||||
"""LLM response without choices attribute is handled gracefully."""
|
||||
handler = SamplingHandler("mc", {})
|
||||
fake_client = MagicMock()
|
||||
fake_client.chat.completions.create.return_value = SimpleNamespace(
|
||||
model="test-model",
|
||||
usage=SimpleNamespace(total_tokens=0),
|
||||
)
|
||||
|
||||
with patch(
|
||||
"agent.auxiliary_client.get_text_auxiliary_client",
|
||||
return_value=(fake_client, "default-model"),
|
||||
):
|
||||
result = asyncio.run(handler(None, _make_sampling_params()))
|
||||
|
||||
assert isinstance(result, ErrorData)
|
||||
assert "empty response" in result.message.lower()
|
||||
assert handler.metrics["errors"] == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 10. Model whitelist
|
||||
|
||||
271
tests/tools/test_modal_sandbox_fixes.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""Tests for Modal sandbox infrastructure fixes (TBLite baseline).
|
||||
|
||||
Covers the 9 bugs discovered while setting up TBLite evaluation:
|
||||
1. Tool resolution — terminal + file tools load with minisweagent
|
||||
2. CWD fix — host paths get replaced with /root for container backends
|
||||
3. ephemeral_disk version check
|
||||
4. Tilde ~ replaced with /root for container backends
|
||||
5. ensurepip fix in patches.py for Modal image builder
|
||||
6. install_pipx stays True for swerex-remote
|
||||
7. /home/ added to host prefix check
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
# Ensure repo root is importable
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
try:
|
||||
import tools.terminal_tool # noqa: F401
|
||||
_tt_mod = sys.modules["tools.terminal_tool"]
|
||||
except ImportError:
|
||||
pytest.skip("hermes-agent tools not importable (missing deps)", allow_module_level=True)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Test 1: Tool resolution includes terminal + file tools
|
||||
# =========================================================================
|
||||
|
||||
class TestToolResolution:
|
||||
"""Verify get_tool_definitions returns all expected tools for eval."""
|
||||
|
||||
def _has_minisweagent(self):
|
||||
try:
|
||||
import minisweagent # noqa: F401
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
def test_terminal_and_file_toolsets_resolve_all_tools(self):
|
||||
"""enabled_toolsets=['terminal', 'file'] should produce 6 tools."""
|
||||
if not self._has_minisweagent():
|
||||
pytest.skip("minisweagent not installed (git submodule update --init)")
|
||||
from model_tools import get_tool_definitions
|
||||
tools = get_tool_definitions(
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
quiet_mode=True,
|
||||
)
|
||||
names = {t["function"]["name"] for t in tools}
|
||||
expected = {"terminal", "process", "read_file", "write_file", "search_files", "patch"}
|
||||
assert expected == names, f"Expected {expected}, got {names}"
|
||||
|
||||
def test_terminal_tool_present(self):
|
||||
"""The terminal tool must be present (not silently dropped)."""
|
||||
if not self._has_minisweagent():
|
||||
pytest.skip("minisweagent not installed (git submodule update --init)")
|
||||
from model_tools import get_tool_definitions
|
||||
tools = get_tool_definitions(
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
quiet_mode=True,
|
||||
)
|
||||
names = [t["function"]["name"] for t in tools]
|
||||
assert "terminal" in names, (
|
||||
f"terminal tool missing! Only got: {names}. "
|
||||
"Check that minisweagent is installed (git submodule update --init)."
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Test 2-4: CWD handling for container backends
|
||||
# =========================================================================
|
||||
|
||||
class TestCwdHandling:
|
||||
"""Verify host paths are sanitized for container backends."""
|
||||
|
||||
def test_home_path_replaced_for_modal(self):
|
||||
"""TERMINAL_CWD=/home/user/... should be replaced with /root for modal."""
|
||||
with patch.dict(os.environ, {
|
||||
"TERMINAL_ENV": "modal",
|
||||
"TERMINAL_CWD": "/home/dakota/github/hermes-agent",
|
||||
}):
|
||||
config = _tt_mod._get_env_config()
|
||||
assert config["cwd"] == "/root", (
|
||||
f"Expected /root, got {config['cwd']}. "
|
||||
"/home/ paths should be replaced for modal backend."
|
||||
)
|
||||
|
||||
def test_users_path_replaced_for_docker(self):
|
||||
"""TERMINAL_CWD=/Users/... should be replaced with /root for docker."""
|
||||
with patch.dict(os.environ, {
|
||||
"TERMINAL_ENV": "docker",
|
||||
"TERMINAL_CWD": "/Users/someone/projects",
|
||||
}):
|
||||
config = _tt_mod._get_env_config()
|
||||
assert config["cwd"] == "/root", (
|
||||
f"Expected /root, got {config['cwd']}. "
|
||||
"/Users/ paths should be replaced for docker backend."
|
||||
)
|
||||
|
||||
def test_windows_path_replaced_for_modal(self):
|
||||
"""TERMINAL_CWD=C:\\Users\\... should be replaced for modal."""
|
||||
with patch.dict(os.environ, {
|
||||
"TERMINAL_ENV": "modal",
|
||||
"TERMINAL_CWD": "C:\\Users\\someone\\projects",
|
||||
}):
|
||||
config = _tt_mod._get_env_config()
|
||||
assert config["cwd"] == "/root"
|
||||
|
||||
def test_default_cwd_is_root_for_container_backends(self):
|
||||
"""Container backends should default to /root, not ~."""
|
||||
for backend in ("modal", "docker", "singularity", "daytona"):
|
||||
with patch.dict(os.environ, {"TERMINAL_ENV": backend}, clear=False):
|
||||
# Remove TERMINAL_CWD so it uses default
|
||||
env = os.environ.copy()
|
||||
env.pop("TERMINAL_CWD", None)
|
||||
with patch.dict(os.environ, env, clear=True):
|
||||
config = _tt_mod._get_env_config()
|
||||
assert config["cwd"] == "/root", (
|
||||
f"Backend {backend}: expected /root default, got {config['cwd']}"
|
||||
)
|
||||
|
||||
def test_local_backend_uses_getcwd(self):
|
||||
"""Local backend should use os.getcwd(), not /root."""
|
||||
with patch.dict(os.environ, {"TERMINAL_ENV": "local"}, clear=False):
|
||||
env = os.environ.copy()
|
||||
env.pop("TERMINAL_CWD", None)
|
||||
with patch.dict(os.environ, env, clear=True):
|
||||
config = _tt_mod._get_env_config()
|
||||
assert config["cwd"] == os.getcwd()
|
||||
|
||||
def test_ssh_preserves_home_paths(self):
|
||||
"""SSH backend should NOT replace /home/ paths (they're valid remotely)."""
|
||||
with patch.dict(os.environ, {
|
||||
"TERMINAL_ENV": "ssh",
|
||||
"TERMINAL_CWD": "/home/remote-user/work",
|
||||
"TERMINAL_SSH_HOST": "example.com",
|
||||
"TERMINAL_SSH_USER": "user",
|
||||
}):
|
||||
config = _tt_mod._get_env_config()
|
||||
assert config["cwd"] == "/home/remote-user/work", (
|
||||
"SSH backend should preserve /home/ paths"
|
||||
)
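Taken together, the behavior this class pins down reduces to a small sanitization step; a sketch of the equivalent logic (the actual `_get_env_config` has more configuration handling around it, and the helper name here is hypothetical):

```python
import os

_HOST_ONLY_PREFIXES = ("/Users/", "/home/", "C:\\", "C:/")

def sanitize_cwd(cwd: str, backend: str) -> str:
    """Map host-only working directories to /root for container backends."""
    if backend in ("modal", "docker", "singularity", "daytona"):
        if not cwd or cwd == "~" or cwd.startswith(_HOST_ONLY_PREFIXES):
            return "/root"
        return cwd
    if backend == "local":
        return cwd or os.getcwd()
    return cwd  # ssh keeps remote paths such as /home/<user>/... as-is
```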
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Test 5: ephemeral_disk version check
|
||||
# =========================================================================
|
||||
|
||||
class TestEphemeralDiskCheck:
|
||||
"""Verify ephemeral_disk is only passed when modal supports it."""
|
||||
|
||||
def test_ephemeral_disk_skipped_when_unsupported(self):
|
||||
"""If modal.Sandbox.create doesn't have ephemeral_disk param, skip it."""
|
||||
# Mock the modal import and Sandbox.create signature
|
||||
mock_modal = MagicMock()
|
||||
mock_sandbox_create = MagicMock()
|
||||
# Simulate a signature WITHOUT ephemeral_disk
|
||||
import inspect
|
||||
mock_params = {
|
||||
"args": inspect.Parameter("args", inspect.Parameter.VAR_POSITIONAL),
|
||||
"image": inspect.Parameter("image", inspect.Parameter.KEYWORD_ONLY),
|
||||
"timeout": inspect.Parameter("timeout", inspect.Parameter.KEYWORD_ONLY),
|
||||
"cpu": inspect.Parameter("cpu", inspect.Parameter.KEYWORD_ONLY),
|
||||
"memory": inspect.Parameter("memory", inspect.Parameter.KEYWORD_ONLY),
|
||||
}
|
||||
mock_sig = inspect.Signature(parameters=list(mock_params.values()))
|
||||
|
||||
with patch.dict(os.environ, {"TERMINAL_ENV": "modal"}):
|
||||
config = _tt_mod._get_env_config()
|
||||
# The config has container_disk default of 51200
|
||||
disk = config.get("container_disk", 51200)
|
||||
assert disk > 0, "disk should default to > 0"
|
||||
|
||||
# Simulate the version check logic from terminal_tool.py
|
||||
sandbox_kwargs = {}
|
||||
if disk > 0:
|
||||
try:
|
||||
if "ephemeral_disk" in mock_params:
|
||||
sandbox_kwargs["ephemeral_disk"] = disk
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
assert "ephemeral_disk" not in sandbox_kwargs, (
|
||||
"ephemeral_disk should not be set when Sandbox.create doesn't support it"
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Test 6: ModalEnvironment defaults
|
||||
# =========================================================================
|
||||
|
||||
class TestModalEnvironmentDefaults:
|
||||
"""Verify ModalEnvironment has correct defaults."""
|
||||
|
||||
def test_default_cwd_is_root(self):
|
||||
"""ModalEnvironment default cwd should be /root, not ~."""
|
||||
from tools.environments.modal import ModalEnvironment
|
||||
import inspect
|
||||
sig = inspect.signature(ModalEnvironment.__init__)
|
||||
cwd_default = sig.parameters["cwd"].default
|
||||
assert cwd_default == "/root", (
|
||||
f"ModalEnvironment cwd default should be /root, got {cwd_default!r}. "
|
||||
"Tilde ~ is not expanded by subprocess.run(cwd=...)."
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Test 7: ensurepip fix in patches.py
|
||||
# =========================================================================
|
||||
|
||||
class TestEnsurepipFix:
|
||||
"""Verify the pip fix is applied in the patched Modal init."""
|
||||
|
||||
def test_patched_init_creates_image_with_setup_commands(self):
|
||||
"""The patched __init__ should create a modal.Image with pip fix."""
|
||||
try:
|
||||
from environments.patches import _patch_swerex_modal
|
||||
except ImportError:
|
||||
pytest.skip("environments.patches not importable")
|
||||
|
||||
# Check that the patch code references ensurepip
|
||||
import inspect
|
||||
source = inspect.getsource(_patch_swerex_modal)
|
||||
assert "ensurepip" in source, (
|
||||
"patches._patch_swerex_modal should include ensurepip fix "
|
||||
"for Modal's legacy image builder"
|
||||
)
|
||||
assert "setup_dockerfile_commands" in source, (
|
||||
"patches._patch_swerex_modal should use setup_dockerfile_commands "
|
||||
"to fix pip before Modal's bootstrap"
|
||||
)
|
||||
|
||||
def test_patched_init_uses_install_pipx_from_config(self):
|
||||
"""The patched init should respect install_pipx from config."""
|
||||
try:
|
||||
from environments.patches import _patch_swerex_modal
|
||||
except ImportError:
|
||||
pytest.skip("environments.patches not importable")
|
||||
|
||||
import inspect
|
||||
source = inspect.getsource(_patch_swerex_modal)
|
||||
assert "install_pipx" in source, (
|
||||
"patches._patch_swerex_modal should pass install_pipx to ModalDeployment"
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Test 8: Host prefix list completeness
|
||||
# =========================================================================
|
||||
|
||||
class TestHostPrefixList:
|
||||
"""Verify the host prefix list catches common host-only paths."""
|
||||
|
||||
def test_all_common_host_prefixes_caught(self):
|
||||
"""The host prefix check should catch /Users/, /home/, C:\\, C:/."""
|
||||
# Read the actual source to verify the prefixes
|
||||
import inspect
|
||||
source = inspect.getsource(_tt_mod._get_env_config)
|
||||
for prefix in ["/Users/", "/home/", 'C:\\\\"', "C:/"]:
|
||||
# Normalize for source comparison
|
||||
check = prefix.rstrip('"')
|
||||
assert check in source or prefix in source, (
|
||||
f"Host prefix {prefix!r} not found in _get_env_config. "
|
||||
"Container backends need this to avoid using host paths."
|
||||
)
|
||||
@@ -289,7 +289,7 @@ class TestErrorLoggingExcInfo:
|
||||
assert result_data["success"] is False
|
||||
|
||||
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
|
||||
assert any(r.exc_info is not None for r in error_records)
|
||||
assert any(r.exc_info and r.exc_info[0] is not None for r in error_records)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cleanup_error_logs_exc_info(self, tmp_path, caplog):
|
||||
|
||||
@@ -166,10 +166,20 @@ def _run_single_child(
|
||||
max_iterations: int,
|
||||
parent_agent,
|
||||
task_count: int = 1,
|
||||
# Credential overrides from delegation config (provider:model resolution)
|
||||
override_provider: Optional[str] = None,
|
||||
override_base_url: Optional[str] = None,
|
||||
override_api_key: Optional[str] = None,
|
||||
override_api_mode: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Spawn and run a single child agent. Called from within a thread.
|
||||
Returns a structured result dict.
|
||||
|
||||
When override_* params are set (from delegation config), the child uses
|
||||
those credentials instead of inheriting from the parent. This enables
|
||||
routing subagents to a different provider:model pair (e.g. cheap/fast
|
||||
model on OpenRouter while the parent runs on Nous Portal).
|
||||
"""
|
||||
from run_agent import AIAgent
|
||||
|
||||
@@ -199,12 +209,19 @@ def _run_single_child(
|
||||
# count toward the session-wide limit.
|
||||
shared_budget = getattr(parent_agent, "iteration_budget", None)
|
||||
|
||||
# Resolve effective credentials: config override > parent inherit
|
||||
effective_model = model or parent_agent.model
|
||||
effective_provider = override_provider or getattr(parent_agent, "provider", None)
|
||||
effective_base_url = override_base_url or parent_agent.base_url
|
||||
effective_api_key = override_api_key or parent_api_key
|
||||
effective_api_mode = override_api_mode or getattr(parent_agent, "api_mode", None)
|
||||
|
||||
child = AIAgent(
|
||||
base_url=parent_agent.base_url,
|
||||
api_key=parent_api_key,
|
||||
model=model or parent_agent.model,
|
||||
provider=getattr(parent_agent, "provider", None),
|
||||
api_mode=getattr(parent_agent, "api_mode", None),
|
||||
base_url=effective_base_url,
|
||||
api_key=effective_api_key,
|
||||
model=effective_model,
|
||||
provider=effective_provider,
|
||||
api_mode=effective_api_mode,
|
||||
max_iterations=max_iterations,
|
||||
max_tokens=getattr(parent_agent, "max_tokens", None),
|
||||
reasoning_config=getattr(parent_agent, "reasoning_config", None),
|
||||
@@ -327,6 +344,16 @@ def delegate_task(
|
||||
default_max_iter = cfg.get("max_iterations", DEFAULT_MAX_ITERATIONS)
|
||||
effective_max_iter = max_iterations or default_max_iter
|
||||
|
||||
# Resolve delegation credentials (provider:model pair).
|
||||
# When delegation.provider is configured, this resolves the full credential
|
||||
# bundle (base_url, api_key, api_mode) via the same runtime provider system
|
||||
# used by CLI/gateway startup. When unconfigured, returns None values so
|
||||
# children inherit from the parent.
|
||||
try:
|
||||
creds = _resolve_delegation_credentials(cfg, parent_agent)
|
||||
except ValueError as exc:
|
||||
return json.dumps({"error": str(exc)})
|
||||
|
||||
# Normalize to task list
|
||||
if tasks and isinstance(tasks, list):
|
||||
task_list = tasks[:MAX_CONCURRENT_CHILDREN]
|
||||
@@ -358,10 +385,14 @@ def delegate_task(
|
||||
goal=t["goal"],
|
||||
context=t.get("context"),
|
||||
toolsets=t.get("toolsets") or toolsets,
|
||||
model=None,
|
||||
model=creds["model"],
|
||||
max_iterations=effective_max_iter,
|
||||
parent_agent=parent_agent,
|
||||
task_count=1,
|
||||
override_provider=creds["provider"],
|
||||
override_base_url=creds["base_url"],
|
||||
override_api_key=creds["api_key"],
|
||||
override_api_mode=creds["api_mode"],
|
||||
)
|
||||
results.append(result)
|
||||
else:
|
||||
@@ -383,10 +414,14 @@ def delegate_task(
|
||||
goal=t["goal"],
|
||||
context=t.get("context"),
|
||||
toolsets=t.get("toolsets") or toolsets,
|
||||
model=None,
|
||||
model=creds["model"],
|
||||
max_iterations=effective_max_iter,
|
||||
parent_agent=parent_agent,
|
||||
task_count=n_tasks,
|
||||
override_provider=creds["provider"],
|
||||
override_base_url=creds["base_url"],
|
||||
override_api_key=creds["api_key"],
|
||||
override_api_mode=creds["api_mode"],
|
||||
)
|
||||
futures[future] = i
|
||||
|
||||
@@ -444,11 +479,78 @@ def delegate_task(
|
||||
}, ensure_ascii=False)
|
||||
|
||||
|
||||
def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict:
|
||||
"""Resolve credentials for subagent delegation.
|
||||
|
||||
If ``delegation.provider`` is configured, resolves the full credential
|
||||
bundle (base_url, api_key, api_mode, provider) via the runtime provider
|
||||
system — the same path used by CLI/gateway startup. This lets subagents
|
||||
run on a completely different provider:model pair.
|
||||
|
||||
If no provider is configured, returns None values so the child inherits
|
||||
everything from the parent agent.
|
||||
|
||||
Raises ValueError with a user-friendly message on credential failure.
|
||||
"""
|
||||
configured_model = cfg.get("model") or None
|
||||
configured_provider = cfg.get("provider") or None
|
||||
|
||||
if not configured_provider:
|
||||
# No provider override — child inherits everything from parent
|
||||
return {
|
||||
"model": configured_model,
|
||||
"provider": None,
|
||||
"base_url": None,
|
||||
"api_key": None,
|
||||
"api_mode": None,
|
||||
}
|
||||
|
||||
# Provider is configured — resolve full credentials
|
||||
try:
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider
|
||||
runtime = resolve_runtime_provider(requested=configured_provider)
|
||||
except Exception as exc:
|
||||
raise ValueError(
|
||||
f"Cannot resolve delegation provider '{configured_provider}': {exc}. "
|
||||
f"Check that the provider is configured (API key set, valid provider name). "
|
||||
f"Available providers: openrouter, nous, zai, kimi-coding, minimax."
|
||||
) from exc
|
||||
|
||||
api_key = runtime.get("api_key", "")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
f"Delegation provider '{configured_provider}' resolved but has no API key. "
|
||||
f"Set the appropriate environment variable or run 'hermes login'."
|
||||
)
|
||||
|
||||
return {
|
||||
"model": configured_model,
|
||||
"provider": runtime.get("provider"),
|
||||
"base_url": runtime.get("base_url"),
|
||||
"api_key": api_key,
|
||||
"api_mode": runtime.get("api_mode"),
|
||||
}
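A short usage sketch of the two paths (values are illustrative; as written above, the function only reads the cfg dict, so parent_agent is not consulted here):

```python
from tools.delegate_tool import _resolve_delegation_credentials

# Inherit path: no delegation.provider configured, every credential is None,
# so the child agent falls back to the parent's settings.
creds = _resolve_delegation_credentials({"model": "", "provider": ""}, parent_agent=None)
assert creds["provider"] is None and creds["api_key"] is None

# Override path: delegation.provider set, so base_url / api_key / api_mode are
# resolved via hermes_cli.runtime_provider.resolve_runtime_provider and the
# named provider's API key must be configured.
# creds = _resolve_delegation_credentials(
#     {"model": "google/gemini-3-flash-preview", "provider": "openrouter"},  # example values
#     parent_agent=None,
# )
```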
|
||||
|
||||
|
||||
def _load_config() -> dict:
|
||||
"""Load delegation config from CLI_CONFIG if available."""
|
||||
"""Load delegation config from CLI_CONFIG or persistent config.
|
||||
|
||||
Checks the runtime config (cli.py CLI_CONFIG) first, then falls back
|
||||
to the persistent config (hermes_cli/config.py load_config()) so that
|
||||
``delegation.model`` / ``delegation.provider`` are picked up regardless
|
||||
of the entry point (CLI, gateway, cron).
|
||||
"""
|
||||
try:
|
||||
from cli import CLI_CONFIG
|
||||
return CLI_CONFIG.get("delegation", {})
|
||||
cfg = CLI_CONFIG.get("delegation", {})
|
||||
if cfg:
|
||||
return cfg
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
full = load_config()
|
||||
return full.get("delegation", {})
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ class ModalEnvironment(BaseEnvironment):
def __init__(
self,
image: str,
cwd: str = "~",
cwd: str = "/root",
timeout: int = 60,
modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
persistent_filesystem: bool = True,
@@ -95,6 +95,7 @@ class ModalEnvironment(BaseEnvironment):
startup_timeout=180.0,
runtime_timeout=3600.0,
modal_sandbox_kwargs=sandbox_kwargs,
install_pipx=True, # Required: installs pipx + swe-rex runtime (swerex-remote)
)

def execute(self, command: str, cwd: str = "", *,

@@ -538,6 +538,14 @@ class SamplingHandler:
f"Sampling LLM call failed: {_sanitize_error(str(exc))}"
)

# Guard against empty choices (content filtering, provider errors)
if not getattr(response, "choices", None):
self.metrics["errors"] += 1
return self._error(
f"LLM returned empty response (no choices) for server "
f"'{self.server_name}'"
)

# Track metrics
choice = response.choices[0]
self.metrics["requests"] += 1

@@ -119,6 +119,7 @@ def _handle_send(args):
"slack": Platform.SLACK,
"whatsapp": Platform.WHATSAPP,
"signal": Platform.SIGNAL,
"email": Platform.EMAIL,
}
platform = platform_map.get(platform_name)
if not platform:
@@ -185,6 +186,8 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None)
return await _send_slack(pconfig.token, chat_id, message)
elif platform == Platform.SIGNAL:
return await _send_signal(pconfig.extra, chat_id, message)
elif platform == Platform.EMAIL:
return await _send_email(pconfig.extra, chat_id, message)
return {"error": f"Direct sending not yet implemented for {platform.value}"}
@@ -283,6 +286,35 @@ async def _send_signal(extra, chat_id, message):
return {"error": f"Signal send failed: {e}"}


async def _send_email(extra, chat_id, message):
"""Send via SMTP (one-shot, no persistent connection needed)."""
import smtplib
from email.mime.text import MIMEText

address = extra.get("address") or os.getenv("EMAIL_ADDRESS", "")
password = os.getenv("EMAIL_PASSWORD", "")
smtp_host = extra.get("smtp_host") or os.getenv("EMAIL_SMTP_HOST", "")
smtp_port = int(os.getenv("EMAIL_SMTP_PORT", "587"))

if not all([address, password, smtp_host]):
return {"error": "Email not configured (EMAIL_ADDRESS, EMAIL_PASSWORD, EMAIL_SMTP_HOST required)"}

try:
msg = MIMEText(message, "plain", "utf-8")
msg["From"] = address
msg["To"] = chat_id
msg["Subject"] = "Hermes Agent"

server = smtplib.SMTP(smtp_host, smtp_port)
server.starttls()
server.login(address, password)
server.send_message(msg)
server.quit()
return {"success": True, "platform": "email", "chat_id": chat_id}
except Exception as e:
return {"error": f"Email send failed: {e}"}


def _check_send_message():
"""Gate send_message on gateway running (always available on messaging platforms)."""
platform = os.getenv("HERMES_SESSION_PLATFORM", "")
@@ -463,7 +463,7 @@ def _get_env_config() -> Dict[str, Any]:
if env_type == "local":
default_cwd = os.getcwd()
else:
default_cwd = "~"
default_cwd = "/root"

# Read TERMINAL_CWD but sanity-check it for container backends.
# If the CWD looks like a host-local path that can't exist inside a
@@ -553,7 +553,12 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int,
if memory > 0:
sandbox_kwargs["memory"] = memory
if disk > 0:
sandbox_kwargs["ephemeral_disk"] = disk
try:
import inspect, modal
if "ephemeral_disk" in inspect.signature(modal.Sandbox.create).parameters:
sandbox_kwargs["ephemeral_disk"] = disk
except Exception:
pass

return _ModalEnvironment(
image=image, cwd=cwd, timeout=timeout,
@@ -267,10 +267,16 @@ TOOLSETS = {
"includes": []
},

"hermes-email": {
"description": "Email bot toolset - interact with Hermes via email (IMAP/SMTP)",
"tools": _HERMES_CORE_TOOLS,
"includes": []
},

"hermes-gateway": {
"description": "Gateway toolset - union of all messaging platform tools",
"tools": [],
"includes": ["hermes-telegram", "hermes-discord", "hermes-whatsapp", "hermes-slack", "hermes-signal", "hermes-homeassistant"]
"includes": ["hermes-telegram", "hermes-discord", "hermes-whatsapp", "hermes-slack", "hermes-signal", "hermes-homeassistant", "hermes-email"]
}
}
@@ -22,7 +22,7 @@ Native Windows is **not supported**. Please install [WSL2](https://learn.microso
|
||||
|
||||
### What the Installer Does
|
||||
|
||||
The installer handles everything automatically — all dependencies (Python, Node.js, ripgrep, ffmpeg), the repo clone, virtual environment, and global `hermes` command setup. It finishes by running the interactive setup wizard to configure your LLM provider.
|
||||
The installer handles everything automatically — all dependencies (Python, Node.js, ripgrep, ffmpeg), the repo clone, virtual environment, global `hermes` command setup, and LLM provider configuration. By the end, you're ready to chat.
|
||||
|
||||
### After Installation
|
||||
|
||||
@@ -30,10 +30,19 @@ Reload your shell and start chatting:
|
||||
|
||||
```bash
|
||||
source ~/.bashrc # or: source ~/.zshrc
|
||||
hermes setup # Configure API keys (if you skipped during install)
|
||||
hermes # Start chatting!
|
||||
```
|
||||
|
||||
To reconfigure individual settings later, use the dedicated commands:
|
||||
|
||||
```bash
|
||||
hermes model # Choose your LLM provider and model
|
||||
hermes tools # Configure which tools are enabled
|
||||
hermes gateway setup # Set up messaging platforms
|
||||
hermes config set # Set individual config values
|
||||
hermes setup # Or run the full setup wizard to configure everything at once
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
@@ -192,10 +201,10 @@ echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zshrc && source ~/.zshrc
|
||||
fish_add_path $HOME/.local/bin
|
||||
```
|
||||
|
||||
### Step 9: Run the Setup Wizard (Optional)
|
||||
### Step 9: Configure Your Provider
|
||||
|
||||
```bash
|
||||
hermes setup
|
||||
hermes model # Select your LLM provider and model
|
||||
```
|
||||
|
||||
### Step 10: Verify the Installation
|
||||
@@ -253,7 +262,7 @@ hermes
|
||||
| Problem | Solution |
|
||||
|---------|----------|
|
||||
| `hermes: command not found` | Reload your shell (`source ~/.bashrc`) or check PATH |
|
||||
| `API key not set` | Run `hermes setup` or `hermes config set OPENROUTER_API_KEY your_key` |
|
||||
| `API key not set` | Run `hermes model` to configure your provider, or `hermes config set OPENROUTER_API_KEY your_key` |
|
||||
| Missing config after update | Run `hermes config check` then `hermes config migrate` |
|
||||
|
||||
For more diagnostics, run `hermes doctor` — it will tell you exactly what's missing and how to fix it.
@@ -29,13 +29,15 @@ source ~/.bashrc # or source ~/.zshrc

## 2. Set Up a Provider

The installer runs the setup wizard automatically. If you skipped it, run:
The installer configures your LLM provider automatically. To change it later, use one of these commands:

```bash
hermes setup
hermes model # Choose your LLM provider and model
hermes tools # Configure which tools are enabled
hermes setup # Or configure everything at once
```

This walks you through selecting an inference provider:
`hermes model` walks you through selecting an inference provider:

| Provider | What it is | How to set up |
|----------|-----------|---------------|
@@ -160,9 +162,9 @@ mcp_servers:
| Command | Description |
|---------|-------------|
| `hermes` | Start chatting |
| `hermes setup` | Configure providers and settings |
| `hermes model` | Switch provider or model |
| `hermes model` | Choose your LLM provider and model |
| `hermes tools` | Configure which tools are enabled per platform |
| `hermes setup` | Full setup wizard (configures everything at once) |
| `hermes doctor` | Diagnose issues |
| `hermes update` | Update to latest version |
| `hermes gateway` | Start the messaging gateway |

@@ -38,7 +38,7 @@ These are commands you run from your shell.

| Command | Description |
|---------|-------------|
| `hermes setup` | Full setup wizard (provider, terminal, messaging) |
| `hermes setup` | Full setup wizard — configures provider, model, terminal, and messaging all at once |
| `hermes config` | View current configuration |
| `hermes config edit` | Open config.yaml in your editor |
| `hermes config set KEY VAL` | Set a specific value |
@@ -147,6 +147,7 @@ Type `/` in the interactive CLI to see an autocomplete dropdown.
| `/config` | Show current configuration |
| `/prompt [text]` | View/set custom system prompt |
| `/personality [name]` | Set a predefined personality |
| `/reasoning [arg]` | Manage reasoning effort and display. Args: effort level (`none`, `low`, `medium`, `high`, `xhigh`) or display toggle (`show`, `hide`). No args shows current state. |

### Conversation

@@ -26,7 +26,7 @@ Hermes Agent works with any OpenAI-compatible API. Supported providers include:
- **MiniMax** — global and China endpoints
- **Local models** — via [Ollama](https://ollama.com/), [vLLM](https://docs.vllm.ai/), [llama.cpp](https://github.com/ggerganov/llama.cpp), [SGLang](https://github.com/sgl-project/sglang), or any OpenAI-compatible server

Set your provider with `hermes setup` or by editing `~/.hermes/.env`. See the [Environment Variables](./environment-variables.md) reference for all provider keys.
Set your provider with `hermes model` or by editing `~/.hermes/.env`. See the [Environment Variables](./environment-variables.md) reference for all provider keys.

### Does it work on Windows?

@@ -160,8 +160,8 @@ curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scri
# Check which keys are set
hermes config get OPENROUTER_API_KEY

# Re-run interactive setup
hermes setup
# Re-configure your provider
hermes model

# Or set directly
hermes config set OPENROUTER_API_KEY sk-or-v1-xxxxxxxxxxxx
@@ -279,7 +279,7 @@ hermes gateway logs
**Cause:** Network issues, bot token expired, or platform webhook misconfiguration.

**Solution:**
- Verify your bot token is valid with `hermes setup`
- Verify your bot token is valid with `hermes gateway setup`
- Check gateway logs: `hermes gateway logs`
- For webhook-based platforms (Slack, WhatsApp), ensure your server is publicly accessible

@@ -104,6 +104,7 @@ Type `/` to see an autocomplete dropdown of all available commands.
| `/config` | Show current configuration |
| `/prompt [text]` | View/set/clear custom system prompt |
| `/personality [name]` | Set a predefined personality |
| `/reasoning [arg]` | Manage reasoning effort (`none`/`low`/`medium`/`high`/`xhigh`) and display (`show`/`hide`) |

### Conversation Management

@@ -608,6 +608,16 @@ agent:

When unset, reasoning effort defaults to "medium" — a balanced level that works well for most tasks. Setting a value overrides this default; higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency.

You can also change the reasoning effort at runtime with the `/reasoning` command:

```
/reasoning # Show current effort level and display state
/reasoning high # Set reasoning effort to high
/reasoning none # Disable reasoning
/reasoning show # Show model thinking above each response
/reasoning hide # Hide model thinking
```

## TTS Configuration

```yaml
@@ -632,6 +642,7 @@ display:
compact: false # Compact output mode (less whitespace)
resume_display: full # full (show previous messages on resume) | minimal (one-liner only)
bell_on_complete: false # Play terminal bell when agent finishes (great for long tasks)
show_reasoning: false # Show model reasoning/thinking above each response (toggle with /reasoning show|hide)
```

| Mode | What you see |
@@ -729,8 +740,16 @@ delegation:
- terminal
- file
- web
# model: "google/gemini-3-flash-preview" # Override model (empty = inherit parent)
# provider: "openrouter" # Override provider (empty = inherit parent)
```

**Subagent provider:model override:** By default, subagents inherit the parent agent's provider and model. Set `delegation.provider` and `delegation.model` to route subagents to a different provider:model pair — e.g., use a cheap/fast model for narrowly scoped subtasks while your primary agent runs an expensive reasoning model.

The delegation provider uses the same credential resolution as CLI/gateway startup. All configured providers are supported: `openrouter`, `nous`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`. When a provider is set, the system automatically resolves the correct base URL, API key, and API mode — no manual credential wiring needed.

**Precedence:** `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter).
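
To make that precedence concrete, here is a minimal Python sketch of the resolution rule — illustrative only, with hypothetical names rather than the actual Hermes internals:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class AgentSettings:
    provider: str
    model: str

def resolve_subagent_settings(
    parent: AgentSettings,
    delegation_provider: Optional[str] = None,  # delegation.provider from config
    delegation_model: Optional[str] = None,     # delegation.model from config
) -> AgentSettings:
    """Config values win; anything left unset is inherited from the parent agent."""
    return AgentSettings(
        provider=delegation_provider or parent.provider,
        model=delegation_model or parent.model,
    )

parent = AgentSettings(provider="openrouter", model="parent-model-name")
# Only the model is overridden, so the parent's provider (and credentials) are kept.
print(resolve_subagent_settings(parent, delegation_model="google/gemini-3-flash-preview"))
```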
## Clarify

Configure the clarification prompt behavior:

176
website/docs/user-guide/messaging/email.md
Normal file
@@ -0,0 +1,176 @@
---
sidebar_position: 7
title: "Email"
description: "Set up Hermes Agent as an email assistant via IMAP/SMTP"
---

# Email Setup

Hermes can receive and reply to emails using standard IMAP and SMTP protocols. Send an email to the agent's address and it replies in-thread — no special client or bot API needed. Works with Gmail, Outlook, Yahoo, Fastmail, or any provider that supports IMAP/SMTP.

:::info No External Dependencies
The Email adapter uses Python's built-in `imaplib`, `smtplib`, and `email` modules. No additional packages or external services are required.
:::

---

## Prerequisites

- **A dedicated email account** for your Hermes agent (don't use your personal email)
- **IMAP enabled** on the email account
- **An app password** if using Gmail or another provider with 2FA

### Gmail Setup

1. Enable 2-Factor Authentication on your Google Account
2. Go to [App Passwords](https://myaccount.google.com/apppasswords)
3. Create a new App Password (select "Mail" or "Other")
4. Copy the 16-character password — you'll use this instead of your regular password

### Outlook / Microsoft 365

1. Go to [Security Settings](https://account.microsoft.com/security)
2. Enable 2FA if not already active
3. Create an App Password under "Additional security options"
4. IMAP host: `outlook.office365.com`, SMTP host: `smtp.office365.com`

### Other Providers

Most email providers support IMAP/SMTP. Check your provider's documentation for:
- IMAP host and port (usually port 993 with SSL)
- SMTP host and port (usually port 587 with STARTTLS)
- Whether app passwords are required
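
If you're unsure whether the settings you found are right, a quick standalone check with Python's standard library confirms that IMAP over SSL and SMTP with STARTTLS both accept your login before you wire them into Hermes. The hosts and credentials below are placeholders — substitute your own:

```python
import imaplib
import smtplib

EMAIL = "hermes@example.com"        # placeholder — your agent's address
PASSWORD = "app-password-here"      # placeholder — app password, not your main password
IMAP_HOST, IMAP_PORT = "imap.example.com", 993   # IMAP over SSL
SMTP_HOST, SMTP_PORT = "smtp.example.com", 587   # SMTP with STARTTLS

# IMAP: connect over SSL, log in, and open the inbox
with imaplib.IMAP4_SSL(IMAP_HOST, IMAP_PORT) as imap:
    imap.login(EMAIL, PASSWORD)
    status, _ = imap.select("INBOX")
    print("IMAP:", status)          # "OK" means the mailbox is reachable

# SMTP: connect, upgrade the connection with STARTTLS, and log in
with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as smtp:
    smtp.starttls()
    smtp.login(EMAIL, PASSWORD)
    print("SMTP: OK")               # reaching this line means the login succeeded
```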
---

## Step 1: Configure Hermes

The easiest way:

```bash
hermes gateway setup
```

Select **Email** from the platform menu. The wizard prompts for your email address, password, IMAP/SMTP hosts, and allowed senders.

### Manual Configuration

Add to `~/.hermes/.env`:

```bash
# Required
EMAIL_ADDRESS=hermes@gmail.com
EMAIL_PASSWORD=abcd efgh ijkl mnop # App password (not your regular password)
EMAIL_IMAP_HOST=imap.gmail.com
EMAIL_SMTP_HOST=smtp.gmail.com

# Security (recommended)
EMAIL_ALLOWED_USERS=your@email.com,colleague@work.com

# Optional
EMAIL_IMAP_PORT=993 # Default: 993 (IMAP SSL)
EMAIL_SMTP_PORT=587 # Default: 587 (SMTP STARTTLS)
EMAIL_POLL_INTERVAL=15 # Seconds between inbox checks (default: 15)
EMAIL_HOME_ADDRESS=your@email.com # Default delivery target for cron jobs
```

---

## Step 2: Start the Gateway

```bash
hermes gateway # Run in foreground
hermes gateway install # Install as a system service
```

On startup, the adapter:
1. Tests IMAP and SMTP connections
2. Marks all existing inbox messages as "seen" (only processes new emails)
3. Starts polling for new messages
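
The "mark as seen" step is what keeps the agent from replying to your entire backlog the first time it starts. Roughly, in standard-library terms, it amounts to the following — an illustrative sketch, not the adapter's actual code:

```python
import imaplib

def mark_existing_as_seen(imap: imaplib.IMAP4_SSL) -> int:
    """Flag every message already in the inbox as \\Seen so only new mail gets processed."""
    imap.select("INBOX")
    _, data = imap.search(None, "ALL")
    message_ids = data[0].split()
    for msg_id in message_ids:
        imap.store(msg_id, "+FLAGS", "\\Seen")
    return len(message_ids)
```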
---

## How It Works

### Receiving Messages

The adapter polls the IMAP inbox for UNSEEN messages at a configurable interval (default: 15 seconds). For each new email:

- **Subject line** is included as context (e.g., `[Subject: Deploy to production]`)
- **Reply emails** (subject starting with `Re:`) skip the subject prefix — the thread context is already established
- **Attachments** are cached locally:
  - Images (JPEG, PNG, GIF, WebP) → available to the vision tool
  - Documents (PDF, ZIP, etc.) → available for file access
- **HTML-only emails** have tags stripped for plain text extraction
- **Self-messages** are filtered out to prevent reply loops
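
In standard-library terms, the receive path looks roughly like the sketch below — a simplified illustration rather than the adapter's actual implementation (attachment caching, the vision handoff, and error handling are omitted):

```python
import email
import imaplib
import re
from email.message import Message
from email.utils import parseaddr

def fetch_unseen(imap: imaplib.IMAP4_SSL, self_address: str) -> list[tuple[str, str, str]]:
    """Return (sender, subject, body) for each unseen message, skipping our own mail."""
    imap.select("INBOX")
    _, data = imap.search(None, "UNSEEN")
    messages = []
    for msg_id in data[0].split():
        _, parts = imap.fetch(msg_id, "(RFC822)")
        msg = email.message_from_bytes(parts[0][1])
        sender = parseaddr(msg.get("From", ""))[1]
        if sender.lower() == self_address.lower():
            continue  # filter self-messages to avoid reply loops
        messages.append((sender, msg.get("Subject", ""), extract_text(msg)))
    return messages

def extract_text(msg: Message) -> str:
    """Prefer the text/plain part; fall back to crudely stripping tags from HTML."""
    parts = msg.walk() if msg.is_multipart() else [msg]
    plain, html = "", ""
    for part in parts:
        payload = part.get_payload(decode=True)
        if payload is None:
            continue  # skip multipart containers
        text = payload.decode(errors="replace")
        if part.get_content_type() == "text/plain" and not plain:
            plain = text
        elif part.get_content_type() == "text/html" and not html:
            html = text
    return plain or re.sub(r"<[^>]+>", " ", html)
```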
### Sending Replies

Replies are sent via SMTP with proper email threading:

- **In-Reply-To** and **References** headers maintain the thread
- **Subject line** preserved with `Re:` prefix (no double `Re: Re:`)
- **Message-ID** generated with the agent's domain
- Responses are sent as plain text (UTF-8)
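
A minimal sketch of that threading logic using `email.message.EmailMessage` — again illustrative rather than the adapter's exact code:

```python
import smtplib
from email.message import EmailMessage
from email.utils import make_msgid

def build_reply(original: EmailMessage, from_addr: str, body: str) -> EmailMessage:
    reply = EmailMessage()
    reply["From"] = from_addr
    reply["To"] = original["Reply-To"] or original["From"]
    subject = original["Subject"] or ""
    # Keep the subject, adding a single "Re:" only if one isn't there already
    reply["Subject"] = subject if subject.lower().startswith("re:") else f"Re: {subject}"
    # Thread the reply: In-Reply-To points at the original, References is extended
    if original["Message-ID"]:
        reply["In-Reply-To"] = original["Message-ID"]
        reply["References"] = f"{original['References'] or ''} {original['Message-ID']}".strip()
    # Generate a Message-ID on the agent's own domain
    reply["Message-ID"] = make_msgid(domain=from_addr.split("@", 1)[1])
    reply.set_content(body)  # plain text, UTF-8
    return reply

def send_reply(reply: EmailMessage, host: str, port: int, user: str, password: str) -> None:
    with smtplib.SMTP(host, port) as smtp:
        smtp.starttls()
        smtp.login(user, password)
        smtp.send_message(reply)
```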
### File Attachments

The agent can send file attachments in replies. Include `MEDIA:/path/to/file` in the response and the file is attached to the outgoing email.
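
Under some assumptions about the marker format (the regex here is illustrative, not the adapter's exact parser), attaching those files to a reply built with `EmailMessage` might look like:

```python
import mimetypes
import re
from email.message import EmailMessage
from pathlib import Path

MEDIA_MARKER = re.compile(r"MEDIA:(\S+)")

def attach_media(reply: EmailMessage, agent_text: str) -> str:
    """Strip MEDIA:/path markers from the agent's text and attach the referenced files."""
    for raw_path in MEDIA_MARKER.findall(agent_text):
        path = Path(raw_path)
        if not path.is_file():
            continue  # ignore markers that don't point at a real file
        ctype, _ = mimetypes.guess_type(path.name)
        maintype, subtype = (ctype or "application/octet-stream").split("/", 1)
        reply.add_attachment(path.read_bytes(), maintype=maintype,
                             subtype=subtype, filename=path.name)
    return MEDIA_MARKER.sub("", agent_text).strip()
```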
---

## Access Control

Email access follows the same pattern as all other Hermes platforms:

1. **`EMAIL_ALLOWED_USERS` set** → only emails from those addresses are processed
2. **No allowlist set** → unknown senders get a pairing code
3. **`EMAIL_ALLOW_ALL_USERS=true`** → any sender is accepted (use with caution)
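
Those three rules boil down to a simple check. The sketch below mirrors them in plain Python (the pairing-code flow is out of scope here, so an unlisted sender is simply treated as not yet allowed):

```python
import os

def is_sender_allowed(sender: str) -> bool:
    """Illustrative allowlist check mirroring the three rules above."""
    # Rule 3: access explicitly opened to everyone
    if os.getenv("EMAIL_ALLOW_ALL_USERS", "false").lower() == "true":
        return True
    allowlist = os.getenv("EMAIL_ALLOWED_USERS", "")
    if allowlist:
        # Rule 1: only listed addresses are processed
        allowed = {addr.strip().lower() for addr in allowlist.split(",") if addr.strip()}
        return sender.lower() in allowed
    # Rule 2: no allowlist — the sender would have to complete pairing first
    return False
```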
:::warning
**Always configure `EMAIL_ALLOWED_USERS`.** Without it, anyone who knows the agent's email address could send commands. The agent has terminal access by default.
:::

---

## Troubleshooting

| Problem | Solution |
|---------|----------|
| **"IMAP connection failed"** at startup | Verify `EMAIL_IMAP_HOST` and `EMAIL_IMAP_PORT`. Ensure IMAP is enabled on the account. For Gmail, enable it in Settings → Forwarding and POP/IMAP. |
| **"SMTP connection failed"** at startup | Verify `EMAIL_SMTP_HOST` and `EMAIL_SMTP_PORT`. Check that your password is correct (use App Password for Gmail). |
| **Messages not received** | Check `EMAIL_ALLOWED_USERS` includes the sender's email. Check spam folder — some providers flag automated replies. |
| **"Authentication failed"** | For Gmail, you must use an App Password, not your regular password. Ensure 2FA is enabled first. |
| **Duplicate replies** | Ensure only one gateway instance is running. Check `hermes gateway status`. |
| **Slow response** | The default poll interval is 15 seconds. Reduce with `EMAIL_POLL_INTERVAL=5` for faster response (but more IMAP connections). |
| **Replies not threading** | The adapter uses In-Reply-To headers. Some email clients (especially web-based) may not thread correctly with automated messages. |

---

## Security

:::warning
**Use a dedicated email account.** Don't use your personal email — the agent stores the password in `.env` and has full inbox access via IMAP.
:::

- Use **App Passwords** instead of your main password (required for Gmail with 2FA)
- Set `EMAIL_ALLOWED_USERS` to restrict who can interact with the agent
- The password is stored in `~/.hermes/.env` — protect this file (`chmod 600`)
- IMAP uses SSL (port 993) and SMTP uses STARTTLS (port 587) by default — connections are encrypted

---

## Environment Variables Reference

| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `EMAIL_ADDRESS` | Yes | — | Agent's email address |
| `EMAIL_PASSWORD` | Yes | — | Email password or app password |
| `EMAIL_IMAP_HOST` | Yes | — | IMAP server host (e.g., `imap.gmail.com`) |
| `EMAIL_SMTP_HOST` | Yes | — | SMTP server host (e.g., `smtp.gmail.com`) |
| `EMAIL_IMAP_PORT` | No | `993` | IMAP server port |
| `EMAIL_SMTP_PORT` | No | `587` | SMTP server port |
| `EMAIL_POLL_INTERVAL` | No | `15` | Seconds between inbox checks |
| `EMAIL_ALLOWED_USERS` | No | — | Comma-separated allowed sender addresses |
| `EMAIL_HOME_ADDRESS` | No | — | Default delivery target for cron jobs |
| `EMAIL_ALLOW_ALL_USERS` | No | `false` | Allow all senders (not recommended) |
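
If you manage these values outside of `hermes gateway setup`, a small loader like the following shows how the required and defaulted variables fit together — an illustrative sketch, not how Hermes itself reads them:

```python
import os
from dataclasses import dataclass

@dataclass
class EmailConfig:
    address: str
    password: str
    imap_host: str
    smtp_host: str
    imap_port: int
    smtp_port: int
    poll_interval: int

def load_email_config() -> EmailConfig:
    """Read the variables above from the environment, applying the documented defaults."""
    return EmailConfig(
        address=os.environ["EMAIL_ADDRESS"],      # required — raises KeyError if missing
        password=os.environ["EMAIL_PASSWORD"],    # required
        imap_host=os.environ["EMAIL_IMAP_HOST"],  # required
        smtp_host=os.environ["EMAIL_SMTP_HOST"],  # required
        imap_port=int(os.getenv("EMAIL_IMAP_PORT", "993")),
        smtp_port=int(os.getenv("EMAIL_SMTP_PORT", "587")),
        poll_interval=int(os.getenv("EMAIL_POLL_INTERVAL", "15")),
    )
```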
@@ -1,12 +1,12 @@
---
sidebar_position: 1
title: "Messaging Gateway"
description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, or Signal — architecture and setup overview"
description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, or Email — architecture and setup overview"
---

# Messaging Gateway

Chat with Hermes from Telegram, Discord, Slack, WhatsApp, or Signal. The gateway is a single background process that connects to all your configured platforms, handles sessions, runs cron jobs, and delivers voice messages.
Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, or Email. The gateway is a single background process that connects to all your configured platforms, handles sessions, runs cron jobs, and delivers voice messages.

## Architecture

@@ -15,12 +15,12 @@ Chat with Hermes from Telegram, Discord, Slack, WhatsApp, or Signal. The gateway
│ Hermes Gateway │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────────┐ │
│ │ Telegram │ │ Discord │ │ WhatsApp │ │ Slack │ │ Signal │ │
│ │ Adapter │ │ Adapter │ │ Adapter │ │ Adapter │ │ Adapter│ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ └───┬────┘ │
│ │ │ │ │ │ │
│ └─────────────┼────────────┼─────────────┼───────────┘ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────────┐ ┌────────┐ ┌───────┐│
│ │ Telegram │ │ Discord │ │ WhatsApp │ │ Slack │ │ Signal │ │ Email ││
│ │ Adapter │ │ Adapter │ │ Adapter │ │Adapter │ │Adapter │ │Adapter││
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └───┬────┘ └───┬────┘ └──┬────┘│
│ │ │ │ │ │ │ │
│ └─────────────┼────────────┼────────────┼──────────┼─────────┘ │
│ │ │
│ ┌────────▼────────┐ │
│ │ Session Store │ │
@@ -114,9 +114,10 @@ Configure per-platform overrides in `~/.hermes/gateway.json`:
# Restrict to specific users (recommended):
TELEGRAM_ALLOWED_USERS=123456789,987654321
DISCORD_ALLOWED_USERS=123456789012345678
SIGNAL_ALLOWED_USERS=+15551234567,+15559876543
SIGNAL_ALLOWED_USERS=+155****4567,+155****6543
EMAIL_ALLOWED_USERS=trusted@example.com,colleague@work.com

# Or allow specific users across all platforms (comma-separated user IDs):
# Or allow
GATEWAY_ALLOWED_USERS=123456789,987654321

# Or explicitly allow all users (NOT recommended for bots with terminal access):
@@ -202,6 +203,7 @@ Each platform has its own toolset:
| WhatsApp | `hermes-whatsapp` | Full tools including terminal |
| Slack | `hermes-slack` | Full tools including terminal |
| Signal | `hermes-signal` | Full tools including terminal |
| Email | `hermes-email` | Full tools including terminal |

## Next Steps

@@ -210,3 +212,4 @@ Each platform has its own toolset:
- [Slack Setup](slack.md)
- [WhatsApp Setup](whatsapp.md)
- [Signal Setup](signal.md)
- [Email Setup](email.md)