Compare commits

..

2 Commits

Author SHA1 Message Date
teknium1
a477118337 Merge PR #425: feat(#417): add pokemon-player skill
Authored by teyrebaz33. Closes #417.
Adds pokemon-player skill for playing Pokemon via headless emulation
using the pokemon-agent package (NousResearch/pokemon-agent).
2026-03-09 05:42:58 -07:00
teyrebaz33
99d9ea1464 feat(#417): add pokemon-player skill
Thin skill file that wraps the pokemon-agent pip package.
All emulation logic lives in NousResearch/pokemon-agent.

- Gameplay loop: observe/orient/decide/act/verify/record/save
- Battle strategy with Gen 1 quirks and type chart
- Memory conventions with PKM: prefix
- Progression milestones (all 8 badges + Elite Four)
- Session save/load lifecycle
- Dashboard reference (localhost:8765/dashboard)
2026-03-05 14:31:29 +03:00
19 changed files with 230 additions and 2044 deletions

View File

@@ -560,16 +560,12 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
forced = _get_auxiliary_provider("vision")
if forced != "auto":
return _resolve_forced_provider(forced)
# Auto: try providers known to support multimodal first, then fall
# back to the user's custom endpoint. Many local models (Qwen-VL,
# LLaVA, Pixtral, etc.) support vision — skipping them entirely
# caused silent failures for local-only users.
for try_fn in (_try_openrouter, _try_nous, _try_codex,
_try_custom_endpoint):
# Auto: only multimodal-capable providers
for try_fn in (_try_openrouter, _try_nous, _try_codex):
client, model = try_fn()
if client is not None:
return client, model
logger.debug("Auxiliary vision client: none available")
logger.debug("Auxiliary vision client: none available (auto only tries OpenRouter/Nous/Codex)")
return None, None

44
cli.py
View File

@@ -158,7 +158,6 @@ def load_cli_config() -> Dict[str, Any]:
"singularity_image": "docker://python:3.11",
"modal_image": "python:3.11",
"daytona_image": "nikolaik/python-nodejs:python3.11-nodejs20",
"docker_volumes": [], # host:container volume mounts for Docker backend
},
"browser": {
"inactivity_timeout": 120, # Auto-cleanup inactive browser sessions after 2 min
@@ -726,7 +725,6 @@ HERMES_CADUCEUS = """[#CD7F32]⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣀⡀⠀⣀⣀
[#B8860B]⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀[/]"""
# Compact banner for smaller terminals (fallback)
# Note: built dynamically by _build_compact_banner() to fit terminal width
COMPACT_BANNER = """
[bold #FFD700]╔══════════════════════════════════════════════════════════════╗[/]
[bold #FFD700]║[/] [#FFBF00]⚕ NOUS HERMES[/] [dim #B8860B]- AI Agent Framework[/] [bold #FFD700]║[/]
@@ -735,26 +733,6 @@ COMPACT_BANNER = """
"""
def _build_compact_banner() -> str:
"""Build a compact banner that fits the current terminal width."""
w = min(shutil.get_terminal_size().columns - 2, 64)
if w < 30:
return "\n[#FFBF00]⚕ NOUS HERMES[/] [dim #B8860B]- Nous Research[/]\n"
inner = w - 2 # inside the box border
bar = "" * w
line1 = "⚕ NOUS HERMES - AI Agent Framework"
line2 = "Messenger of the Digital Gods · Nous Research"
# Truncate and pad to fit
line1 = line1[:inner - 2].ljust(inner - 2)
line2 = line2[:inner - 2].ljust(inner - 2)
return (
f"\n[bold #FFD700]╔{bar}╗[/]\n"
f"[bold #FFD700]║[/] [#FFBF00]{line1}[/] [bold #FFD700]║[/]\n"
f"[bold #FFD700]║[/] [dim #B8860B]{line2}[/] [bold #FFD700]║[/]\n"
f"[bold #FFD700]╚{bar}╝[/]\n"
)
def _get_available_skills() -> Dict[str, List[str]]:
"""
Scan ~/.hermes/skills/ and return skills grouped by category.
@@ -952,12 +930,10 @@ def build_welcome_banner(console: Console, model: str, cwd: str, tools: List[dic
padding=(0, 2),
)
# Print the big HERMES-AGENT logo — skip if terminal is too narrow
# Print the big HERMES-AGENT logo first (no panel wrapper for full width)
console.print()
console.print(HERMES_AGENT_LOGO)
console.print()
term_width = shutil.get_terminal_size().columns
if term_width >= 95:
console.print(HERMES_AGENT_LOGO)
console.print()
# Print the panel with caduceus and info
console.print(outer_panel)
@@ -1407,13 +1383,8 @@ class HermesCLI:
"""Display the welcome banner in Claude Code style."""
self.console.clear()
# Auto-compact for narrow terminals — the full banner with caduceus
# + tool list needs ~80 columns minimum to render without wrapping.
term_width = shutil.get_terminal_size().columns
use_compact = self.compact or term_width < 80
if use_compact:
self.console.print(_build_compact_banner())
if self.compact:
self.console.print(COMPACT_BANNER)
self._show_status()
else:
# Get tools for display
@@ -2423,9 +2394,8 @@ class HermesCLI:
# and gets mangled by patch_stdout).
if self._app:
cc = ChatConsole()
term_w = shutil.get_terminal_size().columns
if self.compact or term_w < 80:
cc.print(_build_compact_banner())
if self.compact:
cc.print(COMPACT_BANNER)
else:
tools = get_tool_definitions(enabled_toolsets=self.enabled_toolsets, quiet_mode=True)
cwd = os.getenv("TERMINAL_CWD", os.getcwd())

View File

@@ -1,46 +0,0 @@
# datagen-config-examples/web_research.yaml
#
# Batch data generation config for WebResearchEnv.
# Generates tool-calling trajectories for multi-step web research tasks.
#
# Usage:
# python batch_runner.py \
# --config datagen-config-examples/web_research.yaml \
# --run_name web_research_v1
environment: web-research
# Toolsets available to the agent during data generation
toolsets:
- web
- file
# How many parallel workers to use
num_workers: 4
# Questions per batch
batch_size: 20
# Total trajectories to generate (comment out to run full dataset)
max_items: 500
# Model to use for generation (override with --model flag)
model: openrouter/nousresearch/hermes-3-llama-3.1-405b
# System prompt additions (ephemeral — not saved to trajectories)
ephemeral_system_prompt: |
You are a highly capable research agent. When asked a factual question,
always use web_search to find current, accurate information before answering.
Cite at least 2 sources. Be concise and accurate.
# Output directory
output_dir: data/web_research_v1
# Trajectory compression settings (for fitting into training token budgets)
compression:
enabled: true
target_max_tokens: 16000
# Eval settings
eval_every: 100 # Run eval every N trajectories
eval_size: 25 # Number of held-out questions per eval run

View File

@@ -1,718 +0,0 @@
"""
WebResearchEnv — RL Environment for Multi-Step Web Research
============================================================
Trains models to do accurate, efficient, multi-source web research.
Reward signals:
- Answer correctness (LLM judge, 0.01.0)
- Source diversity (used ≥2 distinct domains)
- Efficiency (penalizes excessive tool calls)
- Tool usage (bonus for actually using web tools)
Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions
HuggingFace: google/frames-benchmark
Fallback: built-in sample questions (no HF token needed)
Usage:
# Phase 1 (OpenAI-compatible server)
python environments/web_research_env.py serve \\
--openai.base_url http://localhost:8000/v1 \\
--openai.model_name YourModel \\
--openai.server_type openai
# Process mode (offline data generation)
python environments/web_research_env.py process \\
--env.data_path_to_save_groups data/web_research.jsonl
# Standalone eval
python environments/web_research_env.py evaluate \\
--openai.base_url http://localhost:8000/v1 \\
--openai.model_name YourModel
Built by: github.com/jackx707
Inspired by: GroceryMind — production Hermes agent doing live web research
across German grocery stores (firecrawl + hermes-agent)
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import random
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse
from pydantic import Field
# Ensure hermes-agent root is on path
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
# ---------------------------------------------------------------------------
# Optional HuggingFace datasets import
# ---------------------------------------------------------------------------
try:
from datasets import load_dataset
HF_AVAILABLE = True
except ImportError:
HF_AVAILABLE = False
from atroposlib.envs.base import ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.agent_loop import AgentResult
from environments.tool_context import ToolContext
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Fallback sample dataset (used when HuggingFace is unavailable)
# Multi-hop questions requiring real web search to answer.
# ---------------------------------------------------------------------------
SAMPLE_QUESTIONS = [
{
"question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?",
"answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.",
"difficulty": "medium",
"hops": 2,
},
{
"question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?",
"answer": "The Linux Foundation oversees Kubernetes. CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.",
"difficulty": "medium",
"hops": 2,
},
{
"question": "What programming language was used to write the original version of the web framework used by Instagram?",
"answer": "Django, which Instagram was built on, is written in Python.",
"difficulty": "easy",
"hops": 2,
},
{
"question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?",
"answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).",
"difficulty": "hard",
"hops": 3,
},
{
"question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?",
"answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.",
"difficulty": "medium",
"hops": 2,
},
{
"question": "How many employees does the parent company of Instagram have?",
"answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.",
"difficulty": "medium",
"hops": 2,
},
{
"question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?",
"answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.",
"difficulty": "hard",
"hops": 2,
},
{
"question": "Which company acquired the startup founded by the creator of Oculus VR?",
"answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.",
"difficulty": "medium",
"hops": 2,
},
{
"question": "What is the market cap of the company that owns the most popular search engine in Russia?",
"answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.",
"difficulty": "hard",
"hops": 2,
},
{
"question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?",
"answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.",
"difficulty": "hard",
"hops": 2,
},
]
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
class WebResearchEnvConfig(HermesAgentEnvConfig):
"""Configuration for the web research RL environment."""
# Reward weights
correctness_weight: float = Field(
default=0.6,
description="Weight for answer correctness in reward (LLM judge score).",
)
tool_usage_weight: float = Field(
default=0.2,
description="Weight for tool usage signal (did the model actually use web tools?).",
)
efficiency_weight: float = Field(
default=0.2,
description="Weight for efficiency signal (penalizes excessive tool calls).",
)
diversity_bonus: float = Field(
default=0.1,
description="Bonus reward for citing ≥2 distinct domains.",
)
# Efficiency thresholds
efficient_max_calls: int = Field(
default=5,
description="Maximum tool calls before efficiency penalty begins.",
)
heavy_penalty_calls: int = Field(
default=10,
description="Tool call count where efficiency penalty steepens.",
)
# Eval
eval_size: int = Field(
default=20,
description="Number of held-out items for evaluation.",
)
eval_split_ratio: float = Field(
default=0.1,
description="Fraction of dataset to hold out for evaluation (0.01.0).",
)
# Dataset
dataset_name: str = Field(
default="google/frames-benchmark",
description="HuggingFace dataset name for research questions.",
)
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
class WebResearchEnv(HermesAgentBaseEnv):
"""
RL environment for training multi-step web research skills.
The model is given a factual question requiring 2-3 hops of web research
and must use web_search / web_extract tools to find and synthesize the answer.
Reward is multi-signal:
60% — answer correctness (LLM judge)
20% — tool usage (did the model actually search the web?)
20% — efficiency (penalizes >5 tool calls)
Bonus +0.1 for source diversity (≥2 distinct domains cited).
"""
name = "web-research"
env_config_cls = WebResearchEnvConfig
# Default toolsets for this environment — web + file for saving notes
default_toolsets = ["web", "file"]
@classmethod
def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]:
"""Default configuration for the web research environment."""
env_config = WebResearchEnvConfig(
enabled_toolsets=["web", "file"],
max_agent_turns=15,
agent_temperature=1.0,
system_prompt=(
"You are a highly capable research agent. When asked a factual question, "
"always use web_search to find current, accurate information before answering. "
"Cite at least 2 sources. Be concise and accurate."
),
group_size=4,
total_steps=1000,
steps_per_eval=100,
use_wandb=True,
wandb_name="web-research",
)
server_configs = [
APIServerConfig(
base_url="https://openrouter.ai/api/v1",
model_name="anthropic/claude-sonnet-4.5",
server_type="openai",
api_key=os.getenv("OPENROUTER_API_KEY", ""),
health_check=False,
)
]
return env_config, server_configs
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._items: list[dict] = []
self._eval_items: list[dict] = []
self._index: int = 0
# Metrics tracking for wandb
self._reward_buffer: list[float] = []
self._correctness_buffer: list[float] = []
self._tool_usage_buffer: list[float] = []
self._efficiency_buffer: list[float] = []
self._diversity_buffer: list[float] = []
# ------------------------------------------------------------------
# 1. Setup — load dataset
# ------------------------------------------------------------------
async def setup(self) -> None:
"""Load the FRAMES benchmark or fall back to built-in samples."""
if HF_AVAILABLE:
try:
logger.info("Loading FRAMES benchmark from HuggingFace...")
ds = load_dataset(self.config.dataset_name, split="test")
self._items = [
{
"question": row["Prompt"],
"answer": row["Answer"],
"difficulty": row.get("reasoning_types", "unknown"),
"hops": 2,
}
for row in ds
]
# Hold out for eval
eval_size = max(
self.config.eval_size,
int(len(self._items) * self.config.eval_split_ratio),
)
random.shuffle(self._items)
self._eval_items = self._items[:eval_size]
self._items = self._items[eval_size:]
logger.info(
f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items "
f"from FRAMES benchmark."
)
return
except Exception as e:
logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.")
# Fallback
random.shuffle(SAMPLE_QUESTIONS)
split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10)
self._items = SAMPLE_QUESTIONS[:split]
self._eval_items = SAMPLE_QUESTIONS[split:]
logger.info(
f"Using built-in sample dataset: {len(self._items)} train / "
f"{len(self._eval_items)} eval items."
)
# ------------------------------------------------------------------
# 2. get_next_item — return the next question
# ------------------------------------------------------------------
async def get_next_item(self) -> dict:
"""Return the next item, cycling through the dataset."""
if not self._items:
raise RuntimeError("Dataset is empty. Did you call setup()?")
item = self._items[self._index % len(self._items)]
self._index += 1
return item
# ------------------------------------------------------------------
# 3. format_prompt — build the user-facing prompt
# ------------------------------------------------------------------
def format_prompt(self, item: dict) -> str:
"""Format the research question as a task prompt."""
return (
f"Research the following question thoroughly using web search. "
f"You MUST search the web to find current, accurate information — "
f"do not rely solely on your training data.\n\n"
f"Question: {item['question']}\n\n"
f"Requirements:\n"
f"- Use web_search and/or web_extract tools to find information\n"
f"- Search at least 2 different sources\n"
f"- Provide a concise, accurate answer (2-4 sentences)\n"
f"- Cite the sources you used"
)
# ------------------------------------------------------------------
# 4. compute_reward — multi-signal scoring
# ------------------------------------------------------------------
async def compute_reward(
self,
item: dict,
result: AgentResult,
ctx: ToolContext,
) -> float:
"""
Multi-signal reward function:
correctness_weight * correctness — LLM judge comparing answer to ground truth
tool_usage_weight * tool_used — binary: did the model use web tools?
efficiency_weight * efficiency — penalizes wasteful tool usage
+ diversity_bonus — source diversity (≥2 distinct domains)
"""
# Extract final response from messages (last assistant message with content)
final_response = ""
tools_used: list[str] = []
for msg in reversed(result.messages):
if msg.get("role") == "assistant" and msg.get("content") and not final_response:
final_response = msg["content"]
# Collect tool names from tool call messages
if msg.get("role") == "assistant" and msg.get("tool_calls"):
for tc in msg["tool_calls"]:
fn = tc.get("function", {}) if isinstance(tc, dict) else {}
name = fn.get("name", "")
if name:
tools_used.append(name)
tool_call_count: int = result.turns_used or len(tools_used)
cfg = self.config
# ---- Signal 1: Answer correctness (LLM judge) ----------------
correctness = await self._llm_judge(
question=item["question"],
expected=item["answer"],
model_answer=final_response,
)
# ---- Signal 2: Web tool usage --------------------------------
web_tools = {"web_search", "web_extract", "search", "firecrawl"}
tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0
# ---- Signal 3: Efficiency ------------------------------------
if tool_call_count <= cfg.efficient_max_calls:
efficiency = 1.0
elif tool_call_count <= cfg.heavy_penalty_calls:
efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08
else:
efficiency = max(0.0, 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.12)
# ---- Bonus: Source diversity ---------------------------------
domains = self._extract_domains(final_response)
diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0
# ---- Combine ------------------------------------------------
reward = (
cfg.correctness_weight * correctness
+ cfg.tool_usage_weight * tool_used
+ cfg.efficiency_weight * efficiency
+ diversity
)
reward = min(1.0, max(0.0, reward)) # clamp to [0, 1]
# Track for wandb
self._reward_buffer.append(reward)
self._correctness_buffer.append(correctness)
self._tool_usage_buffer.append(tool_used)
self._efficiency_buffer.append(efficiency)
self._diversity_buffer.append(diversity)
logger.debug(
f"Reward breakdown — correctness={correctness:.2f}, "
f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
f"diversity={diversity:.1f} → total={reward:.3f}"
)
return reward
# ------------------------------------------------------------------
# 5. evaluate — run on held-out eval split
# ------------------------------------------------------------------
async def evaluate(self, *args, **kwargs) -> None:
"""Run evaluation on the held-out split using the full agent loop with tools.
Each eval item runs through the same agent loop as training —
the model can use web_search, web_extract, etc. to research answers.
This measures actual agentic research capability, not just knowledge.
"""
import time
import uuid
from environments.agent_loop import HermesAgentLoop
from environments.tool_context import ToolContext
items = self._eval_items
if not items:
logger.warning("No eval items available.")
return
eval_size = min(self.config.eval_size, len(items))
eval_items = items[:eval_size]
logger.info(f"Running eval on {len(eval_items)} questions (with agent loop + tools)...")
start_time = time.time()
samples = []
# Resolve tools once for all eval items
tools, valid_names = self._resolve_tools_for_group()
for i, item in enumerate(eval_items):
task_id = str(uuid.uuid4())
logger.info(f"Eval [{i+1}/{len(eval_items)}]: {item['question'][:80]}...")
try:
# Build messages
messages: List[Dict[str, Any]] = []
if self.config.system_prompt:
messages.append({"role": "system", "content": self.config.system_prompt})
messages.append({"role": "user", "content": self.format_prompt(item)})
# Run the full agent loop with tools
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=0.0, # Deterministic for eval
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
# Extract final response and tool usage from messages
final_response = ""
tool_call_count = 0
for msg in reversed(result.messages):
if msg.get("role") == "assistant" and msg.get("content") and not final_response:
final_response = msg["content"]
if msg.get("role") == "assistant" and msg.get("tool_calls"):
tool_call_count += len(msg["tool_calls"])
# Compute reward (includes LLM judge for correctness)
# Temporarily save buffer lengths so we can extract the
# correctness score without calling judge twice, and avoid
# polluting training metric buffers with eval data.
buf_len = len(self._correctness_buffer)
ctx = ToolContext(task_id)
try:
reward = await self.compute_reward(item, result, ctx)
finally:
ctx.cleanup()
# Extract correctness from the buffer (compute_reward appended it)
# then remove eval entries from training buffers
correctness = (
self._correctness_buffer[buf_len]
if len(self._correctness_buffer) > buf_len
else 0.0
)
# Roll back buffers to avoid polluting training metrics
for buf in (
self._reward_buffer, self._correctness_buffer,
self._tool_usage_buffer, self._efficiency_buffer,
self._diversity_buffer,
):
if len(buf) > buf_len:
buf.pop()
samples.append({
"prompt": item["question"],
"response": final_response[:500],
"expected": item["answer"],
"correctness": correctness,
"reward": reward,
"tool_calls": tool_call_count,
"turns": result.turns_used,
})
logger.info(
f" → correctness={correctness:.2f}, reward={reward:.3f}, "
f"tools={tool_call_count}, turns={result.turns_used}"
)
except Exception as e:
logger.error(f"Eval error on item: {e}")
samples.append({
"prompt": item["question"],
"response": f"ERROR: {e}",
"expected": item["answer"],
"correctness": 0.0,
"reward": 0.0,
"tool_calls": 0,
"turns": 0,
})
end_time = time.time()
# Compute aggregate metrics
correctness_scores = [s["correctness"] for s in samples]
rewards = [s["reward"] for s in samples]
tool_counts = [s["tool_calls"] for s in samples]
n = len(samples)
eval_metrics = {
"eval/mean_correctness": sum(correctness_scores) / n if n else 0.0,
"eval/mean_reward": sum(rewards) / n if n else 0.0,
"eval/mean_tool_calls": sum(tool_counts) / n if n else 0.0,
"eval/tool_usage_rate": sum(1 for t in tool_counts if t > 0) / n if n else 0.0,
"eval/n_items": n,
}
logger.info(
f"Eval complete — correctness={eval_metrics['eval/mean_correctness']:.3f}, "
f"reward={eval_metrics['eval/mean_reward']:.3f}, "
f"tool_usage={eval_metrics['eval/tool_usage_rate']:.0%}"
)
await self.evaluate_log(
metrics=eval_metrics,
samples=samples,
start_time=start_time,
end_time=end_time,
)
# ------------------------------------------------------------------
# 6. wandb_log — custom metrics
# ------------------------------------------------------------------
async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
"""Log reward breakdown metrics to wandb."""
if wandb_metrics is None:
wandb_metrics = {}
if self._reward_buffer:
n = len(self._reward_buffer)
wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n
wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n
wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n
wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n
wandb_metrics["train/total_rollouts"] = n
# Accuracy buckets
wandb_metrics["train/correct_rate"] = (
sum(1 for c in self._correctness_buffer if c >= 0.7) / n
)
wandb_metrics["train/tool_usage_rate"] = (
sum(1 for t in self._tool_usage_buffer if t > 0) / n
)
# Clear buffers
self._reward_buffer.clear()
self._correctness_buffer.clear()
self._tool_usage_buffer.clear()
self._efficiency_buffer.clear()
self._diversity_buffer.clear()
await super().wandb_log(wandb_metrics)
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
async def _llm_judge(
self,
question: str,
expected: str,
model_answer: str,
) -> float:
"""
Use the server's LLM to judge answer correctness.
Falls back to keyword heuristic if LLM call fails.
"""
if not model_answer or not model_answer.strip():
return 0.0
judge_prompt = (
"You are an impartial judge evaluating the quality of an AI research answer.\n\n"
f"Question: {question}\n\n"
f"Reference answer: {expected}\n\n"
f"Model answer: {model_answer}\n\n"
"Score the model answer on a scale from 0.0 to 1.0 where:\n"
" 1.0 = fully correct and complete\n"
" 0.7 = mostly correct with minor gaps\n"
" 0.4 = partially correct\n"
" 0.1 = mentions relevant topic but wrong or very incomplete\n"
" 0.0 = completely wrong or no answer\n\n"
"Consider: factual accuracy, completeness, and relevance.\n"
'Respond with ONLY a JSON object: {"score": <float>, "reason": "<one sentence>"}'
)
try:
response = await self.server.chat_completion(
messages=[{"role": "user", "content": judge_prompt}],
n=1,
max_tokens=150,
temperature=0.0,
split="eval",
)
text = response.choices[0].message.content if response.choices else ""
parsed = self._parse_judge_json(text)
if parsed is not None:
return float(parsed)
except Exception as e:
logger.debug(f"LLM judge failed: {e}. Using heuristic.")
return self._heuristic_score(expected, model_answer)
@staticmethod
def _parse_judge_json(text: str) -> Optional[float]:
"""Extract the score float from LLM judge JSON response."""
try:
clean = re.sub(r"```(?:json)?|```", "", text).strip()
data = json.loads(clean)
score = float(data.get("score", -1))
if 0.0 <= score <= 1.0:
return score
except Exception:
match = re.search(r'"score"\s*:\s*([0-9.]+)', text)
if match:
score = float(match.group(1))
if 0.0 <= score <= 1.0:
return score
return None
@staticmethod
def _heuristic_score(expected: str, model_answer: str) -> float:
"""Lightweight keyword overlap score as fallback."""
stopwords = {
"the", "a", "an", "is", "are", "was", "were", "of", "in", "on",
"at", "to", "for", "with", "and", "or", "but", "it", "its",
"this", "that", "as", "by", "from", "be", "has", "have", "had",
}
def tokenize(text: str) -> set:
tokens = re.findall(r'\b\w+\b', text.lower())
return {t for t in tokens if t not in stopwords and len(t) > 2}
expected_tokens = tokenize(expected)
answer_tokens = tokenize(model_answer)
if not expected_tokens:
return 0.5
overlap = len(expected_tokens & answer_tokens)
union = len(expected_tokens | answer_tokens)
jaccard = overlap / union if union > 0 else 0.0
recall = overlap / len(expected_tokens)
return min(1.0, 0.4 * jaccard + 0.6 * recall)
@staticmethod
def _extract_domains(text: str) -> set:
"""Extract unique domains from URLs cited in the response."""
urls = re.findall(r'https?://[^\s\)>\]"\']+', text)
domains = set()
for url in urls:
try:
parsed = urlparse(url)
domain = parsed.netloc.lower().lstrip("www.")
if domain:
domains.add(domain)
except Exception:
pass
return domains
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
WebResearchEnv.cli()

View File

@@ -10,7 +10,6 @@ Uses slack-bolt (Python) with Socket Mode for:
import asyncio
import os
import re
from typing import Dict, List, Optional, Any
try:
@@ -34,8 +33,6 @@ from gateway.platforms.base import (
MessageEvent,
MessageType,
SendResult,
SUPPORTED_DOCUMENT_TYPES,
cache_document_from_bytes,
cache_image_from_url,
cache_audio_from_url,
)
@@ -99,13 +96,6 @@ class SlackAdapter(BasePlatformAdapter):
async def handle_message_event(event, say):
await self._handle_slack_message(event)
# Acknowledge app_mention events to prevent Bolt 404 errors.
# The "message" handler above already processes @mentions in
# channels, so this is intentionally a no-op to avoid duplicates.
@self._app.event("app_mention")
async def handle_app_mention(event, say):
pass
# Register slash command handler
@self._app.command("/hermes")
async def handle_hermes_command(ack, command):
@@ -276,65 +266,6 @@ class SlackAdapter(BasePlatformAdapter):
except Exception as e:
return SendResult(success=False, error=str(e))
async def send_video(
self,
chat_id: str,
video_path: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send a video file to Slack."""
if not self._app:
return SendResult(success=False, error="Not connected")
if not os.path.exists(video_path):
return SendResult(success=False, error=f"Video file not found: {video_path}")
try:
result = await self._app.client.files_upload_v2(
channel=chat_id,
file=video_path,
filename=os.path.basename(video_path),
initial_comment=caption or "",
thread_ts=reply_to,
)
return SendResult(success=True, raw_response=result)
except Exception as e:
print(f"[{self.name}] Failed to send video: {e}")
return await super().send_video(chat_id, video_path, caption, reply_to)
async def send_document(
self,
chat_id: str,
file_path: str,
caption: Optional[str] = None,
file_name: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send a document/file attachment to Slack."""
if not self._app:
return SendResult(success=False, error="Not connected")
if not os.path.exists(file_path):
return SendResult(success=False, error=f"File not found: {file_path}")
display_name = file_name or os.path.basename(file_path)
try:
result = await self._app.client.files_upload_v2(
channel=chat_id,
file=file_path,
filename=display_name,
initial_comment=caption or "",
thread_ts=reply_to,
)
return SendResult(success=True, raw_response=result)
except Exception as e:
print(f"[{self.name}] Failed to send document: {e}")
return await super().send_document(chat_id, file_path, caption, file_name, reply_to)
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
"""Get information about a Slack channel."""
if not self._app:
@@ -416,58 +347,6 @@ class SlackAdapter(BasePlatformAdapter):
msg_type = MessageType.VOICE
except Exception as e:
print(f"[Slack] Failed to cache audio: {e}", flush=True)
elif url:
# Try to handle as a document attachment
try:
original_filename = f.get("name", "")
ext = ""
if original_filename:
_, ext = os.path.splitext(original_filename)
ext = ext.lower()
# Fallback: reverse-lookup from MIME type
if not ext and mimetype:
mime_to_ext = {v: k for k, v in SUPPORTED_DOCUMENT_TYPES.items()}
ext = mime_to_ext.get(mimetype, "")
if ext not in SUPPORTED_DOCUMENT_TYPES:
continue # Skip unsupported file types silently
# Check file size (Slack limit: 20 MB for bots)
file_size = f.get("size", 0)
MAX_DOC_BYTES = 20 * 1024 * 1024
if not file_size or file_size > MAX_DOC_BYTES:
print(f"[Slack] Document too large or unknown size: {file_size}", flush=True)
continue
# Download and cache
raw_bytes = await self._download_slack_file_bytes(url)
cached_path = cache_document_from_bytes(
raw_bytes, original_filename or f"document{ext}"
)
doc_mime = SUPPORTED_DOCUMENT_TYPES[ext]
media_urls.append(cached_path)
media_types.append(doc_mime)
msg_type = MessageType.DOCUMENT
print(f"[Slack] Cached user document: {cached_path}", flush=True)
# Inject text content for .txt/.md files (capped at 100 KB)
MAX_TEXT_INJECT_BYTES = 100 * 1024
if ext in (".md", ".txt") and len(raw_bytes) <= MAX_TEXT_INJECT_BYTES:
try:
text_content = raw_bytes.decode("utf-8")
display_name = original_filename or f"document{ext}"
display_name = re.sub(r'[^\w.\- ]', '_', display_name)
injection = f"[Content of {display_name}]:\n{text_content}"
if text:
text = f"{injection}\n\n{text}"
else:
text = injection
except UnicodeDecodeError:
pass # Binary content, skip injection
except Exception as e:
print(f"[Slack] Failed to cache document: {e}", flush=True)
# Build source
source = self.build_source(
@@ -548,16 +427,3 @@ class SlackAdapter(BasePlatformAdapter):
else:
from gateway.platforms.base import cache_image_from_bytes
return cache_image_from_bytes(response.content, ext)
async def _download_slack_file_bytes(self, url: str) -> bytes:
"""Download a Slack file and return raw bytes."""
import httpx
bot_token = self.config.token
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(
url,
headers={"Authorization": f"Bearer {bot_token}"},
)
response.raise_for_status()
return response.content

View File

@@ -75,16 +75,11 @@ if _config_path.exists():
"container_memory": "TERMINAL_CONTAINER_MEMORY",
"container_disk": "TERMINAL_CONTAINER_DISK",
"container_persistent": "TERMINAL_CONTAINER_PERSISTENT",
"docker_volumes": "TERMINAL_DOCKER_VOLUMES",
"sandbox_dir": "TERMINAL_SANDBOX_DIR",
}
for _cfg_key, _env_var in _terminal_env_map.items():
if _cfg_key in _terminal_cfg:
_val = _terminal_cfg[_cfg_key]
if isinstance(_val, list):
os.environ[_env_var] = json.dumps(_val)
else:
os.environ[_env_var] = str(_val)
os.environ[_env_var] = str(_terminal_cfg[_cfg_key])
_compression_cfg = _cfg.get("compression", {})
if _compression_cfg and isinstance(_compression_cfg, dict):
_compression_env_map = {

View File

@@ -77,10 +77,6 @@ DEFAULT_CONFIG = {
"container_memory": 5120, # MB (default 5GB)
"container_disk": 51200, # MB (default 50GB)
"container_persistent": True, # Persist filesystem across sessions
# Docker volume mounts — share host directories with the container.
# Each entry is "host_path:container_path" (standard Docker -v syntax).
# Example: ["/home/user/projects:/workspace/projects", "/data:/data"]
"docker_volumes": [],
},
"browser": {
@@ -405,18 +401,14 @@ OPTIONAL_ENV_VARS = {
"category": "messaging",
},
"SLACK_BOT_TOKEN": {
"description": "Slack bot token (xoxb-). Get from OAuth & Permissions after installing your app. "
"Required scopes: chat:write, app_mentions:read, channels:history, groups:history, "
"im:history, im:read, im:write, users:read, files:write",
"description": "Slack bot integration",
"prompt": "Slack Bot Token (xoxb-...)",
"url": "https://api.slack.com/apps",
"password": True,
"category": "messaging",
},
"SLACK_APP_TOKEN": {
"description": "Slack app-level token (xapp-) for Socket Mode. Get from Basic Information → "
"App-Level Tokens. Also ensure Event Subscriptions include: message.im, "
"message.channels, message.groups, app_mention",
"description": "Slack Socket Mode connection",
"prompt": "Slack App Token (xapp-...)",
"url": "https://api.slack.com/apps",
"password": True,

View File

@@ -482,19 +482,14 @@ _PLATFORMS = [
"token_var": "SLACK_BOT_TOKEN",
"setup_instructions": [
"1. Go to https://api.slack.com/apps → Create New App → From Scratch",
"2. Enable Socket Mode: Settings → Socket Mode → Enable",
" Create an App-Level Token with scope: connections:write → copy xapp-... token",
"3. Add Bot Token Scopes: Features → OAuth & Permissions → Scopes",
" Required: chat:write, app_mentions:read, channels:history, channels:read,",
" groups:history, im:history, im:read, im:write, users:read, files:write",
"4. Subscribe to Events: Features → Event Subscriptions → Enable",
" Required events: message.im, message.channels, app_mention",
" Optional: message.groups (for private channels)",
" ⚠ Without message.channels the bot will ONLY work in DMs!",
"5. Install to Workspace: Settings → Install App → copy xoxb-... token",
"6. Reinstall the app after any scope or event changes",
"2. Enable Socket Mode: App Settings → Socket Mode → Enable",
"3. Get Bot Token: OAuth & Permissions → Install to Workspace → copy xoxb-... token",
"4. Get App Token: Basic Information → App-Level Tokens → Generate",
" Name it anything, add scope: connections:write → copy xapp-... token",
"5. Add bot scopes: OAuth & Permissions → Scopes → chat:write, im:history,",
" im:read, im:write, channels:history, channels:read",
"6. Reinstall the app to your workspace after adding scopes",
"7. Find your user ID: click your profile → three dots → Copy member ID",
"8. Invite the bot to channels: /invite @YourBot",
],
"vars": [
{"name": "SLACK_BOT_TOKEN", "prompt": "Bot Token (xoxb-...)", "password": True,

View File

@@ -1572,22 +1572,10 @@ def setup_gateway(config: dict):
if not existing_slack and prompt_yes_no("Set up Slack bot?", False):
print_info("Steps to create a Slack app:")
print_info(" 1. Go to https://api.slack.com/apps → Create New App (from scratch)")
print_info(" 2. Enable Socket Mode: Settings → Socket Mode → Enable")
print_info(" • Create an App-Level Token with 'connections:write' scope")
print_info(" 3. Add Bot Token Scopes: Features → OAuth & Permissions")
print_info(" Required scopes: chat:write, app_mentions:read,")
print_info(" channels:history, channels:read, groups:history,")
print_info(" im:history, im:read, im:write, users:read, files:write")
print_info(" 4. Subscribe to Events: Features → Event Subscriptions → Enable")
print_info(" Required events: message.im, message.channels,")
print_info(" message.groups, app_mention")
print_warning(" ⚠ Without message.channels/message.groups events,")
print_warning(" the bot will ONLY work in DMs, not channels!")
print_info(" 5. Install to Workspace: Settings → Install App")
print_info(" 6. After installing, invite the bot to channels: /invite @YourBot")
print()
print_info(" Full guide: https://hermes-agent.ai/docs/user-guide/messaging/slack")
print_info(" 1. Go to https://api.slack.com/apps → Create New App")
print_info(" 2. Enable Socket Mode: App Settings → Socket Mode → Enable")
print_info(" 3. Bot Token: OAuth & Permissions → Install to Workspace")
print_info(" 4. App Token: Basic Information → App-Level Tokens → Generate")
print()
bot_token = prompt("Slack Bot Token (xoxb-...)", password=True)
if bot_token:
@@ -1599,7 +1587,7 @@ def setup_gateway(config: dict):
print()
print_info("🔒 Security: Restrict who can use your bot")
print_info(" To find a Member ID: click a user's name → View full profile → ⋮ → Copy member ID")
print_info(" Find Slack user IDs in your profile or via the Slack API")
print()
allowed_users = prompt("Allowed user IDs (comma-separated, leave empty for open access)")
if allowed_users:

View File

@@ -40,7 +40,7 @@ dependencies = [
[project.optional-dependencies]
modal = ["swe-rex[modal]>=1.4.0"]
daytona = ["daytona>=0.148.0"]
dev = ["pytest", "pytest-asyncio", "mcp>=1.2.0"]
dev = ["pytest", "pytest-asyncio"]
messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"]
cron = ["croniter"]
slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"]

View File

@@ -0,0 +1,161 @@
---
name: pokemon-player
description: Play Pokémon games autonomously via headless emulation. Starts a game server, reads structured game state from RAM, makes strategic decisions, and sends button inputs — all from the terminal.
tags: [gaming, pokemon, emulator, pyboy, gameplay, gameboy]
---
# Pokémon Player
Play Pokémon games via headless emulation using the `pokemon-agent` package.
## When to Use
- User says "play pokemon", "start pokemon", "pokemon game"
- User asks about Pokemon Red, Blue, Yellow, FireRed, etc.
- User wants to watch an AI play Pokemon
- User references a ROM file (.gb, .gbc, .gba)
## First-Time Setup
### 1. Install the package
```bash
pip install pokemon-agent[dashboard] pyboy
```
### 2. Get the ROM
Ask the user for their ROM file path. Do NOT attempt to download ROMs.
### 3. Start the game server
```bash
pokemon-agent serve --rom <ROM_PATH> --port 8765 &
```
Wait 3 seconds, then verify:
```bash
curl -s http://localhost:8765/health
```
## The Gameplay Loop
### Step 1: OBSERVE
```bash
curl -s http://localhost:8765/state
```
### Step 2: ORIENT
- Dialog active → advance text
- In battle → fight
- Party hurt → heal
- Near objective → navigate
### Step 3: DECIDE
Priority order:
1. If dialog active → a_until_dialog_end
2. If in battle → choose best move
3. If any Pokemon <20% HP → Pokémon Center
4. If near story objective → navigate to it
5. If underleveled → train in grass
6. Otherwise → explore
### Step 4: ACT
```bash
curl -s -X POST http://localhost:8765/action \
-H "Content-Type: application/json" \
-d '{"actions": ["walk_up", "walk_up", "press_a"]}'
```
Action reference:
- press_a — confirm, talk, select
- press_b — cancel, close menu
- press_start — open game menu
- walk_up/down/left/right — move one tile
- a_until_dialog_end — advance all dialog
- wait_60 — wait ~1 second
### Step 5: VERIFY
Check state_after in the response. If stuck 3+ turns:
1. Press B several times
2. Try different directions
3. Take screenshot and use vision_analyze
4. Load last save if truly stuck
### Step 6: RECORD
```
memory add: PKM:OBJECTIVE: Heading to Pewter City to challenge Brock
memory add: PKM:PROGRESS: Got Squirtle, Got Pokedex, → Pewter City
```
### Step 7: SAVE
Save every 20-30 turns and ALWAYS before gym battles:
```bash
curl -s -X POST http://localhost:8765/save \
-H "Content-Type: application/json" \
-d '{"name": "before_brock"}'
```
## Battle Strategy
### Decision Tree
1. Want to catch? → Weaken then throw Poké Ball
2. Wild you don't need? → RUN
3. Type advantage? → Use super-effective move
4. No advantage? → Use strongest STAB move
5. Low HP? → Switch or use Potion
### Type Chart
- Water beats Fire, Ground, Rock
- Fire beats Grass, Bug, Ice
- Grass beats Water, Ground, Rock
- Electric beats Water, Flying
- Ground beats Fire, Electric, Rock, Poison
- Psychic beats Fighting, Poison (dominant in Gen 1!)
### Gen 1 Quirks
- Special stat is both offense AND defense for special moves
- Psychic is overpowered (Ghost moves bugged)
- Critical hits based on Speed stat
- Wrap/Bind prevent opponent from acting
## Memory Conventions
| Prefix | Purpose | Example |
|--------|---------|---------|
| PKM:OBJECTIVE | Current goal | Defeat Brock in Pewter City |
| PKM:MAP | Navigation knowledge | Viridian Forest: go north |
| PKM:STRATEGY | Battle/team plans | Need Grass type before Misty |
| PKM:PROGRESS | Milestone tracker | ✓ Boulder Badge → Cascade Badge |
| PKM:STUCK | Stuck situations | Got stuck in Cerulean Cave |
| PKM:TEAM | Team notes | Squirtle is Water/Ice coverage |
## Progression Milestones
- ☐ Choose starter
- ☐ Deliver Oak's Parcel → receive Pokédex
- ☐ Boulder Badge — Brock (Rock) → use Water/Grass
- ☐ Cascade Badge — Misty (Water) → use Grass/Electric
- ☐ Thunder Badge — Lt. Surge (Electric) → use Ground
- ☐ Rainbow Badge — Erika (Grass) → use Fire/Ice/Flying
- ☐ Soul Badge — Koga (Poison) → use Ground/Psychic
- ☐ Marsh Badge — Sabrina (Psychic)
- ☐ Volcano Badge — Blaine (Fire) → use Water/Ground
- ☐ Earth Badge — Giovanni (Ground) → use Water/Grass/Ice
- ☐ Elite Four → Champion!
## Stopping Play
1. Save the game:
```bash
curl -s -X POST http://localhost:8765/save \
-d '{"name": "session_end"}'
```
2. Update memory with progress
3. Tell user: "Game saved! Say 'play pokemon' to resume."
4. Kill the background server process
## Dashboard
If `pokemon-agent[dashboard]` is installed, open:
http://localhost:8765/dashboard
Live features: game screen, AI reasoning stream, team status, action log.
## Pitfalls
- NEVER download or provide ROM files — always ask the user
- Don't send more than 15 actions per /action call
- Always wait for dialog to clear before moving
- Save BEFORE gym battles
- Take screenshots sparingly — they cost vision tokens
- Verify server is running with /health before any commands

View File

@@ -176,18 +176,14 @@ class TestVisionClientFallback:
assert isinstance(client, CodexAuxiliaryClient)
assert model == "gpt-5.3-codex"
def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch):
"""Custom endpoint is used as fallback in vision auto mode.
Many local models (Qwen-VL, LLaVA, etc.) support vision.
When no OpenRouter/Nous/Codex is available, try the custom endpoint.
"""
def test_vision_auto_skips_custom_endpoint(self, monkeypatch):
"""Custom endpoint is skipped in vision auto mode."""
monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:1234/v1")
monkeypatch.setenv("OPENAI_API_KEY", "local-key")
with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
patch("agent.auxiliary_client.OpenAI") as mock_openai:
with patch("agent.auxiliary_client._read_nous_auth", return_value=None):
client, model = get_vision_auxiliary_client()
assert client is not None # Custom endpoint picked up as fallback
assert client is None
assert model is None
def test_vision_uses_openrouter_when_available(self, monkeypatch):
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

View File

@@ -1,532 +0,0 @@
"""
Tests for Slack platform adapter.
Covers: app_mention handler, send_document, send_video,
incoming document handling, message routing.
Note: slack-bolt may not be installed in the test environment.
We mock the slack modules at import time to avoid collection errors.
"""
import asyncio
import os
import sys
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
MessageEvent,
MessageType,
SendResult,
SUPPORTED_DOCUMENT_TYPES,
)
# ---------------------------------------------------------------------------
# Mock the slack-bolt package if it's not installed
# ---------------------------------------------------------------------------
def _ensure_slack_mock():
"""Install mock slack modules so SlackAdapter can be imported."""
if "slack_bolt" in sys.modules and hasattr(sys.modules["slack_bolt"], "__file__"):
return # Real library installed
slack_bolt = MagicMock()
slack_bolt.async_app.AsyncApp = MagicMock
slack_bolt.adapter.socket_mode.async_handler.AsyncSocketModeHandler = MagicMock
slack_sdk = MagicMock()
slack_sdk.web.async_client.AsyncWebClient = MagicMock
for name, mod in [
("slack_bolt", slack_bolt),
("slack_bolt.async_app", slack_bolt.async_app),
("slack_bolt.adapter", slack_bolt.adapter),
("slack_bolt.adapter.socket_mode", slack_bolt.adapter.socket_mode),
("slack_bolt.adapter.socket_mode.async_handler", slack_bolt.adapter.socket_mode.async_handler),
("slack_sdk", slack_sdk),
("slack_sdk.web", slack_sdk.web),
("slack_sdk.web.async_client", slack_sdk.web.async_client),
]:
sys.modules.setdefault(name, mod)
_ensure_slack_mock()
# Patch SLACK_AVAILABLE before importing the adapter
import gateway.platforms.slack as _slack_mod
_slack_mod.SLACK_AVAILABLE = True
from gateway.platforms.slack import SlackAdapter # noqa: E402
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture()
def adapter():
config = PlatformConfig(enabled=True, token="xoxb-fake-token")
a = SlackAdapter(config)
# Mock the Slack app client
a._app = MagicMock()
a._app.client = AsyncMock()
a._bot_user_id = "U_BOT"
a._running = True
# Capture events instead of processing them
a.handle_message = AsyncMock()
return a
@pytest.fixture(autouse=True)
def _redirect_cache(tmp_path, monkeypatch):
"""Point document cache to tmp_path so tests don't touch ~/.hermes."""
monkeypatch.setattr(
"gateway.platforms.base.DOCUMENT_CACHE_DIR", tmp_path / "doc_cache"
)
# ---------------------------------------------------------------------------
# TestAppMentionHandler
# ---------------------------------------------------------------------------
class TestAppMentionHandler:
"""Verify that the app_mention event handler is registered."""
def test_app_mention_registered_on_connect(self):
"""connect() should register both 'message' and 'app_mention' handlers."""
config = PlatformConfig(enabled=True, token="xoxb-fake")
adapter = SlackAdapter(config)
# Track which events get registered
registered_events = []
registered_commands = []
mock_app = MagicMock()
def mock_event(event_type):
def decorator(fn):
registered_events.append(event_type)
return fn
return decorator
def mock_command(cmd):
def decorator(fn):
registered_commands.append(cmd)
return fn
return decorator
mock_app.event = mock_event
mock_app.command = mock_command
mock_app.client = AsyncMock()
mock_app.client.auth_test = AsyncMock(return_value={
"user_id": "U_BOT",
"user": "testbot",
})
with patch.object(_slack_mod, "AsyncApp", return_value=mock_app), \
patch.object(_slack_mod, "AsyncSocketModeHandler", return_value=MagicMock()), \
patch.dict(os.environ, {"SLACK_APP_TOKEN": "xapp-fake"}), \
patch("asyncio.create_task"):
asyncio.get_event_loop().run_until_complete(adapter.connect())
assert "message" in registered_events
assert "app_mention" in registered_events
assert "/hermes" in registered_commands
# ---------------------------------------------------------------------------
# TestSendDocument
# ---------------------------------------------------------------------------
class TestSendDocument:
@pytest.mark.asyncio
async def test_send_document_success(self, adapter, tmp_path):
test_file = tmp_path / "report.pdf"
test_file.write_bytes(b"%PDF-1.4 fake content")
adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True})
result = await adapter.send_document(
chat_id="C123",
file_path=str(test_file),
caption="Here's the report",
)
assert result.success
adapter._app.client.files_upload_v2.assert_called_once()
call_kwargs = adapter._app.client.files_upload_v2.call_args[1]
assert call_kwargs["channel"] == "C123"
assert call_kwargs["file"] == str(test_file)
assert call_kwargs["filename"] == "report.pdf"
assert call_kwargs["initial_comment"] == "Here's the report"
@pytest.mark.asyncio
async def test_send_document_custom_name(self, adapter, tmp_path):
test_file = tmp_path / "data.csv"
test_file.write_bytes(b"a,b,c\n1,2,3")
adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True})
result = await adapter.send_document(
chat_id="C123",
file_path=str(test_file),
file_name="quarterly-report.csv",
)
assert result.success
call_kwargs = adapter._app.client.files_upload_v2.call_args[1]
assert call_kwargs["filename"] == "quarterly-report.csv"
@pytest.mark.asyncio
async def test_send_document_missing_file(self, adapter):
result = await adapter.send_document(
chat_id="C123",
file_path="/nonexistent/file.pdf",
)
assert not result.success
assert "not found" in result.error.lower()
@pytest.mark.asyncio
async def test_send_document_not_connected(self, adapter):
adapter._app = None
result = await adapter.send_document(
chat_id="C123",
file_path="/some/file.pdf",
)
assert not result.success
assert "Not connected" in result.error
@pytest.mark.asyncio
async def test_send_document_api_error_falls_back(self, adapter, tmp_path):
test_file = tmp_path / "doc.pdf"
test_file.write_bytes(b"content")
adapter._app.client.files_upload_v2 = AsyncMock(
side_effect=RuntimeError("Slack API error")
)
# Should fall back to base class (text message)
result = await adapter.send_document(
chat_id="C123",
file_path=str(test_file),
)
# Base class send() is also mocked, so check it was attempted
adapter._app.client.chat_postMessage.assert_called_once()
@pytest.mark.asyncio
async def test_send_document_with_thread(self, adapter, tmp_path):
test_file = tmp_path / "notes.txt"
test_file.write_bytes(b"some notes")
adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True})
result = await adapter.send_document(
chat_id="C123",
file_path=str(test_file),
reply_to="1234567890.123456",
)
assert result.success
call_kwargs = adapter._app.client.files_upload_v2.call_args[1]
assert call_kwargs["thread_ts"] == "1234567890.123456"
# ---------------------------------------------------------------------------
# TestSendVideo
# ---------------------------------------------------------------------------
class TestSendVideo:
@pytest.mark.asyncio
async def test_send_video_success(self, adapter, tmp_path):
video = tmp_path / "clip.mp4"
video.write_bytes(b"fake video data")
adapter._app.client.files_upload_v2 = AsyncMock(return_value={"ok": True})
result = await adapter.send_video(
chat_id="C123",
video_path=str(video),
caption="Check this out",
)
assert result.success
call_kwargs = adapter._app.client.files_upload_v2.call_args[1]
assert call_kwargs["filename"] == "clip.mp4"
assert call_kwargs["initial_comment"] == "Check this out"
@pytest.mark.asyncio
async def test_send_video_missing_file(self, adapter):
result = await adapter.send_video(
chat_id="C123",
video_path="/nonexistent/video.mp4",
)
assert not result.success
assert "not found" in result.error.lower()
@pytest.mark.asyncio
async def test_send_video_not_connected(self, adapter):
adapter._app = None
result = await adapter.send_video(
chat_id="C123",
video_path="/some/video.mp4",
)
assert not result.success
assert "Not connected" in result.error
@pytest.mark.asyncio
async def test_send_video_api_error_falls_back(self, adapter, tmp_path):
video = tmp_path / "clip.mp4"
video.write_bytes(b"fake video")
adapter._app.client.files_upload_v2 = AsyncMock(
side_effect=RuntimeError("Slack API error")
)
# Should fall back to base class (text message)
result = await adapter.send_video(
chat_id="C123",
video_path=str(video),
)
adapter._app.client.chat_postMessage.assert_called_once()
# ---------------------------------------------------------------------------
# TestIncomingDocumentHandling
# ---------------------------------------------------------------------------
class TestIncomingDocumentHandling:
def _make_event(self, files=None, text="hello", channel_type="im"):
"""Build a mock Slack message event with file attachments."""
return {
"text": text,
"user": "U_USER",
"channel": "C123",
"channel_type": channel_type,
"ts": "1234567890.000001",
"files": files or [],
}
@pytest.mark.asyncio
async def test_pdf_document_cached(self, adapter):
"""A PDF attachment should be downloaded, cached, and set as DOCUMENT type."""
pdf_bytes = b"%PDF-1.4 fake content"
with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl:
dl.return_value = pdf_bytes
event = self._make_event(files=[{
"mimetype": "application/pdf",
"name": "report.pdf",
"url_private_download": "https://files.slack.com/report.pdf",
"size": len(pdf_bytes),
}])
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert msg_event.message_type == MessageType.DOCUMENT
assert len(msg_event.media_urls) == 1
assert os.path.exists(msg_event.media_urls[0])
assert msg_event.media_types == ["application/pdf"]
@pytest.mark.asyncio
async def test_txt_document_injects_content(self, adapter):
"""A .txt file under 100KB should have its content injected into event text."""
content = b"Hello from a text file"
with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl:
dl.return_value = content
event = self._make_event(
text="summarize this",
files=[{
"mimetype": "text/plain",
"name": "notes.txt",
"url_private_download": "https://files.slack.com/notes.txt",
"size": len(content),
}],
)
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert "Hello from a text file" in msg_event.text
assert "[Content of notes.txt]" in msg_event.text
assert "summarize this" in msg_event.text
@pytest.mark.asyncio
async def test_md_document_injects_content(self, adapter):
"""A .md file under 100KB should have its content injected."""
content = b"# Title\nSome markdown content"
with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl:
dl.return_value = content
event = self._make_event(files=[{
"mimetype": "text/markdown",
"name": "readme.md",
"url_private_download": "https://files.slack.com/readme.md",
"size": len(content),
}], text="")
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert "# Title" in msg_event.text
@pytest.mark.asyncio
async def test_large_txt_not_injected(self, adapter):
"""A .txt file over 100KB should be cached but NOT injected."""
content = b"x" * (200 * 1024)
with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl:
dl.return_value = content
event = self._make_event(files=[{
"mimetype": "text/plain",
"name": "big.txt",
"url_private_download": "https://files.slack.com/big.txt",
"size": len(content),
}], text="")
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert len(msg_event.media_urls) == 1
assert "[Content of" not in (msg_event.text or "")
@pytest.mark.asyncio
async def test_unsupported_file_type_skipped(self, adapter):
"""A .zip file should be silently skipped."""
event = self._make_event(files=[{
"mimetype": "application/zip",
"name": "archive.zip",
"url_private_download": "https://files.slack.com/archive.zip",
"size": 1024,
}])
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert msg_event.message_type == MessageType.TEXT
assert len(msg_event.media_urls) == 0
@pytest.mark.asyncio
async def test_oversized_document_skipped(self, adapter):
"""A document over 20MB should be skipped."""
event = self._make_event(files=[{
"mimetype": "application/pdf",
"name": "huge.pdf",
"url_private_download": "https://files.slack.com/huge.pdf",
"size": 25 * 1024 * 1024,
}])
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert len(msg_event.media_urls) == 0
@pytest.mark.asyncio
async def test_document_download_error_handled(self, adapter):
"""If document download fails, handler should not crash."""
with patch.object(adapter, "_download_slack_file_bytes", new_callable=AsyncMock) as dl:
dl.side_effect = RuntimeError("download failed")
event = self._make_event(files=[{
"mimetype": "application/pdf",
"name": "report.pdf",
"url_private_download": "https://files.slack.com/report.pdf",
"size": 1024,
}])
await adapter._handle_slack_message(event)
# Handler should still be called (the exception is caught)
adapter.handle_message.assert_called_once()
@pytest.mark.asyncio
async def test_image_still_handled(self, adapter):
"""Image attachments should still go through the image path, not document."""
with patch.object(adapter, "_download_slack_file", new_callable=AsyncMock) as dl:
dl.return_value = "/tmp/cached_image.jpg"
event = self._make_event(files=[{
"mimetype": "image/jpeg",
"name": "photo.jpg",
"url_private_download": "https://files.slack.com/photo.jpg",
"size": 1024,
}])
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert msg_event.message_type == MessageType.PHOTO
# ---------------------------------------------------------------------------
# TestMessageRouting
# ---------------------------------------------------------------------------
class TestMessageRouting:
@pytest.mark.asyncio
async def test_dm_processed_without_mention(self, adapter):
"""DM messages should be processed without requiring a bot mention."""
event = {
"text": "hello",
"user": "U_USER",
"channel": "D123",
"channel_type": "im",
"ts": "1234567890.000001",
}
await adapter._handle_slack_message(event)
adapter.handle_message.assert_called_once()
@pytest.mark.asyncio
async def test_channel_message_requires_mention(self, adapter):
"""Channel messages without a bot mention should be ignored."""
event = {
"text": "just talking",
"user": "U_USER",
"channel": "C123",
"channel_type": "channel",
"ts": "1234567890.000001",
}
await adapter._handle_slack_message(event)
adapter.handle_message.assert_not_called()
@pytest.mark.asyncio
async def test_channel_mention_strips_bot_id(self, adapter):
"""When mentioned in a channel, the bot mention should be stripped."""
event = {
"text": "<@U_BOT> what's the weather?",
"user": "U_USER",
"channel": "C123",
"channel_type": "channel",
"ts": "1234567890.000001",
}
await adapter._handle_slack_message(event)
msg_event = adapter.handle_message.call_args[0][0]
assert msg_event.text == "what's the weather?"
assert "<@U_BOT>" not in msg_event.text
@pytest.mark.asyncio
async def test_bot_messages_ignored(self, adapter):
"""Messages from bots should be ignored."""
event = {
"text": "bot response",
"bot_id": "B_OTHER",
"channel": "C123",
"channel_type": "im",
"ts": "1234567890.000001",
}
await adapter._handle_slack_message(event)
adapter.handle_message.assert_not_called()
@pytest.mark.asyncio
async def test_message_edits_ignored(self, adapter):
"""Message edits should be ignored."""
event = {
"text": "edited message",
"user": "U_USER",
"channel": "C123",
"channel_type": "im",
"ts": "1234567890.000001",
"subtype": "message_changed",
}
await adapter._handle_slack_message(event)
adapter.handle_message.assert_not_called()

View File

@@ -505,25 +505,6 @@ class TestExpandPath:
assert result == str(Path.home())
_assert_clean(result)
def test_tilde_injection_blocked(self, ops):
"""Paths like ~; rm -rf / must NOT execute shell commands."""
malicious = "~; echo PWNED > /tmp/_hermes_injection_test"
result = ops._expand_path(malicious)
# The invalid username (contains ";") should prevent shell expansion.
# The path should be returned as-is (no expansion).
assert result == malicious
# Verify the injected command did NOT execute
import os
assert not os.path.exists("/tmp/_hermes_injection_test")
def test_tilde_username_with_subpath(self, ops):
"""~root/file.txt should attempt expansion (valid username)."""
result = ops._expand_path("~root/file.txt")
# On most systems ~root expands to /root
if result != "~root/file.txt":
assert result.endswith("/file.txt")
assert "~" not in result
# ── Terminal output cleanliness ──────────────────────────────────────────

View File

@@ -1,351 +0,0 @@
"""Tests for tools/vision_tools.py — URL validation, type hints, error logging."""
import asyncio
import json
import logging
import os
from pathlib import Path
from typing import Awaitable
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from tools.vision_tools import (
_validate_image_url,
_handle_vision_analyze,
_determine_mime_type,
_image_to_base64_data_url,
vision_analyze_tool,
check_vision_requirements,
get_debug_session_info,
)
# ---------------------------------------------------------------------------
# _validate_image_url — urlparse-based validation
# ---------------------------------------------------------------------------
class TestValidateImageUrl:
"""Tests for URL validation, including urlparse-based netloc check."""
def test_valid_https_url(self):
assert _validate_image_url("https://example.com/image.jpg") is True
def test_valid_http_url(self):
assert _validate_image_url("http://cdn.example.org/photo.png") is True
def test_valid_url_without_extension(self):
"""CDN endpoints that redirect to images should still pass."""
assert _validate_image_url("https://cdn.example.com/abcdef123") is True
def test_valid_url_with_query_params(self):
assert _validate_image_url("https://img.example.com/pic?w=200&h=200") is True
def test_valid_url_with_port(self):
assert _validate_image_url("http://localhost:8080/image.png") is True
def test_valid_url_with_path_only(self):
assert _validate_image_url("https://example.com/") is True
def test_rejects_empty_string(self):
assert _validate_image_url("") is False
def test_rejects_none(self):
assert _validate_image_url(None) is False
def test_rejects_non_string(self):
assert _validate_image_url(12345) is False
def test_rejects_ftp_scheme(self):
assert _validate_image_url("ftp://files.example.com/image.jpg") is False
def test_rejects_file_scheme(self):
assert _validate_image_url("file:///etc/passwd") is False
def test_rejects_no_scheme(self):
assert _validate_image_url("example.com/image.jpg") is False
def test_rejects_javascript_scheme(self):
assert _validate_image_url("javascript:alert(1)") is False
def test_rejects_http_without_netloc(self):
"""http:// alone has no network location — urlparse catches this."""
assert _validate_image_url("http://") is False
def test_rejects_https_without_netloc(self):
assert _validate_image_url("https://") is False
def test_rejects_http_colon_only(self):
assert _validate_image_url("http:") is False
def test_rejects_data_url(self):
assert _validate_image_url("data:image/png;base64,iVBOR") is False
def test_rejects_whitespace_only(self):
assert _validate_image_url(" ") is False
def test_rejects_boolean(self):
assert _validate_image_url(True) is False
def test_rejects_list(self):
assert _validate_image_url(["https://example.com"]) is False
# ---------------------------------------------------------------------------
# _determine_mime_type
# ---------------------------------------------------------------------------
class TestDetermineMimeType:
def test_jpg(self):
assert _determine_mime_type(Path("photo.jpg")) == "image/jpeg"
def test_jpeg(self):
assert _determine_mime_type(Path("photo.jpeg")) == "image/jpeg"
def test_png(self):
assert _determine_mime_type(Path("screenshot.png")) == "image/png"
def test_gif(self):
assert _determine_mime_type(Path("anim.gif")) == "image/gif"
def test_webp(self):
assert _determine_mime_type(Path("modern.webp")) == "image/webp"
def test_unknown_extension_defaults_to_jpeg(self):
assert _determine_mime_type(Path("file.xyz")) == "image/jpeg"
# ---------------------------------------------------------------------------
# _image_to_base64_data_url
# ---------------------------------------------------------------------------
class TestImageToBase64DataUrl:
def test_returns_data_url(self, tmp_path):
img = tmp_path / "test.png"
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 8)
result = _image_to_base64_data_url(img)
assert result.startswith("data:image/png;base64,")
def test_custom_mime_type(self, tmp_path):
img = tmp_path / "test.bin"
img.write_bytes(b"\x00" * 16)
result = _image_to_base64_data_url(img, mime_type="image/webp")
assert result.startswith("data:image/webp;base64,")
def test_file_not_found_raises(self, tmp_path):
with pytest.raises(FileNotFoundError):
_image_to_base64_data_url(tmp_path / "nonexistent.png")
# ---------------------------------------------------------------------------
# _handle_vision_analyze — type signature & behavior
# ---------------------------------------------------------------------------
class TestHandleVisionAnalyze:
"""Verify _handle_vision_analyze returns an Awaitable and builds correct prompt."""
def test_returns_awaitable(self):
"""The handler must return an Awaitable (coroutine) since it's registered as async."""
with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool:
mock_tool.return_value = json.dumps({"result": "ok"})
result = _handle_vision_analyze(
{"image_url": "https://example.com/img.png", "question": "What is this?"}
)
# It should be an Awaitable (coroutine)
assert isinstance(result, Awaitable)
# Clean up the coroutine to avoid RuntimeWarning
result.close()
def test_prompt_contains_question(self):
"""The full prompt should incorporate the user's question."""
with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool:
mock_tool.return_value = json.dumps({"result": "ok"})
coro = _handle_vision_analyze(
{"image_url": "https://example.com/img.png", "question": "Describe the cat"}
)
# Clean up coroutine
coro.close()
call_args = mock_tool.call_args
full_prompt = call_args[0][1] # second positional arg
assert "Describe the cat" in full_prompt
assert "Fully describe and explain" in full_prompt
def test_uses_auxiliary_vision_model_env(self):
"""AUXILIARY_VISION_MODEL env var should override DEFAULT_VISION_MODEL."""
with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool, \
patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "custom/model-v1"}):
mock_tool.return_value = json.dumps({"result": "ok"})
coro = _handle_vision_analyze(
{"image_url": "https://example.com/img.png", "question": "test"}
)
coro.close()
call_args = mock_tool.call_args
model = call_args[0][2] # third positional arg
assert model == "custom/model-v1"
def test_falls_back_to_default_model(self):
"""Without AUXILIARY_VISION_MODEL, should use DEFAULT_VISION_MODEL or fallback."""
with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool, \
patch.dict(os.environ, {}, clear=False):
# Ensure AUXILIARY_VISION_MODEL is not set
os.environ.pop("AUXILIARY_VISION_MODEL", None)
mock_tool.return_value = json.dumps({"result": "ok"})
coro = _handle_vision_analyze(
{"image_url": "https://example.com/img.png", "question": "test"}
)
coro.close()
call_args = mock_tool.call_args
model = call_args[0][2]
# Should be DEFAULT_VISION_MODEL or the hardcoded fallback
assert model is not None
assert len(model) > 0
def test_empty_args_graceful(self):
"""Missing keys should default to empty strings, not raise."""
with patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool:
mock_tool.return_value = json.dumps({"result": "ok"})
result = _handle_vision_analyze({})
assert isinstance(result, Awaitable)
result.close()
# ---------------------------------------------------------------------------
# Error logging with exc_info — verify tracebacks are logged
# ---------------------------------------------------------------------------
class TestErrorLoggingExcInfo:
"""Verify that exc_info=True is used in error/warning log calls."""
@pytest.mark.asyncio
async def test_download_failure_logs_exc_info(self, tmp_path, caplog):
"""After max retries, the download error should include exc_info."""
from tools.vision_tools import _download_image
with patch("tools.vision_tools.httpx.AsyncClient") as mock_client_cls:
mock_client = AsyncMock()
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client.get = AsyncMock(side_effect=ConnectionError("network down"))
mock_client_cls.return_value = mock_client
dest = tmp_path / "image.jpg"
with caplog.at_level(logging.ERROR, logger="tools.vision_tools"), \
pytest.raises(ConnectionError):
await _download_image("https://example.com/img.jpg", dest, max_retries=1)
# Should have logged with exc_info (traceback present)
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert len(error_records) >= 1
assert error_records[0].exc_info is not None
@pytest.mark.asyncio
async def test_analysis_error_logs_exc_info(self, caplog):
"""When vision_analyze_tool encounters an error, it should log with exc_info."""
with patch("tools.vision_tools._validate_image_url", return_value=True), \
patch("tools.vision_tools._download_image", new_callable=AsyncMock,
side_effect=Exception("download boom")), \
caplog.at_level(logging.ERROR, logger="tools.vision_tools"):
result = await vision_analyze_tool(
"https://example.com/img.jpg", "describe this", "test/model"
)
result_data = json.loads(result)
# Error response uses "success": False, not an "error" key
assert result_data["success"] is False
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert any(r.exc_info is not None for r in error_records)
@pytest.mark.asyncio
async def test_cleanup_error_logs_exc_info(self, tmp_path, caplog):
"""Temp file cleanup failure should log warning with exc_info."""
# Create a real temp file that will be "downloaded"
temp_dir = tmp_path / "temp_vision_images"
temp_dir.mkdir()
async def fake_download(url, dest, max_retries=3):
"""Simulate download by writing file to the expected destination."""
dest.parent.mkdir(parents=True, exist_ok=True)
dest.write_bytes(b"\xff\xd8\xff" + b"\x00" * 16)
return dest
with patch("tools.vision_tools._validate_image_url", return_value=True), \
patch("tools.vision_tools._download_image", side_effect=fake_download), \
patch("tools.vision_tools._image_to_base64_data_url",
return_value="data:image/jpeg;base64,abc"), \
patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None), \
patch("agent.auxiliary_client.auxiliary_max_tokens_param", return_value={"max_tokens": 2000}), \
caplog.at_level(logging.WARNING, logger="tools.vision_tools"):
# Mock the vision client
mock_client = AsyncMock()
mock_response = MagicMock()
mock_choice = MagicMock()
mock_choice.message.content = "A test image description"
mock_response.choices = [mock_choice]
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
# Patch module-level _aux_async_client so the tool doesn't bail early
with patch("tools.vision_tools._aux_async_client", mock_client), \
patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"):
# Make unlink fail to trigger cleanup warning
original_unlink = Path.unlink
def failing_unlink(self, *args, **kwargs):
raise PermissionError("no permission")
with patch.object(Path, "unlink", failing_unlink):
result = await vision_analyze_tool(
"https://example.com/tempimg.jpg", "describe", "test/model"
)
warning_records = [r for r in caplog.records if r.levelno == logging.WARNING
and "temporary file" in r.getMessage().lower()]
assert len(warning_records) >= 1
assert warning_records[0].exc_info is not None
# ---------------------------------------------------------------------------
# check_vision_requirements & get_debug_session_info
# ---------------------------------------------------------------------------
class TestVisionRequirements:
def test_check_requirements_returns_bool(self):
result = check_vision_requirements()
assert isinstance(result, bool)
def test_debug_session_info_returns_dict(self):
info = get_debug_session_info()
assert isinstance(info, dict)
# DebugSession.get_session_info() returns these keys
assert "enabled" in info
assert "session_id" in info
assert "total_calls" in info
# ---------------------------------------------------------------------------
# Integration: registry entry
# ---------------------------------------------------------------------------
class TestVisionRegistration:
def test_vision_analyze_registered(self):
from tools.registry import registry
entry = registry._tools.get("vision_analyze")
assert entry is not None
assert entry.toolset == "vision"
assert entry.is_async is True
def test_schema_has_required_fields(self):
from tools.registry import registry
entry = registry._tools.get("vision_analyze")
schema = entry.schema
assert schema["name"] == "vision_analyze"
params = schema.get("parameters", {})
props = params.get("properties", {})
assert "image_url" in props
assert "question" in props
def test_handler_is_callable(self):
from tools.registry import registry
entry = registry._tools.get("vision_analyze")
assert callable(entry.handler)

View File

@@ -400,16 +400,10 @@ class ShellFileOperations(FileOperations):
return home
elif path.startswith('~/'):
return home + path[1:] # Replace ~ with home
# ~username format - extract and validate username before
# letting shell expand it (prevent shell injection via
# paths like "~; rm -rf /").
rest = path[1:] # strip leading ~
slash_idx = rest.find('/')
username = rest[:slash_idx] if slash_idx >= 0 else rest
if username and re.fullmatch(r'[a-zA-Z0-9._-]+', username):
expand_result = self._exec(f"echo {path}")
if expand_result.exit_code == 0 and expand_result.stdout.strip():
return expand_result.stdout.strip()
# ~username format - let shell expand it
expand_result = self._exec(f"echo {path}")
if expand_result.exit_code == 0:
return expand_result.stdout.strip()
return path

View File

@@ -27,15 +27,14 @@ Usage:
)
"""
import asyncio
import base64
import json
import logging
import os
import asyncio
import uuid
import base64
from pathlib import Path
from typing import Any, Awaitable, Dict, Optional
from urllib.parse import urlparse
from typing import Dict, Any, Optional
import httpx
from openai import AsyncOpenAI
from agent.auxiliary_client import get_vision_auxiliary_client
@@ -74,18 +73,15 @@ def _validate_image_url(url: str) -> bool:
"""
if not url or not isinstance(url, str):
return False
# Basic HTTP/HTTPS URL check
if not (url.startswith("http://") or url.startswith("https://")):
# Check if it's a valid URL format
if not (url.startswith('http://') or url.startswith('https://')):
return False
# Parse to ensure we at least have a network location; still allow URLs
# without file extensions (e.g. CDN endpoints that redirect to images).
parsed = urlparse(url)
if not parsed.netloc:
return False
return True # Allow all well-formed HTTP/HTTPS URLs for flexibility
# Check for common image extensions (optional, as URLs may not have extensions)
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg']
return True # Allow all HTTP/HTTPS URLs for flexibility
async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path:
@@ -135,12 +131,7 @@ async def _download_image(image_url: str, destination: Path, max_retries: int =
logger.warning("Retrying in %ss...", wait_time)
await asyncio.sleep(wait_time)
else:
logger.error(
"Image download failed after %s attempts: %s",
max_retries,
str(e)[:100],
exc_info=True,
)
logger.error("Image download failed after %s attempts: %s", max_retries, str(e)[:100])
raise last_error
@@ -197,7 +188,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
async def vision_analyze_tool(
image_url: str,
user_prompt: str,
model: str = DEFAULT_VISION_MODEL,
model: str = DEFAULT_VISION_MODEL
) -> str:
"""
Analyze an image from a URL or local file path using vision AI.
@@ -356,7 +347,7 @@ async def vision_analyze_tool(
except Exception as e:
error_msg = f"Error analyzing image: {str(e)}"
logger.error("%s", error_msg, exc_info=True)
logger.error("%s", error_msg)
# Prepare error response
result = {
@@ -377,9 +368,7 @@ async def vision_analyze_tool(
temp_image_path.unlink()
logger.debug("Cleaned up temporary image file")
except Exception as cleanup_error:
logger.warning(
"Could not delete temporary file: %s", cleanup_error, exc_info=True
)
logger.warning("Could not delete temporary file: %s", cleanup_error)
def check_vision_requirements() -> bool:
@@ -475,13 +464,10 @@ VISION_ANALYZE_SCHEMA = {
}
def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
def _handle_vision_analyze(args, **kw):
image_url = args.get("image_url", "")
question = args.get("question", "")
full_prompt = (
"Fully describe and explain everything about this image, then answer the "
f"following question:\n\n{question}"
)
full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
or DEFAULT_VISION_MODEL
or "google/gemini-3-flash-preview")

View File

@@ -393,40 +393,8 @@ terminal:
backend: local # or: docker, ssh, singularity, modal, daytona
cwd: "." # Working directory ("." = current dir)
timeout: 180 # Command timeout in seconds
# Docker-specific settings
docker_image: "nikolaik/python-nodejs:python3.11-nodejs20"
docker_volumes: # Share host directories with the container
- "/home/user/projects:/workspace/projects"
- "/home/user/data:/data:ro" # :ro for read-only
# Container resource limits (docker, singularity, modal, daytona)
container_cpu: 1 # CPU cores
container_memory: 5120 # MB (default 5GB)
container_disk: 51200 # MB (default 50GB)
container_persistent: true # Persist filesystem across sessions
```
### Docker Volume Mounts
When using the Docker backend, `docker_volumes` lets you share host directories with the container. Each entry uses standard Docker `-v` syntax: `host_path:container_path[:options]`.
```yaml
terminal:
backend: docker
docker_volumes:
- "/home/user/projects:/workspace/projects" # Read-write (default)
- "/home/user/datasets:/data:ro" # Read-only
- "/home/user/outputs:/outputs" # Agent writes, you read
```
This is useful for:
- **Providing files** to the agent (datasets, configs, reference code)
- **Receiving files** from the agent (generated code, reports, exports)
- **Shared workspaces** where both you and the agent access the same files
Can also be set via environment variable: `TERMINAL_DOCKER_VOLUMES='["/host:/container"]'` (JSON array).
See [Code Execution](features/code-execution.md) and the [Terminal section of the README](features/tools.md) for details on each backend.
## Memory Configuration

View File

@@ -46,26 +46,20 @@ Navigate to **Features → OAuth & Permissions** in the sidebar. Scroll to **Sco
| Scope | Purpose |
|-------|---------|
| `chat:write` | Send messages as the bot |
| `app_mentions:read` | Detect when @mentioned in channels |
| `app_mentions:read` | Respond when @mentioned in channels |
| `channels:history` | Read messages in public channels the bot is in |
| `channels:read` | List and get info about public channels |
| `groups:history` | Read messages in private channels the bot is invited to |
| `im:history` | Read direct message history |
| `im:read` | View basic DM info |
| `im:write` | Open and manage DMs |
| `users:read` | Look up user information |
| `files:write` | Upload files (images, audio, documents) |
:::caution Missing scopes = missing features
Without `channels:history` and `groups:history`, the bot **will not receive messages in channels**
it will only work in DMs. These are the most commonly missed scopes.
:::
**Optional scopes:**
| Scope | Purpose |
|-------|---------|
| `groups:read` | List and get info about private channels |
| `groups:history` | Read messages in private channels the bot is invited to |
| `files:write` | Upload files (audio, images) |
---
@@ -89,27 +83,23 @@ You can always find or regenerate app-level tokens under **Settings → Basic In
## Step 4: Subscribe to Events
This step is critical — it controls what messages the bot can see.
1. In the sidebar, go to **Features → Event Subscriptions**
2. Toggle **Enable Events** to ON
3. Expand **Subscribe to bot events** and add:
| Event | Required? | Purpose |
|-------|-----------|---------|
| `message.im` | **Yes** | Bot receives direct messages |
| `message.channels` | **Yes** | Bot receives messages in **public** channels it's added to |
| `message.groups` | **Recommended** | Bot receives messages in **private** channels it's invited to |
| `app_mention` | **Yes** | Prevents Bolt SDK errors when bot is @mentioned |
| Event | Purpose |
|-------|---------|
| `app_mention` | Bot responds when @mentioned in any channel |
| `message.im` | Bot responds to direct messages |
**Optional event:**
| Event | Purpose |
|-------|---------|
| `message.channels` | Bot sees all messages in public channels it's added to |
4. Click **Save Changes** at the bottom of the page
:::danger Missing event subscriptions is the #1 setup issue
If the bot works in DMs but **not in channels**, you almost certainly forgot to add
`message.channels` (for public channels) and/or `message.groups` (for private channels).
Without these events, Slack simply never delivers channel messages to the bot.
:::
---
## Step 5: Install App to Workspace
@@ -121,8 +111,8 @@ Without these events, Slack simply never delivers channel messages to the bot.
5. **Copy this token** — this is your `SLACK_BOT_TOKEN`
:::tip
If you change scopes or event subscriptions later, you **must reinstall the app** for the changes
to take effect. The Install App page will show a banner prompting you to do so.
If you change scopes later, you'll need to **reinstall the app** for the new scopes to take effect.
The Install App page will show a banner prompting you to do so.
:::
---
@@ -149,7 +139,7 @@ Add the following to your `~/.hermes/.env` file:
```bash
# Required
SLACK_BOT_TOKEN=xoxb-your-bot-token-here
SLACK_APP_TOKEN=xapp-your-app-token-here
SLACK_APP_TOKEN=xapp-your-app-level-token-here
SLACK_ALLOWED_USERS=U01ABC2DEF3 # Comma-separated Member IDs
# Optional
@@ -171,35 +161,6 @@ hermes gateway install # Install as a system service
---
## Step 8: Invite the Bot to Channels
After starting the gateway, you need to **invite the bot** to any channel where you want it to respond:
```
/invite @Hermes Agent
```
The bot will **not** automatically join channels. You must invite it to each channel individually.
---
## How the Bot Responds
Understanding how Hermes behaves in different contexts:
| Context | Behavior |
|---------|----------|
| **DMs** | Bot responds to every message — no @mention needed |
| **Channels** | Bot **only responds when @mentioned** (e.g., `@Hermes Agent what time is it?`) |
| **Threads** | Bot replies in threads when the triggering message is in a thread |
:::tip
In channels, always @mention the bot. Simply typing a message without mentioning it will be ignored.
This is intentional — it prevents the bot from responding to every message in busy channels.
:::
---
## Home Channel
Set `SLACK_HOME_CHANNEL` to a channel ID where Hermes will deliver scheduled messages,
@@ -231,27 +192,11 @@ Hermes supports voice on Slack:
| Problem | Solution |
|---------|----------|
| Bot doesn't respond to DMs | Verify `message.im` is in your event subscriptions and the app is reinstalled |
| Bot works in DMs but not in channels | **Most common issue.** Add `message.channels` and `message.groups` to event subscriptions, reinstall the app, and invite the bot to the channel with `/invite @Hermes Agent` |
| Bot doesn't respond to @mentions in channels | 1) Check `message.channels` event is subscribed. 2) Bot must be invited to the channel. 3) Ensure `channels:history` scope is added. 4) Reinstall the app after scope/event changes |
| Bot ignores messages in private channels | Add both the `message.groups` event subscription and `groups:history` scope, then reinstall the app and `/invite` the bot |
| Bot doesn't respond to @mentions | Verify `app_mention` is in your event subscriptions |
| "not_authed" or "invalid_auth" errors | Regenerate your Bot Token and App Token, update `.env` |
| Bot responds but can't post in a channel | Invite the bot to the channel with `/invite @Hermes Agent` |
| "missing_scope" error | Add the required scope in OAuth & Permissions, then **reinstall** the app |
| Socket disconnects frequently | Check your network; Bolt auto-reconnects but unstable connections cause lag |
| Changed scopes/events but nothing changed | You **must reinstall** the app to your workspace after any scope or event subscription change |
### Quick Checklist
If the bot isn't working in channels, verify **all** of the following:
1.`message.channels` event is subscribed (for public channels)
2.`message.groups` event is subscribed (for private channels)
3.`app_mention` event is subscribed
4.`channels:history` scope is added (for public channels)
5.`groups:history` scope is added (for private channels)
6. ✅ App was **reinstalled** after adding scopes/events
7. ✅ Bot was **invited** to the channel (`/invite @Hermes Agent`)
8. ✅ You are **@mentioning** the bot in your message
---