"""
Cron job scheduler - executes due jobs.
Provides tick() which checks for due jobs and runs them. The gateway
calls this every 60 seconds from a background thread.
Uses a file-based lock (~/.hermes/cron/.tick.lock) so only one tick
runs at a time if multiple processes overlap.
"""
import asyncio
import concurrent.futures
import contextvars
import json
import logging
import os
import subprocess
import sys
# fcntl is Unix-only; on Windows use msvcrt for file locking
try:
    import fcntl
except ImportError:
    fcntl = None
try:
    import msvcrt
except ImportError:
    msvcrt = None
from pathlib import Path
from typing import List, Optional
# Add parent directory to path for imports BEFORE repo-level imports.
# Without this, standalone invocations (e.g. after `hermes update` reloads
# the module) fail with ModuleNotFoundError for hermes_time et al.
sys.path.insert(0, str(Path(__file__).parent.parent))
from hermes_constants import get_hermes_home
from hermes_cli.config import load_config
from hermes_time import now as _hermes_now
logger = logging.getLogger(__name__)
# Valid delivery platforms — used to validate user-supplied platform names
# in cron delivery targets, preventing env var enumeration via crafted names.
_KNOWN_DELIVERY_PLATFORMS = frozenset({
"telegram", "discord", "slack", "whatsapp", "signal",
"matrix", "mattermost", "homeassistant", "dingtalk", "feishu",
"wecom", "wecom_callback", "weixin", "sms", "email", "webhook", "bluebubbles",
"qqbot",
})
# Platforms that support a configured cron/notification home target, mapped to
# the environment variable used by gateway setup/runtime config.
_HOME_TARGET_ENV_VARS = {
"matrix": "MATRIX_HOME_ROOM",
"telegram": "TELEGRAM_HOME_CHANNEL",
"discord": "DISCORD_HOME_CHANNEL",
"slack": "SLACK_HOME_CHANNEL",
"signal": "SIGNAL_HOME_CHANNEL",
"mattermost": "MATTERMOST_HOME_CHANNEL",
"sms": "SMS_HOME_CHANNEL",
"email": "EMAIL_HOME_ADDRESS",
"dingtalk": "DINGTALK_HOME_CHANNEL",
"feishu": "FEISHU_HOME_CHANNEL",
"wecom": "WECOM_HOME_CHANNEL",
"weixin": "WEIXIN_HOME_CHANNEL",
"bluebubbles": "BLUEBUBBLES_HOME_CHANNEL",
"qqbot": "QQBOT_HOME_CHANNEL",
}
# Legacy env var names kept for back-compat. Each entry is the current
# primary env var → the previous name. _get_home_target_chat_id falls
# back to the legacy name if the primary is unset, so users who set the
# old name before the rename keep working until they migrate.
_LEGACY_HOME_TARGET_ENV_VARS = {
"QQBOT_HOME_CHANNEL": "QQ_HOME_CHANNEL",
}
from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run
# Sentinel: when a cron agent has nothing new to report, it can start its
# response with this marker to suppress delivery. Output is still saved
# locally for audit.
SILENT_MARKER = "[SILENT]"
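# Illustrative guard (the suppression check itself lives in the delivery path
# outside this excerpt; shown here only to make the convention concrete):
#
#   if final_response.strip().startswith(SILENT_MARKER):
#       ...  # save the output locally for audit, but skip _deliver_result()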
# Resolve Hermes home directory (respects HERMES_HOME override)
_hermes_home = get_hermes_home()
# File-based lock prevents concurrent ticks from gateway + daemon + systemd timer
_LOCK_DIR = _hermes_home / "cron"
_LOCK_FILE = _LOCK_DIR / ".tick.lock"
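# Rough sketch of how tick() can take this lock (illustrative only; the real
# tick() appears later in this file). On Unix, fcntl.flock with
# LOCK_EX | LOCK_NB fails fast when another tick holds the lock; on Windows,
# msvcrt.locking with LK_NBLCK plays the same role.
#
#   _LOCK_DIR.mkdir(parents=True, exist_ok=True)
#   lock_fh = open(_LOCK_FILE, "w")
#   try:
#       if fcntl is not None:
#           fcntl.flock(lock_fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
#       elif msvcrt is not None:
#           msvcrt.locking(lock_fh.fileno(), msvcrt.LK_NBLCK, 1)
#   except OSError:
#       lock_fh.close()
#       return  # another tick is already running; skip this one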
def _resolve_origin(job: dict) -> Optional[dict]:
"""Extract origin info from a job, preserving any extra routing metadata."""
origin = job.get("origin")
if not origin:
return None
platform = origin.get("platform")
chat_id = origin.get("chat_id")
if platform and chat_id:
return origin
return None
def _get_home_target_chat_id(platform_name: str) -> str:
"""Return the configured home target chat/room ID for a delivery platform."""
env_var = _HOME_TARGET_ENV_VARS.get(platform_name.lower())
if not env_var:
return ""
    value = os.getenv(env_var, "")
    if not value:
        legacy = _LEGACY_HOME_TARGET_ENV_VARS.get(env_var)
        if legacy:
            value = os.getenv(legacy, "")
    return value
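# Example of the primary/legacy fallback (illustrative values):
#
#   os.environ["QQ_HOME_CHANNEL"] = "legacy-group"     # only the old name set
#   _get_home_target_chat_id("qqbot")                  # -> "legacy-group"
#
#   os.environ["QQBOT_HOME_CHANNEL"] = "new-group"     # new name also set
#   _get_home_target_chat_id("qqbot")                  # -> "new-group" (primary wins)
#
#   _get_home_target_chat_id("not-a-platform")         # -> "" (no env var mapped)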
def _resolve_single_delivery_target(job: dict, deliver_value: str) -> Optional[dict]:
"""Resolve one concrete auto-delivery target for a cron job."""
origin = _resolve_origin(job)
if deliver_value == "local":
return None
if deliver_value == "origin":
if origin:
return {
"platform": origin["platform"],
"chat_id": str(origin["chat_id"]),
"thread_id": origin.get("thread_id"),
}
# Origin missing (e.g. job created via API/script) — try each
# platform's home channel as a fallback instead of silently dropping.
for platform_name in _HOME_TARGET_ENV_VARS:
chat_id = _get_home_target_chat_id(platform_name)
if chat_id:
logger.info(
"Job '%s' has deliver=origin but no origin; falling back to %s home channel",
job.get("name", job.get("id", "?")),
platform_name,
)
return {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": None,
}
return None
if ":" in deliver_value:
platform_name, rest = deliver_value.split(":", 1)
platform_key = platform_name.lower()
from tools.send_message_tool import _parse_target_ref
parsed_chat_id, parsed_thread_id, is_explicit = _parse_target_ref(platform_key, rest)
if is_explicit:
chat_id, thread_id = parsed_chat_id, parsed_thread_id
else:
chat_id, thread_id = rest, None
# Resolve human-friendly labels like "Alice (dm)" to real IDs.
try:
from gateway.channel_directory import resolve_channel_name
resolved = resolve_channel_name(platform_key, chat_id)
if resolved:
parsed_chat_id, parsed_thread_id, resolved_is_explicit = _parse_target_ref(platform_key, resolved)
if resolved_is_explicit:
chat_id, thread_id = parsed_chat_id, parsed_thread_id
else:
chat_id = resolved
except Exception:
pass
return {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": thread_id,
}
platform_name = deliver_value
if origin and origin.get("platform") == platform_name:
return {
"platform": platform_name,
"chat_id": str(origin["chat_id"]),
"thread_id": origin.get("thread_id"),
}
if platform_name.lower() not in _KNOWN_DELIVERY_PLATFORMS:
return None
chat_id = _get_home_target_chat_id(platform_name)
if not chat_id:
return None
return {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": None,
}
def _resolve_delivery_targets(job: dict) -> List[dict]:
"""Resolve all concrete auto-delivery targets for a cron job (supports comma-separated deliver)."""
deliver = job.get("deliver", "local")
if deliver == "local":
return []
parts = [p.strip() for p in str(deliver).split(",") if p.strip()]
seen = set()
targets = []
for part in parts:
target = _resolve_single_delivery_target(job, part)
if target:
key = (target["platform"].lower(), str(target["chat_id"]), target.get("thread_id"))
if key not in seen:
seen.add(key)
targets.append(target)
return targets
def _resolve_delivery_target(job: dict) -> Optional[dict]:
"""Resolve the concrete auto-delivery target for a cron job, if any."""
targets = _resolve_delivery_targets(job)
return targets[0] if targets else None
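# Illustrative resolution walk-through (job values assumed; the exact
# chat_id/thread_id split for "platform:ref" entries depends on
# _parse_target_ref):
#
#   job = {
#       "id": "job-1", "name": "daily-digest",
#       "deliver": "origin, telegram:123456",
#       "origin": {"platform": "discord", "chat_id": 987},
#   }
#   _resolve_delivery_targets(job)
#   # -> [{"platform": "discord", "chat_id": "987", "thread_id": None},
#   #     {"platform": "telegram", "chat_id": <parsed "123456">, ...}]
#
# deliver="local" resolves to no targets, a bare platform name falls back to
# that platform's configured home channel, and duplicates collapse on
# (platform, chat_id, thread_id).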
# Media extension sets — keep in sync with gateway/platforms/base.py:_process_message_background
_AUDIO_EXTS = frozenset({'.ogg', '.opus', '.mp3', '.wav', '.m4a'})
_VIDEO_EXTS = frozenset({'.mp4', '.mov', '.avi', '.mkv', '.webm', '.3gp'})
_IMAGE_EXTS = frozenset({'.jpg', '.jpeg', '.png', '.webp', '.gif'})
def _send_media_via_adapter(adapter, chat_id: str, media_files: list, metadata: dict | None, loop, job: dict) -> None:
"""Send extracted MEDIA files as native platform attachments via a live adapter.
Routes each file to the appropriate adapter method (send_voice, send_image_file,
send_video, send_document) based on file extension mirroring the routing logic
in ``BasePlatformAdapter._process_message_background``.
"""
from pathlib import Path
for media_path, _is_voice in media_files:
try:
ext = Path(media_path).suffix.lower()
if ext in _AUDIO_EXTS:
coro = adapter.send_voice(chat_id=chat_id, audio_path=media_path, metadata=metadata)
elif ext in _VIDEO_EXTS:
coro = adapter.send_video(chat_id=chat_id, video_path=media_path, metadata=metadata)
elif ext in _IMAGE_EXTS:
coro = adapter.send_image_file(chat_id=chat_id, image_path=media_path, metadata=metadata)
else:
coro = adapter.send_document(chat_id=chat_id, file_path=media_path, metadata=metadata)
future = asyncio.run_coroutine_threadsafe(coro, loop)
try:
result = future.result(timeout=30)
except TimeoutError:
future.cancel()
raise
if result and not getattr(result, "success", True):
logger.warning(
"Job '%s': media send failed for %s: %s",
job.get("id", "?"), media_path, getattr(result, "error", "unknown"),
)
except Exception as e:
logger.warning("Job '%s': failed to send media %s: %s", job.get("id", "?"), media_path, e)
def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Optional[str]:
"""
Deliver job output to the configured target(s) (origin chat, specific platform, etc.).
When ``adapters`` and ``loop`` are provided (gateway is running), tries to
use the live adapter first; this supports E2EE rooms (e.g. Matrix) where
the standalone HTTP path cannot encrypt. Falls back to standalone send if
the adapter path fails or is unavailable.
Returns None on success, or an error string on failure.
"""
targets = _resolve_delivery_targets(job)
if not targets:
if job.get("deliver", "local") != "local":
msg = f"no delivery target resolved for deliver={job.get('deliver', 'local')}"
logger.warning("Job '%s': %s", job["id"], msg)
return msg
return None # local-only jobs don't deliver — not a failure
from tools.send_message_tool import _send_to_platform
from gateway.config import load_gateway_config, Platform
platform_map = {
"telegram": Platform.TELEGRAM,
"discord": Platform.DISCORD,
"slack": Platform.SLACK,
"whatsapp": Platform.WHATSAPP,
"signal": Platform.SIGNAL,
"matrix": Platform.MATRIX,
"mattermost": Platform.MATTERMOST,
"homeassistant": Platform.HOMEASSISTANT,
"dingtalk": Platform.DINGTALK,
"feishu": Platform.FEISHU,
"wecom": Platform.WECOM,
"wecom_callback": Platform.WECOM_CALLBACK,
"weixin": Platform.WEIXIN,
"email": Platform.EMAIL,
"sms": Platform.SMS,
"bluebubbles": Platform.BLUEBUBBLES,
"qqbot": Platform.QQBOT,
}
# Optionally wrap the content with a header/footer so the user knows this
# is a cron delivery. Wrapping is on by default; set cron.wrap_response: false
# in config.yaml for clean output.
wrap_response = True
try:
user_cfg = load_config()
wrap_response = user_cfg.get("cron", {}).get("wrap_response", True)
except Exception:
pass
if wrap_response:
task_name = job.get("name", job["id"])
job_id = job.get("id", "")
delivery_content = (
f"Cronjob Response: {task_name}\n"
f"(job_id: {job_id})\n"
f"-------------\n\n"
f"{content}\n\n"
f"To stop or manage this job, send me a new message (e.g. \"stop reminder {task_name}\")."
)
else:
delivery_content = content
# Extract MEDIA: tags so attachments are forwarded as files, not raw text
from gateway.platforms.base import BasePlatformAdapter
media_files, cleaned_delivery_content = BasePlatformAdapter.extract_media(delivery_content)
try:
config = load_gateway_config()
except Exception as e:
msg = f"failed to load gateway config: {e}"
logger.error("Job '%s': %s", job["id"], msg)
return msg
delivery_errors = []
for target in targets:
platform_name = target["platform"]
chat_id = target["chat_id"]
thread_id = target.get("thread_id")
# Diagnostic: log thread_id for topic-aware delivery debugging
origin = job.get("origin") or {}
origin_thread = origin.get("thread_id")
if origin_thread and not thread_id:
logger.warning(
"Job '%s': origin has thread_id=%s but delivery target lost it "
"(deliver=%s, target=%s)",
job["id"], origin_thread, job.get("deliver", "local"), target,
)
elif thread_id:
logger.debug(
"Job '%s': delivering to %s:%s thread_id=%s",
job["id"], platform_name, chat_id, thread_id,
)
platform = platform_map.get(platform_name.lower())
if not platform:
msg = f"unknown platform '{platform_name}'"
logger.warning("Job '%s': %s", job["id"], msg)
delivery_errors.append(msg)
continue
# Prefer the live adapter when the gateway is running — this supports E2EE
# rooms (e.g. Matrix) where the standalone HTTP path cannot encrypt.
runtime_adapter = (adapters or {}).get(platform)
delivered = False
if runtime_adapter is not None and loop is not None and getattr(loop, "is_running", lambda: False)():
send_metadata = {"thread_id": thread_id} if thread_id else None
try:
# Send cleaned text (MEDIA tags stripped) — not the raw content
text_to_send = cleaned_delivery_content.strip()
adapter_ok = True
if text_to_send:
future = asyncio.run_coroutine_threadsafe(
runtime_adapter.send(chat_id, text_to_send, metadata=send_metadata),
loop,
)
try:
send_result = future.result(timeout=60)
except TimeoutError:
future.cancel()
raise
if send_result and not getattr(send_result, "success", True):
err = getattr(send_result, "error", "unknown")
logger.warning(
"Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
job["id"], platform_name, chat_id, err,
)
adapter_ok = False # fall through to standalone path
# Send extracted media files as native attachments via the live adapter
if adapter_ok and media_files:
_send_media_via_adapter(runtime_adapter, chat_id, media_files, send_metadata, loop, job)
if adapter_ok:
logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
delivered = True
except Exception as e:
logger.warning(
"Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
job["id"], platform_name, chat_id, e,
)
if not delivered:
pconfig = config.platforms.get(platform)
if not pconfig or not pconfig.enabled:
msg = f"platform '{platform_name}' not configured/enabled"
logger.warning("Job '%s': %s", job["id"], msg)
delivery_errors.append(msg)
continue
# Standalone path: run the async send in a fresh event loop (safe from any thread)
coro = _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files)
try:
result = asyncio.run(coro)
except RuntimeError:
# asyncio.run() checks for a running loop before awaiting the coroutine;
# when it raises, the original coro was never started — close it to
# prevent "coroutine was never awaited" RuntimeWarning, then retry in a
# fresh thread that has no running loop.
coro.close()
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
result = future.result(timeout=30)
except Exception as e:
msg = f"delivery to {platform_name}:{chat_id} failed: {e}"
logger.error("Job '%s': %s", job["id"], msg)
delivery_errors.append(msg)
continue
if result and result.get("error"):
msg = f"delivery error: {result['error']}"
logger.error("Job '%s': %s", job["id"], msg)
delivery_errors.append(msg)
continue
logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
if delivery_errors:
return "; ".join(delivery_errors)
return None
_DEFAULT_SCRIPT_TIMEOUT = 120 # seconds
# Backward-compatible module override used by tests and emergency monkeypatches.
_SCRIPT_TIMEOUT = _DEFAULT_SCRIPT_TIMEOUT
def _get_script_timeout() -> int:
"""Resolve cron pre-run script timeout from module/env/config with a safe default."""
if _SCRIPT_TIMEOUT != _DEFAULT_SCRIPT_TIMEOUT:
try:
timeout = int(float(_SCRIPT_TIMEOUT))
if timeout > 0:
return timeout
except Exception:
logger.warning("Invalid patched _SCRIPT_TIMEOUT=%r; using env/config/default", _SCRIPT_TIMEOUT)
env_value = os.getenv("HERMES_CRON_SCRIPT_TIMEOUT", "").strip()
if env_value:
try:
timeout = int(float(env_value))
if timeout > 0:
return timeout
except Exception:
logger.warning("Invalid HERMES_CRON_SCRIPT_TIMEOUT=%r; using config/default", env_value)
try:
cfg = load_config() or {}
cron_cfg = cfg.get("cron", {}) if isinstance(cfg, dict) else {}
configured = cron_cfg.get("script_timeout_seconds")
if configured is not None:
timeout = int(float(configured))
if timeout > 0:
return timeout
except Exception as exc:
logger.debug("Failed to load cron script timeout from config: %s", exc)
return _DEFAULT_SCRIPT_TIMEOUT
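# Timeout resolution order, highest precedence first (illustrative values):
#
#   scheduler._SCRIPT_TIMEOUT = 30        # module override (tests/monkeypatch)
#   HERMES_CRON_SCRIPT_TIMEOUT=45         # environment variable
#   cron:                                 # config.yaml
#     script_timeout_seconds: 60
#   _DEFAULT_SCRIPT_TIMEOUT               # 120s fallback
#
# Non-positive or unparsable values at any level fall through to the next one.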
def _run_job_script(script_path: str) -> tuple[bool, str]:
"""Execute a cron job's data-collection script and capture its output.
Scripts must reside within HERMES_HOME/scripts/. Both relative and
absolute paths are resolved and validated against this directory to
prevent arbitrary script execution via path traversal or absolute
path injection.
Args:
script_path: Path to a Python script. Relative paths are resolved
against HERMES_HOME/scripts/. Absolute and ~-prefixed paths
are also validated to ensure they stay within the scripts dir.
Returns:
(success, output); on failure, *output* contains the error message so the
LLM can report the problem to the user.
"""
from hermes_constants import get_hermes_home
scripts_dir = get_hermes_home() / "scripts"
scripts_dir.mkdir(parents=True, exist_ok=True)
scripts_dir_resolved = scripts_dir.resolve()
raw = Path(script_path).expanduser()
if raw.is_absolute():
path = raw.resolve()
else:
path = (scripts_dir / raw).resolve()
# Guard against path traversal, absolute path injection, and symlink
# escape — scripts MUST reside within HERMES_HOME/scripts/.
try:
path.relative_to(scripts_dir_resolved)
except ValueError:
return False, (
f"Blocked: script path resolves outside the scripts directory "
f"({scripts_dir_resolved}): {script_path!r}"
)
if not path.exists():
return False, f"Script not found: {path}"
if not path.is_file():
return False, f"Script path is not a file: {path}"
script_timeout = _get_script_timeout()
try:
result = subprocess.run(
[sys.executable, str(path)],
capture_output=True,
text=True,
timeout=script_timeout,
cwd=str(path.parent),
)
stdout = (result.stdout or "").strip()
stderr = (result.stderr or "").strip()
# Redact secrets from both stdout and stderr before any return path.
try:
from agent.redact import redact_sensitive_text
stdout = redact_sensitive_text(stdout)
stderr = redact_sensitive_text(stderr)
except Exception:
pass
if result.returncode != 0:
parts = [f"Script exited with code {result.returncode}"]
if stderr:
parts.append(f"stderr:\n{stderr}")
if stdout:
parts.append(f"stdout:\n{stdout}")
return False, "\n".join(parts)
return True, stdout
except subprocess.TimeoutExpired:
return False, f"Script timed out after {script_timeout}s: {path}"
except Exception as exc:
return False, f"Script execution failed: {exc}"
def _parse_wake_gate(script_output: str) -> bool:
"""Parse the last non-empty stdout line of a cron job's pre-check script
as a wake gate.
The convention (ported from nanoclaw #1232): if the last stdout line is
JSON like ``{"wakeAgent": false}``, the agent is skipped entirely no
LLM run, no delivery. Any other output (non-JSON, missing flag, gate
absent, or ``wakeAgent: true``) means wake the agent normally.
Returns True if the agent should wake, False to skip.
"""
if not script_output:
return True
stripped_lines = [line for line in script_output.splitlines() if line.strip()]
if not stripped_lines:
return True
last_line = stripped_lines[-1].strip()
try:
gate = json.loads(last_line)
except (json.JSONDecodeError, ValueError):
return True
if not isinstance(gate, dict):
return True
return gate.get("wakeAgent", True) is not False
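# Gate semantics at a glance (illustrative script outputs):
#
#   _parse_wake_gate('{"wakeAgent": false}')            # -> False (skip agent)
#   _parse_wake_gate('poll ok\n{"wakeAgent": false}')   # -> False (last line wins)
#   _parse_wake_gate('{"wakeAgent": 0}')                # -> True  (only strict false gates)
#   _parse_wake_gate('nothing new today')               # -> True  (non-JSON wakes)
#   _parse_wake_gate('')                                # -> True  (empty output wakes)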
def _build_job_prompt(job: dict, prerun_script: Optional[tuple] = None) -> str:
"""Build the effective prompt for a cron job, optionally loading one or more skills first.
Args:
job: The cron job dict.
prerun_script: Optional ``(success, stdout)`` from a script that has
already been executed by the caller (e.g. for a wake-gate check).
When provided, the script is not re-executed and the cached
result is used for prompt injection. When omitted, the script
(if any) runs inline as before.
"""
prompt = job.get("prompt", "")
skills = job.get("skills")
# Run data-collection script if configured, inject output as context.
script_path = job.get("script")
if script_path:
if prerun_script is not None:
success, script_output = prerun_script
else:
success, script_output = _run_job_script(script_path)
if success:
if script_output:
prompt = (
"## Script Output\n"
"The following data was collected by a pre-run script. "
"Use it as context for your analysis.\n\n"
f"```\n{script_output}\n```\n\n"
f"{prompt}"
)
else:
prompt = (
"[Script ran successfully but produced no output.]\n\n"
f"{prompt}"
)
else:
prompt = (
"## Script Error\n"
"The data-collection script failed. Report this to the user.\n\n"
f"```\n{script_output}\n```\n\n"
f"{prompt}"
)
# Always prepend cron execution guidance so the agent knows how
# delivery works and can suppress delivery when appropriate.
cron_hint = (
"[SYSTEM: You are running as a scheduled cron job. "
"DELIVERY: Your final response will be automatically delivered "
"to the user — do NOT use send_message or try to deliver "
"the output yourself. Just produce your report/output as your "
"final response and the system handles the rest. "
"SILENT: If there is genuinely nothing new to report, respond "
"with exactly \"[SILENT]\" (nothing else) to suppress delivery. "
"Never combine [SILENT] with content — either report your "
"findings normally, or say [SILENT] and nothing more.]\n\n"
)
prompt = cron_hint + prompt
if skills is None:
legacy = job.get("skill")
skills = [legacy] if legacy else []
skill_names = [str(name).strip() for name in skills if str(name).strip()]
if not skill_names:
return prompt
from tools.skills_tool import skill_view
parts = []
skipped: list[str] = []
for skill_name in skill_names:
loaded = json.loads(skill_view(skill_name))
if not loaded.get("success"):
error = loaded.get("error") or f"Failed to load skill '{skill_name}'"
logger.warning("Cron job '%s': skill not found, skipping — %s", job.get("name", job.get("id")), error)
skipped.append(skill_name)
continue
content = str(loaded.get("content") or "").strip()
if parts:
parts.append("")
parts.extend(
[
f'[SYSTEM: The user has invoked the "{skill_name}" skill, indicating they want you to follow its instructions. The full skill content is loaded below.]',
"",
content,
]
)
if skipped:
notice = (
f"[SYSTEM: The following skill(s) were listed for this job but could not be found "
f"and were skipped: {', '.join(skipped)}. "
f"Start your response with a brief notice so the user is aware, e.g.: "
f"'⚠️ Skill(s) not found and skipped: {', '.join(skipped)}']"
)
parts.insert(0, notice)
if prompt:
parts.extend(["", f"The user has provided the following instruction alongside the skill invocation: {prompt}"])
return "\n".join(parts)
def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
"""
Execute a single cron job.
Returns:
Tuple of (success, full_output_doc, final_response, error_message)
"""
from run_agent import AIAgent
# Initialize SQLite session store so cron job messages are persisted
# and discoverable via session_search (same pattern as gateway/run.py).
_session_db = None
try:
from hermes_state import SessionDB
_session_db = SessionDB()
except Exception as e:
logger.debug("Job '%s': SQLite session store not available: %s", job.get("id", "?"), e)
job_id = job["id"]
job_name = job["name"]
# Wake-gate: if this job has a pre-check script, run it BEFORE building
# the prompt so a ``{"wakeAgent": false}`` response can short-circuit
# the whole agent run. We pass the result into _build_job_prompt so
# the script is only executed once.
prerun_script = None
script_path = job.get("script")
if script_path:
prerun_script = _run_job_script(script_path)
_ran_ok, _script_output = prerun_script
if _ran_ok and not _parse_wake_gate(_script_output):
logger.info(
"Job '%s' (ID: %s): wakeAgent=false, skipping agent run",
job_name, job_id,
)
silent_doc = (
f"# Cron Job: {job_name}\n\n"
f"**Job ID:** {job_id}\n"
f"**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
"Script gate returned `wakeAgent=false` — agent skipped.\n"
)
return True, silent_doc, SILENT_MARKER, None
prompt = _build_job_prompt(job, prerun_script=prerun_script)
origin = _resolve_origin(job)
_cron_session_id = f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}"
logger.info("Running job '%s' (ID: %s)", job_name, job_id)
logger.info("Prompt: %s", prompt[:100])
# Mark this as a cron session so the approval system can apply cron_mode.
# This env var is process-wide and persists for the lifetime of the
# scheduler process — every job this process runs is a cron job.
os.environ["HERMES_CRON_SESSION"] = "1"
# Use ContextVars for per-job session/delivery state so parallel jobs
# don't clobber each other's targets (os.environ is process-global).
from gateway.session_context import set_session_vars, clear_session_vars, _VAR_MAP
_ctx_tokens = set_session_vars(
platform=origin["platform"] if origin else "",
chat_id=str(origin["chat_id"]) if origin else "",
chat_name=origin.get("chat_name", "") if origin else "",
)
try:
# Re-read .env and config.yaml fresh every run so provider/key
# changes take effect without a gateway restart.
from dotenv import load_dotenv
try:
load_dotenv(str(_hermes_home / ".env"), override=True, encoding="utf-8")
except UnicodeDecodeError:
load_dotenv(str(_hermes_home / ".env"), override=True, encoding="latin-1")
delivery_target = _resolve_delivery_target(job)
if delivery_target:
_VAR_MAP["HERMES_CRON_AUTO_DELIVER_PLATFORM"].set(delivery_target["platform"])
_VAR_MAP["HERMES_CRON_AUTO_DELIVER_CHAT_ID"].set(str(delivery_target["chat_id"]))
if delivery_target.get("thread_id") is not None:
_VAR_MAP["HERMES_CRON_AUTO_DELIVER_THREAD_ID"].set(str(delivery_target["thread_id"]))
model = job.get("model") or os.getenv("HERMES_MODEL") or ""
# Load config.yaml for model, reasoning, prefill, toolsets, provider routing
_cfg = {}
try:
import yaml
_cfg_path = str(_hermes_home / "config.yaml")
if os.path.exists(_cfg_path):
with open(_cfg_path) as _f:
_cfg = yaml.safe_load(_f) or {}
_model_cfg = _cfg.get("model", {})
if not job.get("model"):
if isinstance(_model_cfg, str):
model = _model_cfg
elif isinstance(_model_cfg, dict):
model = _model_cfg.get("default", model)
except Exception as e:
logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)
# Apply IPv4 preference if configured.
try:
from hermes_constants import apply_ipv4_preference
_net_cfg = _cfg.get("network", {})
if isinstance(_net_cfg, dict) and _net_cfg.get("force_ipv4"):
apply_ipv4_preference(force=True)
except Exception:
pass
# Reasoning config from config.yaml
from hermes_constants import parse_reasoning_effort
effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
reasoning_config = parse_reasoning_effort(effort)
# Prefill messages from env or config.yaml
prefill_messages = None
prefill_file = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") or _cfg.get("prefill_messages_file", "")
if prefill_file:
pfpath = Path(prefill_file).expanduser()
if not pfpath.is_absolute():
pfpath = _hermes_home / pfpath
if pfpath.exists():
try:
with open(pfpath, "r", encoding="utf-8") as _pf:
prefill_messages = json.load(_pf)
if not isinstance(prefill_messages, list):
prefill_messages = None
except Exception as e:
logger.warning("Job '%s': failed to parse prefill messages file '%s': %s", job_id, pfpath, e)
prefill_messages = None
# Max iterations
max_iterations = _cfg.get("agent", {}).get("max_turns") or _cfg.get("max_turns") or 90
# Provider routing
pr = _cfg.get("provider_routing", {})
from hermes_cli.runtime_provider import (
resolve_runtime_provider,
format_runtime_provider_error,
)
try:
runtime_kwargs = {
"requested": job.get("provider") or os.getenv("HERMES_INFERENCE_PROVIDER"),
}
if job.get("base_url"):
runtime_kwargs["explicit_base_url"] = job.get("base_url")
runtime = resolve_runtime_provider(**runtime_kwargs)
except Exception as exc:
message = format_runtime_provider_error(exc)
raise RuntimeError(message) from exc
fallback_model = _cfg.get("fallback_providers") or _cfg.get("fallback_model") or None
credential_pool = None
runtime_provider = str(runtime.get("provider") or "").strip().lower()
if runtime_provider:
try:
from agent.credential_pool import load_pool
pool = load_pool(runtime_provider)
if pool.has_credentials():
credential_pool = pool
logger.info(
"Job '%s': loaded credential pool for provider %s with %d entries",
job_id,
runtime_provider,
len(pool.entries()),
)
except Exception as e:
logger.debug("Job '%s': failed to load credential pool for %s: %s", job_id, runtime_provider, e)
agent = AIAgent(
model=model,
api_key=runtime.get("api_key"),
base_url=runtime.get("base_url"),
provider=runtime.get("provider"),
api_mode=runtime.get("api_mode"),
acp_command=runtime.get("command"),
acp_args=runtime.get("args"),
max_iterations=max_iterations,
reasoning_config=reasoning_config,
prefill_messages=prefill_messages,
fallback_model=fallback_model,
credential_pool=credential_pool,
providers_allowed=pr.get("only"),
providers_ignored=pr.get("ignore"),
providers_order=pr.get("order"),
provider_sort=pr.get("sort"),
disabled_toolsets=["cronjob", "messaging", "clarify"],
quiet_mode=True,
skip_context_files=True, # Don't inject SOUL.md/AGENTS.md from scheduler cwd
* feat(memory): standardize plugin config + add per-plugin documentation Config architecture: - Add save_config(values, hermes_home) to MemoryProvider ABC - Honcho: writes to $HERMES_HOME/honcho.json (SDK native) - Mem0: writes to $HERMES_HOME/mem0.json - Hindsight: writes to $HERMES_HOME/hindsight/config.json - Holographic: writes to config.yaml under plugins.hermes-memory-store - OpenViking/RetainDB/ByteRover: env-var only (default no-op) Setup wizard (hermes memory setup): - Now calls provider.save_config() for non-secret config - Secrets still go to .env via env vars - Only memory.provider activation key goes to config.yaml Documentation: - README.md for each of the 7 providers in plugins/memory/<name>/ - Requirements, setup (wizard + manual), config reference, tools table - Consistent format across all providers The contract for new memory plugins: - get_config_schema() declares all fields (REQUIRED) - save_config() writes native config (REQUIRED if not env-var-only) - Secrets use env_var field in schema, written to .env by wizard - README.md in the plugin directory * docs: add memory providers user guide + developer guide New pages: - user-guide/features/memory-providers.md — comprehensive guide covering all 7 shipped providers (Honcho, OpenViking, Mem0, Hindsight, Holographic, RetainDB, ByteRover). Each with setup, config, tools, cost, and unique features. Includes comparison table and profile isolation notes. - developer-guide/memory-provider-plugin.md — how to build a new memory provider plugin. Covers ABC, required methods, config schema, save_config, threading contract, profile isolation, testing. Updated pages: - user-guide/features/memory.md — replaced Honcho section with link to new Memory Providers page - user-guide/features/honcho.md — replaced with migration redirect to the new Memory Providers page - sidebars.ts — added both new pages to navigation * fix(memory): auto-migrate Honcho users to memory provider plugin When honcho.json or ~/.honcho/config.json exists but memory.provider is not set, automatically set memory.provider: honcho in config.yaml and activate the plugin. The plugin reads the same config files, so all data and credentials are preserved. Zero user action needed. Persists the migration to config.yaml so it only fires once. Prints a one-line confirmation in non-quiet mode. * fix(memory): only auto-migrate Honcho when enabled + credentialed Check HonchoClientConfig.enabled AND (api_key OR base_url) before auto-migrating — not just file existence. Prevents false activation for users who disabled Honcho, stopped using it (config lingers), or have ~/.honcho/ from a different tool. * feat(memory): auto-install pip dependencies during hermes memory setup Reads pip_dependencies from plugin.yaml, checks which are missing, installs them via pip before config walkthrough. Also shows install guidance for external_dependencies (e.g. brv CLI for ByteRover). Updated all 7 plugin.yaml files with pip_dependencies: - honcho: honcho-ai - mem0: mem0ai - openviking: httpx - hindsight: hindsight-client - holographic: (none) - retaindb: requests - byterover: (external_dependencies for brv CLI) * fix: remove remaining Honcho crash risks from cli.py and gateway cli.py: removed Honcho session re-mapping block (would crash importing deleted tools/honcho_tools.py), Honcho flush on compress, Honcho session display on startup, Honcho shutdown on exit, honcho_session_key AIAgent param. 
gateway/run.py: removed honcho_session_key params from helper methods, sync_honcho param, _honcho.shutdown() block. tests: fixed test_cron_session_with_honcho_key_skipped (was passing removed honcho_key param to _flush_memories_for_session). * fix: include plugins/ in pyproject.toml package list Without this, plugins/memory/ wouldn't be included in non-editable installs. Hermes always runs from the repo checkout so this is belt- and-suspenders, but prevents breakage if the install method changes. * fix(memory): correct pip-to-import name mapping for dep checks The heuristic dep.replace('-', '_') fails for packages where the pip name differs from the import name: honcho-ai→honcho, mem0ai→mem0, hindsight-client→hindsight_client. Added explicit mapping table so hermes memory setup doesn't try to reinstall already-installed packages. * chore: remove dead code from old plugin memory registration path - hermes_cli/plugins.py: removed register_memory_provider(), _memory_providers list, get_plugin_memory_providers() — memory providers now use plugins/memory/ discovery, not the general plugin system - hermes_cli/main.py: stripped 74 lines of dead honcho argparse subparsers (setup, status, sessions, map, peer, mode, tokens, identity, migrate) — kept only the migration redirect - agent/memory_provider.py: updated docstring to reflect new registration path - tests: replaced TestPluginMemoryProviderRegistration with TestPluginMemoryDiscovery that tests the actual plugins/memory/ discovery system. Added 3 new tests (discover, load, nonexistent). * chore: delete dead honcho_integration/cli.py and its tests cli.py (794 lines) was the old 'hermes honcho' command handler — nobody calls it since cmd_honcho was replaced with a migration redirect. Deleted tests that imported from removed code: - tests/honcho_integration/test_cli.py (tested _resolve_api_key) - tests/honcho_integration/test_config_isolation.py (tested CLI config paths) - tests/tools/test_honcho_tools.py (tested the deleted tools/honcho_tools.py) Remaining honcho_integration/ files (actively used by the plugin): - client.py (445 lines) — config loading, SDK client creation - session.py (991 lines) — session management, queries, flush * refactor: move honcho_integration/ into the honcho plugin Moves client.py (445 lines) and session.py (991 lines) from the top-level honcho_integration/ package into plugins/memory/honcho/. No Honcho code remains in the main codebase. 
- plugins/memory/honcho/client.py — config loading, SDK client creation - plugins/memory/honcho/session.py — session management, queries, flush - Updated all imports: run_agent.py (auto-migration), hermes_cli/doctor.py, plugin __init__.py, session.py cross-import, all tests - Removed honcho_integration/ package and pyproject.toml entry - Renamed tests/honcho_integration/ → tests/honcho_plugin/ * docs: update architecture + gateway-internals for memory provider system - architecture.md: replaced honcho_integration/ with plugins/memory/ - gateway-internals.md: replaced Honcho-specific session routing and flush lifecycle docs with generic memory provider interface docs * fix: update stale mock path for resolve_active_host after honcho plugin migration * fix(memory): address review feedback — P0 lifecycle, ABC contract, honcho CLI restore Review feedback from Honcho devs (erosika): P0 — Provider lifecycle: - Remove on_session_end() + shutdown_all() from run_conversation() tail (was killing providers after every turn in multi-turn sessions) - Add shutdown_memory_provider() method on AIAgent for callers - Wire shutdown into CLI atexit, reset_conversation, gateway stop/expiry Bug fixes: - Remove sync_honcho=False kwarg from /btw callsites (TypeError crash) - Fix doctor.py references to dead 'hermes honcho setup' command - Cache prefetch_all() before tool loop (was re-calling every iteration) ABC contract hardening (all backwards-compatible): - Add session_id kwarg to prefetch/sync_turn/queue_prefetch - Make on_pre_compress() return str (provider insights in compression) - Add **kwargs to on_turn_start() for runtime context - Add on_delegation() hook for parent-side subagent observation - Document agent_context/agent_identity/agent_workspace kwargs on initialize() (prevents cron corruption, enables profile scoping) - Fix docstring: single external provider, not multiple Honcho CLI restoration: - Add plugins/memory/honcho/cli.py (from main's honcho_integration/cli.py with imports adapted to plugin path) - Restore full hermes honcho command with all subcommands (status, peer, mode, tokens, identity, enable/disable, sync, peers, --target-profile) - Restore auto-clone on profile creation + sync on hermes update - hermes honcho setup now redirects to hermes memory setup * fix(memory): wire on_delegation, skip_memory for cron/flush, fix ByteRover return type - Wire on_delegation() in delegate_tool.py — parent's memory provider is notified with task+result after each subagent completes - Add skip_memory=True to cron scheduler (prevents cron system prompts from corrupting user representations — closes #4052) - Add skip_memory=True to gateway flush agent (throwaway agent shouldn't activate memory provider) - Fix ByteRover on_pre_compress() return type: None -> str * fix(honcho): port profile isolation fixes from PR #4632 Ports 5 bug fixes found during profile testing (erosika's PR #4632): 1. 3-tier config resolution — resolve_config_path() now checks $HERMES_HOME/honcho.json → ~/.hermes/honcho.json → ~/.honcho/config.json (non-default profiles couldn't find shared host blocks) 2. Thread host=_host_key() through from_global_config() in cmd_setup, cmd_status, cmd_identity (--target-profile was being ignored) 3. Use bare profile name as aiPeer (not host key with dots) — Honcho's peer ID pattern is ^[a-zA-Z0-9_-]+$, dots are invalid 4. Wrap add_peers() in try/except — was fatal on new AI peers, killed all message uploads for the session 5. 
Gate Honcho clone behind --clone/--clone-all on profile create (bare create should be blank-slate) Also: sanitize assistant_peer_id via _sanitize_id() * fix(tests): add module cleanup fixture to test_cli_provider_resolution test_cli_provider_resolution._import_cli() wipes tools.*, cli, and run_agent from sys.modules to force fresh imports, but had no cleanup. This poisoned all subsequent tests on the same xdist worker — mocks targeting tools.file_tools, tools.send_message_tool, etc. patched the NEW module object while already-imported functions still referenced the OLD one. Caused ~25 cascade failures: send_message KeyError, process_registry FileNotFoundError, file_read_guards timeouts, read_loop_detection file-not-found, mcp_oauth None port, and provider_parity/codex_execution stale tool lists. Fix: autouse fixture saves all affected modules before each test and restores them after, matching the pattern in test_managed_browserbase_and_modal.py.
2026-04-02 15:33:51 -07:00
skip_memory=True, # Cron system prompts would corrupt user representations
platform="cron",
session_id=_cron_session_id,
session_db=_session_db,
)
# Run the agent with an *inactivity*-based timeout: the job can run
# for hours if it's actively calling tools / receiving stream tokens,
# but a hung API call or stuck tool with no activity for the configured
# duration is caught and killed. Default 600s (10 min inactivity);
# override via HERMES_CRON_TIMEOUT env var. 0 = unlimited.
#
# Uses the agent's built-in activity tracker (updated by
# _touch_activity() on every tool call, API call, and stream delta).
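# Example (illustrative): HERMES_CRON_TIMEOUT=1800 allows up to 30 minutes
# of inactivity before the job is interrupted; HERMES_CRON_TIMEOUT=0
# disables the watchdog so the job only ends when run_conversation returns.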
_cron_timeout = float(os.getenv("HERMES_CRON_TIMEOUT", 600))
_cron_inactivity_limit = _cron_timeout if _cron_timeout > 0 else None
_POLL_INTERVAL = 5.0
_cron_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
# Preserve scheduler-scoped ContextVar state (for example skill-declared
# env passthrough registrations) when the cron run hops into the worker
# thread that executes the conversation under the inactivity watchdog.
_cron_context = contextvars.copy_context()
_cron_future = _cron_pool.submit(_cron_context.run, agent.run_conversation, prompt)
_inactivity_timeout = False
try:
if _cron_inactivity_limit is None:
# Unlimited — just wait for the result.
result = _cron_future.result()
else:
result = None
while True:
done, _ = concurrent.futures.wait(
{_cron_future}, timeout=_POLL_INTERVAL,
)
if done:
result = _cron_future.result()
break
# Agent still running — check inactivity.
_idle_secs = 0.0
if hasattr(agent, "get_activity_summary"):
try:
_act = agent.get_activity_summary()
_idle_secs = _act.get("seconds_since_activity", 0.0)
except Exception:
pass
if _idle_secs >= _cron_inactivity_limit:
_inactivity_timeout = True
break
except Exception:
_cron_pool.shutdown(wait=False, cancel_futures=True)
raise
finally:
_cron_pool.shutdown(wait=False, cancel_futures=True)
if _inactivity_timeout:
# Build diagnostic summary from the agent's activity tracker.
_activity = {}
if hasattr(agent, "get_activity_summary"):
try:
_activity = agent.get_activity_summary()
except Exception:
pass
_last_desc = _activity.get("last_activity_desc", "unknown")
_secs_ago = _activity.get("seconds_since_activity", 0)
_cur_tool = _activity.get("current_tool")
_iter_n = _activity.get("api_call_count", 0)
_iter_max = _activity.get("max_iterations", 0)
logger.error(
"Job '%s' idle for %.0fs (inactivity limit %.0fs) "
"| last_activity=%s | iteration=%s/%s | tool=%s",
job_name, _secs_ago, _cron_inactivity_limit,
_last_desc, _iter_n, _iter_max,
_cur_tool or "none",
)
if hasattr(agent, "interrupt"):
agent.interrupt("Cron job timed out (inactivity)")
raise TimeoutError(
f"Cron job '{job_name}' idle for "
f"{int(_secs_ago)}s (limit {int(_cron_inactivity_limit)}s) "
f"— last activity: {_last_desc}"
)
# Guard against non-dict returns from run_conversation under error conditions
if not isinstance(result, dict):
raise RuntimeError(
f"agent.run_conversation returned {type(result).__name__} instead of dict: {result!r}"
)
final_response = result.get("final_response", "") or ""
# Strip leaked placeholder text that upstream may inject on empty completions.
if final_response.strip() == "(No response generated)":
final_response = ""
# Use a separate variable for log display; keep final_response clean
# for delivery logic (empty response = no delivery).
logged_response = final_response if final_response else "(No response generated)"
output = f"""# Cron Job: {job_name}
**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}
## Prompt
{prompt}
## Response
{logged_response}
"""
logger.info("Job '%s' completed successfully", job_name)
return True, output, final_response, None
except Exception as e:
error_msg = f"{type(e).__name__}: {str(e)}"
logger.exception("Job '%s' failed: %s", job_name, error_msg)
output = f"""# Cron Job: {job_name} (FAILED)
**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}
## Prompt
{prompt}
## Error
```
{error_msg}
```
"""
return False, output, "", error_msg
finally:
# Clean up ContextVar session/delivery state for this job.
clear_session_vars(_ctx_tokens)
if _session_db:
try:
_session_db.end_session(_cron_session_id, "cron_complete")
except (Exception, KeyboardInterrupt) as e:
logger.debug("Job '%s': failed to end session: %s", job_id, e)
try:
_session_db.close()
except (Exception, KeyboardInterrupt) as e:
logger.debug("Job '%s': failed to close SQLite session store: %s", job_id, e)
def tick(verbose: bool = True, adapters=None, loop=None) -> int:
"""
Check and run all due jobs.
Uses a file lock so only one tick runs at a time, even if the gateway's
in-process ticker and a standalone daemon or manual tick overlap.
Args:
verbose: Whether to print status messages
adapters: Optional dict mapping Platform to its live adapter (from gateway)
loop: Optional asyncio event loop (from gateway) for live adapter sends
Returns:
Number of jobs executed (0 if another tick is already running)
"""
_LOCK_DIR.mkdir(parents=True, exist_ok=True)
# Cross-platform file locking: fcntl on Unix, msvcrt on Windows
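# Both lock calls below are non-blocking (LOCK_EX|LOCK_NB / LK_NBLCK): if
# another process already holds the lock, they raise OSError and this tick
# bails out with 0 instead of waiting.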
lock_fd = None
try:
lock_fd = open(_LOCK_FILE, "w")
if fcntl:
fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
elif msvcrt:
msvcrt.locking(lock_fd.fileno(), msvcrt.LK_NBLCK, 1)
except (OSError, IOError):
logger.debug("Tick skipped — another instance holds the lock")
if lock_fd is not None:
lock_fd.close()
return 0
try:
due_jobs = get_due_jobs()
if verbose and not due_jobs:
logger.info("%s - No jobs due", _hermes_now().strftime('%H:%M:%S'))
return 0
if verbose:
logger.info("%s - %s job(s) due", _hermes_now().strftime('%H:%M:%S'), len(due_jobs))
# Advance next_run_at for all recurring jobs FIRST, under the file lock,
# before any execution begins. This preserves at-most-once semantics.
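# For example, if the process dies while a job below is still running, that
# occurrence is not retried on the next tick: next_run_at has already moved
# past it.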
for job in due_jobs:
advance_next_run(job["id"])
# Resolve max parallel workers: env var > config.yaml > no explicit cap
# (max_workers=None, letting ThreadPoolExecutor pick its default).
# Set HERMES_CRON_MAX_PARALLEL=1 to restore the old serial behaviour.
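# Illustrative config.yaml shape for the config-based limit (the key read
# a few lines below):
#   cron:
#     max_parallel_jobs: 4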
_max_workers: Optional[int] = None
try:
_env_par = os.getenv("HERMES_CRON_MAX_PARALLEL", "").strip()
if _env_par:
_max_workers = int(_env_par) or None
except (ValueError, TypeError):
logger.warning("Invalid HERMES_CRON_MAX_PARALLEL value; defaulting to unbounded")
if _max_workers is None:
try:
_ucfg = load_config() or {}
_cfg_par = (
_ucfg.get("cron", {}) if isinstance(_ucfg, dict) else {}
).get("max_parallel_jobs")
if _cfg_par is not None:
_max_workers = int(_cfg_par) or None
except Exception:
pass
if verbose:
logger.info(
"Running %d job(s) in parallel (max_workers=%s)",
len(due_jobs),
_max_workers if _max_workers else "unbounded",
)
def _process_job(job: dict) -> bool:
"""Run one due job end-to-end: execute, save, deliver, mark."""
try:
success, output, final_response, error = run_job(job)
output_file = save_job_output(job["id"], output)
if verbose:
logger.info("Output saved to: %s", output_file)
# Deliver the final response to the origin/target chat.
# If the agent responded with [SILENT], skip delivery (but
# output is already saved above). Failed jobs always deliver.
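# Illustrative: a monitoring job that finds nothing to report can include
# [SILENT] in its reply; the output file saved above is kept, but nothing
# is sent to the chat.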
deliver_content = final_response if success else f"⚠️ Cron job '{job.get('name', job['id'])}' failed:\n{error}"
should_deliver = bool(deliver_content)
if should_deliver and success and SILENT_MARKER in deliver_content.strip().upper():
logger.info("Job '%s': agent returned %s — skipping delivery", job["id"], SILENT_MARKER)
should_deliver = False
delivery_error = None
if should_deliver:
try:
delivery_error = _deliver_result(job, deliver_content, adapters=adapters, loop=loop)
except Exception as de:
delivery_error = str(de)
logger.error("Delivery failed for job %s: %s", job["id"], de)
# Treat empty final_response as a soft failure so last_status
# is not "ok" — the agent ran but produced nothing useful.
# (issue #8585)
if success and not final_response:
success = False
error = "Agent completed but produced empty response (model error, timeout, or misconfiguration)"
mark_job_run(job["id"], success, error, delivery_error=delivery_error)
return True
except Exception as e:
logger.error("Error processing job %s: %s", job['id'], e)
mark_job_run(job["id"], False, str(e))
return False
# Run all due jobs concurrently, each in its own ContextVar copy
# so session/delivery state stays isolated per-thread.
with concurrent.futures.ThreadPoolExecutor(max_workers=_max_workers) as _tick_pool:
_futures = []
for job in due_jobs:
_ctx = contextvars.copy_context()
_futures.append(_tick_pool.submit(_ctx.run, _process_job, job))
_results = [f.result() for f in _futures]
return sum(_results)
finally:
if fcntl:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
elif msvcrt:
try:
msvcrt.locking(lock_fd.fileno(), msvcrt.LK_UNLCK, 1)
except (OSError, IOError):
pass
lock_fd.close()
if __name__ == "__main__":
tick(verbose=True)
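# Manual one-off tick, assuming invocation from the repo checkout:
#   python hermes-agent/cron/scheduler.py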