Compare commits

...

2 Commits

Author SHA1 Message Date
Teknium
9f3097033b docs: add cron script timeout and provider recovery documentation
- Add HERMES_CRON_TIMEOUT and HERMES_CRON_SCRIPT_TIMEOUT to env vars reference
- Add script timeout and provider recovery sections to cron features page
- Add timeout resolution chain and credential pool details to cron internals
2026-04-10 02:53:49 -07:00
Dominic Grieco
dafd2cb4f0 fix: harden cron script timeout and provider recovery 2026-04-10 02:50:38 -07:00
4 changed files with 104 additions and 3 deletions

View File

@@ -346,7 +346,42 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
return None
_SCRIPT_TIMEOUT = 120 # seconds
_DEFAULT_SCRIPT_TIMEOUT = 120 # seconds
# Backward-compatible module override used by tests and emergency monkeypatches.
_SCRIPT_TIMEOUT = _DEFAULT_SCRIPT_TIMEOUT
def _get_script_timeout() -> int:
"""Resolve cron pre-run script timeout from module/env/config with a safe default."""
if _SCRIPT_TIMEOUT != _DEFAULT_SCRIPT_TIMEOUT:
try:
timeout = int(float(_SCRIPT_TIMEOUT))
if timeout > 0:
return timeout
except Exception:
logger.warning("Invalid patched _SCRIPT_TIMEOUT=%r; using env/config/default", _SCRIPT_TIMEOUT)
env_value = os.getenv("HERMES_CRON_SCRIPT_TIMEOUT", "").strip()
if env_value:
try:
timeout = int(float(env_value))
if timeout > 0:
return timeout
except Exception:
logger.warning("Invalid HERMES_CRON_SCRIPT_TIMEOUT=%r; using config/default", env_value)
try:
cfg = load_config() or {}
cron_cfg = cfg.get("cron", {}) if isinstance(cfg, dict) else {}
configured = cron_cfg.get("script_timeout_seconds")
if configured is not None:
timeout = int(float(configured))
if timeout > 0:
return timeout
except Exception as exc:
logger.debug("Failed to load cron script timeout from config: %s", exc)
return _DEFAULT_SCRIPT_TIMEOUT
def _run_job_script(script_path: str) -> tuple[bool, str]:
@@ -393,12 +428,14 @@ def _run_job_script(script_path: str) -> tuple[bool, str]:
if not path.is_file():
return False, f"Script path is not a file: {path}"
script_timeout = _get_script_timeout()
try:
result = subprocess.run(
[sys.executable, str(path)],
capture_output=True,
text=True,
timeout=_SCRIPT_TIMEOUT,
timeout=script_timeout,
cwd=str(path.parent),
)
stdout = (result.stdout or "").strip()
@@ -422,7 +459,7 @@ def _run_job_script(script_path: str) -> tuple[bool, str]:
return True, stdout
except subprocess.TimeoutExpired:
return False, f"Script timed out after {_SCRIPT_TIMEOUT}s: {path}"
return False, f"Script timed out after {script_timeout}s: {path}"
except Exception as exc:
return False, f"Script execution failed: {exc}"
@@ -646,6 +683,24 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
},
)
fallback_model = _cfg.get("fallback_providers") or _cfg.get("fallback_model") or None
credential_pool = None
runtime_provider = str(turn_route["runtime"].get("provider") or "").strip().lower()
if runtime_provider:
try:
from agent.credential_pool import load_pool
pool = load_pool(runtime_provider)
if pool.has_credentials():
credential_pool = pool
logger.info(
"Job '%s': loaded credential pool for provider %s with %d entries",
job_id,
runtime_provider,
len(pool.entries()),
)
except Exception as e:
logger.debug("Job '%s': failed to load credential pool for %s: %s", job_id, runtime_provider, e)
agent = AIAgent(
model=turn_route["model"],
api_key=turn_route["runtime"].get("api_key"),
@@ -657,6 +712,8 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
max_iterations=max_iterations,
reasoning_config=reasoning_config,
prefill_messages=prefill_messages,
fallback_model=fallback_model,
credential_pool=credential_pool,
providers_allowed=pr.get("only"),
providers_ignored=pr.get("ignore"),
providers_order=pr.get("order"),

View File

@@ -132,6 +132,22 @@ import requests, json
# Print summary to stdout — agent analyzes and reports
```
The script timeout defaults to 120 seconds. `_get_script_timeout()` resolves the limit through a three-layer chain:
1. **Module-level override**`_SCRIPT_TIMEOUT` (for tests/monkeypatching). Only used when it differs from the default.
2. **Environment variable**`HERMES_CRON_SCRIPT_TIMEOUT`
3. **Config**`cron.script_timeout_seconds` in `config.yaml` (read via `load_config()`)
4. **Default** — 120 seconds
### Provider Recovery
`run_job()` passes the user's configured fallback providers and credential pool into the `AIAgent` instance:
- **Fallback providers** — reads `fallback_providers` (list) or `fallback_model` (legacy dict) from `config.yaml`, matching the gateway's `_load_fallback_model()` pattern. Passed as `fallback_model=` to `AIAgent.__init__`, which normalizes both formats into a fallback chain.
- **Credential pool** — loads via `load_pool(provider)` from `agent.credential_pool` using the resolved runtime provider name. Only passed when the pool has credentials (`pool.has_credentials()`). Enables same-provider key rotation on 429/rate-limit errors.
This mirrors the gateway's behavior — without it, cron agents would fail on rate limits without attempting recovery.
## Delivery Model
Cron job results can be delivered to any supported platform:

View File

@@ -285,6 +285,13 @@ For cloud sandbox backends, persistence is filesystem-oriented. `TERMINAL_LIFETI
| `HERMES_BACKGROUND_NOTIFICATIONS` | Background process notification mode in gateway: `all` (default), `result`, `error`, `off` |
| `HERMES_EPHEMERAL_SYSTEM_PROMPT` | Ephemeral system prompt injected at API-call time (never persisted to sessions) |
## Cron Scheduler
| Variable | Description |
|----------|-------------|
| `HERMES_CRON_TIMEOUT` | Inactivity timeout for cron job agent runs in seconds (default: `600`). The agent can run indefinitely while actively calling tools or receiving stream tokens — this only triggers when idle. Set to `0` for unlimited. |
| `HERMES_CRON_SCRIPT_TIMEOUT` | Timeout for pre-run scripts attached to cron jobs in seconds (default: `120`). Override for scripts that need longer execution (e.g., randomized delays for anti-bot timing). Also configurable via `cron.script_timeout_seconds` in `config.yaml`. |
## Session Settings
| Variable | Description |

View File

@@ -240,6 +240,27 @@ Otherwise, report the issue.
Failed jobs always deliver regardless of the `[SILENT]` marker — only successful runs can be silenced.
## Script timeout
Pre-run scripts (attached via the `script` parameter) have a default timeout of 120 seconds. If your scripts need longer — for example, to include randomized delays that avoid bot-like timing patterns — you can increase this:
```yaml
# ~/.hermes/config.yaml
cron:
script_timeout_seconds: 300 # 5 minutes
```
Or set the `HERMES_CRON_SCRIPT_TIMEOUT` environment variable. The resolution order is: env var → config.yaml → 120s default.
## Provider recovery
Cron jobs inherit your configured fallback providers and credential pool rotation. If the primary API key is rate-limited or the provider returns an error, the cron agent can:
- **Fall back to an alternate provider** if you have `fallback_providers` (or the legacy `fallback_model`) configured in `config.yaml`
- **Rotate to the next credential** in your [credential pool](/docs/user-guide/configuration#credential-pool-strategies) for the same provider
This means cron jobs that run at high frequency or during peak hours are more resilient — a single rate-limited key won't fail the entire run.
## Schedule formats
The agent's final response is automatically delivered — you do **not** need to include `send_message` in the cron prompt for that same destination. If a cron run calls `send_message` to the exact target the scheduler will already deliver to, Hermes skips that duplicate send and tells the model to put the user-facing content in the final response instead. Use `send_message` only for additional or different targets.