fix(cron): don't silently disable recurring cron jobs when croniter is missing (#16368)

If the gateway's Python env loses access to 'croniter' between when a
cron job was created and when mark_job_run() fires, compute_next_run()
returns None for cron schedules. mark_job_run() treated that as terminal
completion and wrote enabled=false, state=completed — turning a missing
runtime dep into a silent, permanent job-off.

That behaviour is safe for one-shot jobs but wrong for recurring ones. A
missing dep should surface as an error the user can see, not as successful
completion of a job that is about to stop firing.

mark_job_run() now only disables the job on next_run_at=None when the
schedule is one-shot. For recurring (cron/interval) schedules it keeps
enabled=true, sets state=error, and records last_error so the user can
see why the job isn't advancing. compute_next_run() also logs a warning
the first time cron+no-croniter hits, so the underlying cause is visible
in the gateway log.

Tests cover:
- recurring cron job stays enabled with state=error when HAS_CRONITER=False
- recurring interval stays enabled when compute_next_run returns None
- one-shot jobs still flip to enabled=false, state=completed (no regression)

Fixes #16265
This commit is contained in:
Teknium
2026-04-26 21:47:32 -07:00
committed by GitHub
parent c5781d50c7
commit 7c63c24613
2 changed files with 113 additions and 3 deletions

View File

@@ -311,6 +311,12 @@ def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None
elif schedule["kind"] == "cron":
if not HAS_CRONITER:
logger.warning(
"Cannot compute next run for cron schedule %r: 'croniter' "
"is not installed. Install the 'cron' extra (pip install "
"'hermes-agent[cron]') to re-enable recurring cron jobs.",
schedule.get("expr"),
)
return None
cron = croniter(schedule["expr"], now)
next_run = cron.get_next(datetime)
@@ -698,10 +704,32 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
# Compute next run
job["next_run_at"] = compute_next_run(job["schedule"], now)
# If no next run (one-shot completed), disable
# If no next run, decide whether this is terminal completion
# (one-shot) or a transient failure (recurring schedule couldn't
# compute — e.g. 'croniter' missing from the runtime env).
# Recurring jobs must NEVER be silently disabled: that turns a
# missing runtime dep into "job completed" and the user's
# schedule quietly goes off. See issue #16265.
if job["next_run_at"] is None:
job["enabled"] = False
job["state"] = "completed"
kind = job.get("schedule", {}).get("kind")
if kind in ("cron", "interval"):
job["state"] = "error"
if not job.get("last_error"):
job["last_error"] = (
"Failed to compute next run for recurring "
"schedule (is the 'croniter' package "
"installed in the gateway's Python env?)"
)
logger.error(
"Job '%s' (%s) could not compute next_run_at; "
"leaving enabled and marking state=error so the "
"job is not silently disabled.",
job.get("name", job["id"]),
kind,
)
else:
job["enabled"] = False
job["state"] = "completed"
elif job.get("state") != "paused":
job["state"] = "scheduled"

View File

@@ -369,6 +369,88 @@ class TestMarkJobRun:
assert updated["last_error"] == "model timeout"
assert updated["last_delivery_error"] == "platform 'discord' not enabled"
def test_recurring_cron_not_disabled_when_croniter_missing(self, tmp_cron_dir, monkeypatch):
"""Regression test for issue #16265.
If the gateway runs in an env where `croniter` went missing after a
recurring cron job was persisted, `compute_next_run()` returns None.
`mark_job_run()` must NOT treat that as terminal completion — the job
has to stay enabled with state=error so the user notices, rather than
silently flipping to enabled=false, state=completed.
"""
pytest.importorskip("croniter") # need it to create the job
job = create_job(prompt="Recurring", schedule="0 7,15,23 * * *")
assert job["schedule"]["kind"] == "cron"
# Simulate the runtime env having lost croniter between job creation
# and this run.
monkeypatch.setattr("cron.jobs.HAS_CRONITER", False)
mark_job_run(job["id"], success=True)
updated = get_job(job["id"])
assert updated is not None, "recurring cron job was deleted"
assert updated["enabled"] is True, (
"recurring cron job was disabled despite croniter-missing being "
"a runtime dep issue, not a terminal completion"
)
assert updated["state"] == "error"
assert updated["state"] != "completed"
assert updated["next_run_at"] is None
assert updated["last_error"]
assert "croniter" in updated["last_error"].lower()
def test_recurring_interval_not_disabled_when_next_run_is_none(self, tmp_cron_dir, monkeypatch):
"""Defensive sibling of the cron test — any recurring schedule that
somehow yields next_run_at=None must stay enabled with state=error.
"""
job = create_job(prompt="Recurring", schedule="every 1h")
assert job["schedule"]["kind"] == "interval"
# Force compute_next_run to return None for this call — simulates
# any future regression where a recurring schedule loses its
# next-run computation (missing dep, corrupt schedule, etc.).
monkeypatch.setattr("cron.jobs.compute_next_run", lambda *a, **kw: None)
mark_job_run(job["id"], success=True)
updated = get_job(job["id"])
assert updated is not None
assert updated["enabled"] is True
assert updated["state"] == "error"
assert updated["state"] != "completed"
def test_oneshot_still_completes_when_next_run_is_none(self, tmp_cron_dir):
"""One-shot jobs must still flip to enabled=false, state=completed
when next_run_at cannot be computed — the #16265 fix must not
regress this path. We bypass create_job and craft a minimal
one-shot record directly so that the repeat-limit branch doesn't
pop the job before we observe the terminal-completion branch.
"""
jobs = [{
"id": "oneshot-test",
"prompt": "Once",
"schedule": {"kind": "once", "run_at": "2020-01-01T00:00:00+00:00", "display": "once"},
"repeat": {"times": None, "completed": 0},
"enabled": True,
"state": "scheduled",
"next_run_at": "2020-01-01T00:00:00+00:00",
"last_run_at": None,
"last_status": None,
"last_error": None,
"last_delivery_error": None,
"created_at": "2020-01-01T00:00:00+00:00",
}]
save_jobs(jobs)
mark_job_run("oneshot-test", success=True)
updated = get_job("oneshot-test")
assert updated is not None
assert updated["next_run_at"] is None
assert updated["enabled"] is False
assert updated["state"] == "completed"
class TestAdvanceNextRun:
"""Tests for advance_next_run() — crash-safety for recurring jobs."""