mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-27 22:41:19 +08:00
fix(cron): don't silently disable recurring cron jobs when croniter is missing (#16368)
If the gateway's Python env loses access to 'croniter' between when a cron job was created and when mark_job_run() fires, compute_next_run() returns None for cron schedules. mark_job_run() treated that as terminal completion and wrote enabled=false, state=completed — turning a missing runtime dep into a silent, permanent job-off. That behaviour is safe for one-shot jobs but wrong for recurring ones. A missing dep should surface as an error the user can see, not as successful completion of a job that is about to stop firing. mark_job_run() now only disables the job on next_run_at=None when the schedule is one-shot. For recurring (cron/interval) schedules it keeps enabled=true, sets state=error, and records last_error so the user can see why the job isn't advancing. compute_next_run() also logs a warning the first time cron+no-croniter hits, so the underlying cause is visible in the gateway log. Tests cover: - recurring cron job stays enabled with state=error when HAS_CRONITER=False - recurring interval stays enabled when compute_next_run returns None - one-shot jobs still flip to enabled=false, state=completed (no regression) Fixes #16265
This commit is contained in:
34
cron/jobs.py
34
cron/jobs.py
@@ -311,6 +311,12 @@ def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None
|
||||
|
||||
elif schedule["kind"] == "cron":
|
||||
if not HAS_CRONITER:
|
||||
logger.warning(
|
||||
"Cannot compute next run for cron schedule %r: 'croniter' "
|
||||
"is not installed. Install the 'cron' extra (pip install "
|
||||
"'hermes-agent[cron]') to re-enable recurring cron jobs.",
|
||||
schedule.get("expr"),
|
||||
)
|
||||
return None
|
||||
cron = croniter(schedule["expr"], now)
|
||||
next_run = cron.get_next(datetime)
|
||||
@@ -698,10 +704,32 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
|
||||
# Compute next run
|
||||
job["next_run_at"] = compute_next_run(job["schedule"], now)
|
||||
|
||||
# If no next run (one-shot completed), disable
|
||||
# If no next run, decide whether this is terminal completion
|
||||
# (one-shot) or a transient failure (recurring schedule couldn't
|
||||
# compute — e.g. 'croniter' missing from the runtime env).
|
||||
# Recurring jobs must NEVER be silently disabled: that turns a
|
||||
# missing runtime dep into "job completed" and the user's
|
||||
# schedule quietly goes off. See issue #16265.
|
||||
if job["next_run_at"] is None:
|
||||
job["enabled"] = False
|
||||
job["state"] = "completed"
|
||||
kind = job.get("schedule", {}).get("kind")
|
||||
if kind in ("cron", "interval"):
|
||||
job["state"] = "error"
|
||||
if not job.get("last_error"):
|
||||
job["last_error"] = (
|
||||
"Failed to compute next run for recurring "
|
||||
"schedule (is the 'croniter' package "
|
||||
"installed in the gateway's Python env?)"
|
||||
)
|
||||
logger.error(
|
||||
"Job '%s' (%s) could not compute next_run_at; "
|
||||
"leaving enabled and marking state=error so the "
|
||||
"job is not silently disabled.",
|
||||
job.get("name", job["id"]),
|
||||
kind,
|
||||
)
|
||||
else:
|
||||
job["enabled"] = False
|
||||
job["state"] = "completed"
|
||||
elif job.get("state") != "paused":
|
||||
job["state"] = "scheduled"
|
||||
|
||||
|
||||
@@ -369,6 +369,88 @@ class TestMarkJobRun:
|
||||
assert updated["last_error"] == "model timeout"
|
||||
assert updated["last_delivery_error"] == "platform 'discord' not enabled"
|
||||
|
||||
def test_recurring_cron_not_disabled_when_croniter_missing(self, tmp_cron_dir, monkeypatch):
|
||||
"""Regression test for issue #16265.
|
||||
|
||||
If the gateway runs in an env where `croniter` went missing after a
|
||||
recurring cron job was persisted, `compute_next_run()` returns None.
|
||||
`mark_job_run()` must NOT treat that as terminal completion — the job
|
||||
has to stay enabled with state=error so the user notices, rather than
|
||||
silently flipping to enabled=false, state=completed.
|
||||
"""
|
||||
pytest.importorskip("croniter") # need it to create the job
|
||||
job = create_job(prompt="Recurring", schedule="0 7,15,23 * * *")
|
||||
assert job["schedule"]["kind"] == "cron"
|
||||
|
||||
# Simulate the runtime env having lost croniter between job creation
|
||||
# and this run.
|
||||
monkeypatch.setattr("cron.jobs.HAS_CRONITER", False)
|
||||
|
||||
mark_job_run(job["id"], success=True)
|
||||
|
||||
updated = get_job(job["id"])
|
||||
assert updated is not None, "recurring cron job was deleted"
|
||||
assert updated["enabled"] is True, (
|
||||
"recurring cron job was disabled despite croniter-missing being "
|
||||
"a runtime dep issue, not a terminal completion"
|
||||
)
|
||||
assert updated["state"] == "error"
|
||||
assert updated["state"] != "completed"
|
||||
assert updated["next_run_at"] is None
|
||||
assert updated["last_error"]
|
||||
assert "croniter" in updated["last_error"].lower()
|
||||
|
||||
def test_recurring_interval_not_disabled_when_next_run_is_none(self, tmp_cron_dir, monkeypatch):
|
||||
"""Defensive sibling of the cron test — any recurring schedule that
|
||||
somehow yields next_run_at=None must stay enabled with state=error.
|
||||
"""
|
||||
job = create_job(prompt="Recurring", schedule="every 1h")
|
||||
assert job["schedule"]["kind"] == "interval"
|
||||
|
||||
# Force compute_next_run to return None for this call — simulates
|
||||
# any future regression where a recurring schedule loses its
|
||||
# next-run computation (missing dep, corrupt schedule, etc.).
|
||||
monkeypatch.setattr("cron.jobs.compute_next_run", lambda *a, **kw: None)
|
||||
|
||||
mark_job_run(job["id"], success=True)
|
||||
|
||||
updated = get_job(job["id"])
|
||||
assert updated is not None
|
||||
assert updated["enabled"] is True
|
||||
assert updated["state"] == "error"
|
||||
assert updated["state"] != "completed"
|
||||
|
||||
def test_oneshot_still_completes_when_next_run_is_none(self, tmp_cron_dir):
|
||||
"""One-shot jobs must still flip to enabled=false, state=completed
|
||||
when next_run_at cannot be computed — the #16265 fix must not
|
||||
regress this path. We bypass create_job and craft a minimal
|
||||
one-shot record directly so that the repeat-limit branch doesn't
|
||||
pop the job before we observe the terminal-completion branch.
|
||||
"""
|
||||
jobs = [{
|
||||
"id": "oneshot-test",
|
||||
"prompt": "Once",
|
||||
"schedule": {"kind": "once", "run_at": "2020-01-01T00:00:00+00:00", "display": "once"},
|
||||
"repeat": {"times": None, "completed": 0},
|
||||
"enabled": True,
|
||||
"state": "scheduled",
|
||||
"next_run_at": "2020-01-01T00:00:00+00:00",
|
||||
"last_run_at": None,
|
||||
"last_status": None,
|
||||
"last_error": None,
|
||||
"last_delivery_error": None,
|
||||
"created_at": "2020-01-01T00:00:00+00:00",
|
||||
}]
|
||||
save_jobs(jobs)
|
||||
|
||||
mark_job_run("oneshot-test", success=True)
|
||||
|
||||
updated = get_job("oneshot-test")
|
||||
assert updated is not None
|
||||
assert updated["next_run_at"] is None
|
||||
assert updated["enabled"] is False
|
||||
assert updated["state"] == "completed"
|
||||
|
||||
|
||||
class TestAdvanceNextRun:
|
||||
"""Tests for advance_next_run() — crash-safety for recurring jobs."""
|
||||
|
||||
Reference in New Issue
Block a user