fix(cron): don't silently disable recurring cron jobs when croniter is missing (#16368)

If the gateway's Python env loses access to 'croniter' between when a cron job was created and when mark_job_run() fires, compute_next_run() returns None for cron schedules. mark_job_run() treated that as terminal completion and wrote enabled=false, state=completed — turning a missing runtime dep into a silent, permanent job-off. That behaviour is safe for one-shot jobs but wrong for recurring ones. A missing dep should surface as an error the user can see, not as successful completion of a job that is about to stop firing. mark_job_run() now only disables the job on next_run_at=None when the schedule is one-shot. For recurring (cron/interval) schedules it keeps enabled=true, sets state=error, and records last_error so the user can see why the job isn't advancing. compute_next_run() also logs a warning the first time cron+no-croniter hits, so the underlying cause is visible in the gateway log. Tests cover: - recurring cron job stays enabled with state=error when HAS_CRONITER=False - recurring interval stays enabled when compute_next_run returns None - one-shot jobs still flip to enabled=false, state=completed (no regression) Fixes #16265
2026-04-27 22:41:19 +08:00 · 2026-04-26 21:47:32 -07:00
parent c5781d50c7
commit 7c63c24613
2 changed files with 113 additions and 3 deletions
--- a/cron/jobs.py
+++ b/cron/jobs.py
@@ -311,6 +311,12 @@ def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None

    elif schedule["kind"] == "cron":
        if not HAS_CRONITER:
+            logger.warning(
+                "Cannot compute next run for cron schedule %r: 'croniter' "
+                "is not installed. Install the 'cron' extra (pip install "
+                "'hermes-agent[cron]') to re-enable recurring cron jobs.",
+                schedule.get("expr"),
+            )
            return None
        cron = croniter(schedule["expr"], now)
        next_run = cron.get_next(datetime)
@@ -698,10 +704,32 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
                # Compute next run
                job["next_run_at"] = compute_next_run(job["schedule"], now)

-                # If no next run (one-shot completed), disable
+                # If no next run, decide whether this is terminal completion
+                # (one-shot) or a transient failure (recurring schedule couldn't
+                # compute — e.g. 'croniter' missing from the runtime env).
+                # Recurring jobs must NEVER be silently disabled: that turns a
+                # missing runtime dep into "job completed" and the user's
+                # schedule quietly goes off. See issue #16265.
                if job["next_run_at"] is None:
-                    job["enabled"] = False
-                    job["state"] = "completed"
+                    kind = job.get("schedule", {}).get("kind")
+                    if kind in ("cron", "interval"):
+                        job["state"] = "error"
+                        if not job.get("last_error"):
+                            job["last_error"] = (
+                                "Failed to compute next run for recurring "
+                                "schedule (is the 'croniter' package "
+                                "installed in the gateway's Python env?)"
+                            )
+                        logger.error(
+                            "Job '%s' (%s) could not compute next_run_at; "
+                            "leaving enabled and marking state=error so the "
+                            "job is not silently disabled.",
+                            job.get("name", job["id"]),
+                            kind,
+                        )
+                    else:
+                        job["enabled"] = False
+                        job["state"] = "completed"
                elif job.get("state") != "paused":
                    job["state"] = "scheduled"

--- a/tests/cron/test_jobs.py
+++ b/tests/cron/test_jobs.py
@@ -369,6 +369,88 @@ class TestMarkJobRun:
        assert updated["last_error"] == "model timeout"
        assert updated["last_delivery_error"] == "platform 'discord' not enabled"

+    def test_recurring_cron_not_disabled_when_croniter_missing(self, tmp_cron_dir, monkeypatch):
+        """Regression test for issue #16265.
+
+        If the gateway runs in an env where `croniter` went missing after a
+        recurring cron job was persisted, `compute_next_run()` returns None.
+        `mark_job_run()` must NOT treat that as terminal completion — the job
+        has to stay enabled with state=error so the user notices, rather than
+        silently flipping to enabled=false, state=completed.
+        """
+        pytest.importorskip("croniter")  # need it to create the job
+        job = create_job(prompt="Recurring", schedule="0 7,15,23 * * *")
+        assert job["schedule"]["kind"] == "cron"
+
+        # Simulate the runtime env having lost croniter between job creation
+        # and this run.
+        monkeypatch.setattr("cron.jobs.HAS_CRONITER", False)
+
+        mark_job_run(job["id"], success=True)
+
+        updated = get_job(job["id"])
+        assert updated is not None, "recurring cron job was deleted"
+        assert updated["enabled"] is True, (
+            "recurring cron job was disabled despite croniter-missing being "
+            "a runtime dep issue, not a terminal completion"
+        )
+        assert updated["state"] == "error"
+        assert updated["state"] != "completed"
+        assert updated["next_run_at"] is None
+        assert updated["last_error"]
+        assert "croniter" in updated["last_error"].lower()
+
+    def test_recurring_interval_not_disabled_when_next_run_is_none(self, tmp_cron_dir, monkeypatch):
+        """Defensive sibling of the cron test — any recurring schedule that
+        somehow yields next_run_at=None must stay enabled with state=error.
+        """
+        job = create_job(prompt="Recurring", schedule="every 1h")
+        assert job["schedule"]["kind"] == "interval"
+
+        # Force compute_next_run to return None for this call — simulates
+        # any future regression where a recurring schedule loses its
+        # next-run computation (missing dep, corrupt schedule, etc.).
+        monkeypatch.setattr("cron.jobs.compute_next_run", lambda *a, **kw: None)
+
+        mark_job_run(job["id"], success=True)
+
+        updated = get_job(job["id"])
+        assert updated is not None
+        assert updated["enabled"] is True
+        assert updated["state"] == "error"
+        assert updated["state"] != "completed"
+
+    def test_oneshot_still_completes_when_next_run_is_none(self, tmp_cron_dir):
+        """One-shot jobs must still flip to enabled=false, state=completed
+        when next_run_at cannot be computed — the #16265 fix must not
+        regress this path. We bypass create_job and craft a minimal
+        one-shot record directly so that the repeat-limit branch doesn't
+        pop the job before we observe the terminal-completion branch.
+        """
+        jobs = [{
+            "id": "oneshot-test",
+            "prompt": "Once",
+            "schedule": {"kind": "once", "run_at": "2020-01-01T00:00:00+00:00", "display": "once"},
+            "repeat": {"times": None, "completed": 0},
+            "enabled": True,
+            "state": "scheduled",
+            "next_run_at": "2020-01-01T00:00:00+00:00",
+            "last_run_at": None,
+            "last_status": None,
+            "last_error": None,
+            "last_delivery_error": None,
+            "created_at": "2020-01-01T00:00:00+00:00",
+        }]
+        save_jobs(jobs)
+
+        mark_job_run("oneshot-test", success=True)
+
+        updated = get_job("oneshot-test")
+        assert updated is not None
+        assert updated["next_run_at"] is None
+        assert updated["enabled"] is False
+        assert updated["state"] == "completed"
+

 class TestAdvanceNextRun:
    """Tests for advance_next_run() — crash-safety for recurring jobs."""