fixup! fix(gateway): preserve inactivity clock on interrupt-recursive cached-agent turns (#15654)

Address Copilot review findings:

1. Gate _last_activity_desc on interrupt_depth == 0 alongside _last_activity_ts.
   Both fields are semantically paired — desc describes the activity *at* ts.
   Updating desc without ts made get_activity_summary() report "starting new
   turn (cached)" for 20+ minutes while the timestamp showed the true stale
   duration, producing misleading diagnostic output.

2. Monkeypatch gateway.run.time.time to a fixed epoch in tests that assert
   on _last_activity_ts values.  Real time.time() comparisons were latently
   flaky under slow CI or NTP adjustments.  _FAKE_NOW = 10_000.0 is used
   as the reference; assertions are now exact equality rather than >=.

3. Add test_fresh_turn_resets_desc and test_interrupt_turn_preserves_desc to
   directly cover the gated desc behaviour introduced by (1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
briandevans
2026-04-25 17:12:36 -07:00
committed by Teknium
parent de24315978
commit 4e356098d2
2 changed files with 47 additions and 16 deletions

View File

@@ -8726,16 +8726,19 @@ class GatewayRunner:
def _init_cached_agent_for_turn(agent: Any, interrupt_depth: int) -> None:
"""Reset per-turn state on a cached agent before a new turn starts.
_last_activity_ts is only reset for fresh external turns (depth 0).
For interrupt-recursive turns the timestamp is preserved so the
inactivity watchdog can accumulate stuck-turn idle time and fire
the 30-min timeout (#15654). The depth-0 reset is still needed:
a session idle for 29 min would otherwise trip the watchdog before
the new turn makes its first API call (#9051).
Both _last_activity_ts and _last_activity_desc are only reset for
fresh external turns (depth 0); they are semantically paired —
desc describes the activity *at* ts, so updating one without the
other would make get_activity_summary() misleading.
For interrupt-recursive turns both are preserved so the inactivity
watchdog can accumulate stuck-turn idle time and fire the 30-min
timeout (#15654). The depth-0 reset is still needed: a session
idle for 29 min would otherwise trip the watchdog before the new
turn makes its first API call (#9051).
"""
if interrupt_depth == 0:
agent._last_activity_ts = time.time()
agent._last_activity_desc = "starting new turn (cached)"
agent._last_activity_desc = "starting new turn (cached)"
agent._api_call_count = 0
def _release_evicted_agent_soft(self, agent: Any) -> None:

View File

@@ -1045,6 +1045,9 @@ class TestAgentCacheIdleResume:
pass
_FAKE_NOW = 10_000.0 # Fixed epoch for deterministic time assertions
class TestCachedAgentInactivityReset:
"""Inactivity-clock reset must be gated on _interrupt_depth == 0.
@@ -1057,9 +1060,8 @@ class TestCachedAgentInactivityReset:
"""
def _fake_agent(self, stale_seconds: float = 1800.0):
import time as _t
m = MagicMock()
m._last_activity_ts = _t.time() - stale_seconds
m._last_activity_ts = _FAKE_NOW - stale_seconds
m._api_call_count = 10
m._last_activity_desc = "previous turn activity"
return m
@@ -1067,22 +1069,34 @@ class TestCachedAgentInactivityReset:
def test_fresh_turn_resets_idle_clock(self):
"""interrupt_depth=0: clock resets so a post-idle turn gets a
fresh 30-min inactivity window (guard for #9051)."""
import time as _t
from gateway.run import GatewayRunner
agent = self._fake_agent(stale_seconds=1800.0)
old_ts = agent._last_activity_ts
before = _t.time()
GatewayRunner._init_cached_agent_for_turn(agent, interrupt_depth=0)
with patch("gateway.run.time") as mock_time:
mock_time.time.return_value = _FAKE_NOW
GatewayRunner._init_cached_agent_for_turn(agent, interrupt_depth=0)
assert agent._last_activity_ts >= before, (
assert agent._last_activity_ts == _FAKE_NOW, (
"_last_activity_ts was not reset on a fresh turn (interrupt_depth=0)"
)
assert agent._last_activity_ts > old_ts, (
"Stale idle time should be cleared so the new turn gets a fresh window"
)
def test_fresh_turn_resets_desc(self):
"""interrupt_depth=0: description is updated to reflect the new turn."""
from gateway.run import GatewayRunner
agent = self._fake_agent()
with patch("gateway.run.time") as mock_time:
mock_time.time.return_value = _FAKE_NOW
GatewayRunner._init_cached_agent_for_turn(agent, interrupt_depth=0)
assert agent._last_activity_desc == "starting new turn (cached)"
def test_interrupt_turn_preserves_idle_clock(self):
"""interrupt_depth=1: clock preserved so accumulated stuck-turn
idle time is not discarded by an interrupt-recursive re-entry (#15654)."""
@@ -1098,6 +1112,19 @@ class TestCachedAgentInactivityReset:
"(interrupt_depth>0) — the watchdog needs the accumulated idle time"
)
def test_interrupt_turn_preserves_desc(self):
"""interrupt_depth=1: desc preserved — it is semantically paired with ts."""
from gateway.run import GatewayRunner
agent = self._fake_agent(stale_seconds=1200.0)
GatewayRunner._init_cached_agent_for_turn(agent, interrupt_depth=1)
assert agent._last_activity_desc == "previous turn activity", (
"_last_activity_desc must not change on interrupt-recursive turns; "
"it describes the activity *at* _last_activity_ts"
)
def test_deep_interrupt_recursion_preserves_idle_clock(self):
"""interrupt_depth=MAX-1: clock still preserved at any non-zero depth."""
from gateway.run import GatewayRunner
@@ -1116,7 +1143,9 @@ class TestCachedAgentInactivityReset:
agent_fresh = self._fake_agent()
agent_interrupted = self._fake_agent()
GatewayRunner._init_cached_agent_for_turn(agent_fresh, interrupt_depth=0)
with patch("gateway.run.time") as mock_time:
mock_time.time.return_value = _FAKE_NOW
GatewayRunner._init_cached_agent_for_turn(agent_fresh, interrupt_depth=0)
GatewayRunner._init_cached_agent_for_turn(agent_interrupted, interrupt_depth=1)
assert agent_fresh._api_call_count == 0
@@ -1128,7 +1157,6 @@ class TestCachedAgentInactivityReset:
The idle time seen by the watchdog must reflect the full stuck
duration, not restart from zero on the recursive re-entry.
"""
import time as _t
from gateway.run import GatewayRunner
STUCK_FOR = 1750.0
@@ -1139,7 +1167,7 @@ class TestCachedAgentInactivityReset:
GatewayRunner._init_cached_agent_for_turn(agent, interrupt_depth=1)
# Watchdog sees time.time() - _last_activity_ts ≥ STUCK_FOR.
idle_secs = _t.time() - agent._last_activity_ts
idle_secs = _FAKE_NOW - agent._last_activity_ts
assert idle_secs >= STUCK_FOR - 1.0, (
f"Watchdog would see {idle_secs:.0f}s idle, expected ~{STUCK_FOR}s. "
"Inactivity timeout could not fire for a stuck interrupted turn."