Mirror of https://github.com/NousResearch/hermes-agent.git, synced 2026-04-28 06:51:16 +08:00
fix: stale agent timeout, uv venv detection, empty response after tools, compression model fallback (#9051, #8620, #9400) (#10093)
Four independent fixes: 1. Reset activity timestamp on cached agent reuse (#9051) When the gateway reuses a cached AIAgent for a new turn, the _last_activity_ts from the previous turn (possibly hours ago) carried over. The inactivity timeout handler immediately saw the agent as idle for hours and killed it. Fix: reset _last_activity_ts, _last_activity_desc, and _api_call_count when retrieving an agent from the cache. 2. Detect uv-managed virtual environments (#8620 sub-issue 1) The systemd unit generator fell back to sys.executable (uv's standalone Python) when running under 'uv run', because sys.prefix == sys.base_prefix. The generated ExecStart pointed to a Python binary without site-packages. Fix: check VIRTUAL_ENV env var before falling back to sys.executable. uv sets VIRTUAL_ENV even when sys.prefix doesn't reflect the venv. 3. Nudge model to continue after empty post-tool response (#9400) Weaker models sometimes return empty after tool calls. The agent silently abandoned the remaining work. Fix: append assistant('(empty)') + user nudge message and retry once. Resets after each successful tool round. 4. Compression model fallback on permanent errors (#8620 sub-issue 4) When the default summary model (gemini-3-flash) returns 503 'model_not_found' on custom proxies, the compressor entered a 600s cooldown, leaving context growing unbounded. Fix: detect permanent model-not-found errors (503, 404, 'model_not_found', 'no available channel') and fall back to using the main model for compression instead of entering cooldown. One-time fallback with immediate retry. Test plan: 40 compressor tests + 97 gateway/CLI tests + 9 venv tests pass
This commit is contained in:
@@ -693,6 +693,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||||||
# Store for iterative updates on next compaction
|
# Store for iterative updates on next compaction
|
||||||
self._previous_summary = summary
|
self._previous_summary = summary
|
||||||
self._summary_failure_cooldown_until = 0.0
|
self._summary_failure_cooldown_until = 0.0
|
||||||
|
self._summary_model_fallen_back = False
|
||||||
return self._with_summary_prefix(summary)
|
return self._with_summary_prefix(summary)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
# No provider configured — long cooldown, unlikely to self-resolve
|
# No provider configured — long cooldown, unlikely to self-resolve
|
||||||
@@ -703,6 +704,34 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
|
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# If the summary model is different from the main model and the
|
||||||
|
# error looks permanent (model not found, 503, 404), fall back to
|
||||||
|
# using the main model instead of entering cooldown that leaves
|
||||||
|
# context growing unbounded. (#8620 sub-issue 4)
|
||||||
|
_status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
|
||||||
|
_err_str = str(e).lower()
|
||||||
|
_is_model_not_found = (
|
||||||
|
_status in (404, 503)
|
||||||
|
or "model_not_found" in _err_str
|
||||||
|
or "does not exist" in _err_str
|
||||||
|
or "no available channel" in _err_str
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
_is_model_not_found
|
||||||
|
and self.summary_model
|
||||||
|
and self.summary_model != self.model
|
||||||
|
and not getattr(self, "_summary_model_fallen_back", False)
|
||||||
|
):
|
||||||
|
self._summary_model_fallen_back = True
|
||||||
|
logging.warning(
|
||||||
|
"Summary model '%s' not available (%s). "
|
||||||
|
"Falling back to main model '%s' for compression.",
|
||||||
|
self.summary_model, e, self.model,
|
||||||
|
)
|
||||||
|
self.summary_model = "" # empty = use main model
|
||||||
|
self._summary_failure_cooldown_until = 0.0 # no cooldown
|
||||||
|
return self._generate_summary(messages, summary_budget) # retry immediately
|
||||||
|
|
||||||
# Transient errors (timeout, rate limit, network) — shorter cooldown
|
# Transient errors (timeout, rate limit, network) — shorter cooldown
|
||||||
_transient_cooldown = 60
|
_transient_cooldown = 60
|
||||||
self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
|
self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
|
||||||
|
|||||||
Reference in New Issue · Block a user