mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-05 10:17:17 +08:00
Two mitigations for the CLOSE_WAIT accumulation reported against QQ Bot + Feishu on macOS behind Cloudflare Warp. 1. Shared httpx.Limits helper (gateway/platforms/_http_client_limits.py). Every long-lived platform adapter now constructs httpx.AsyncClient with max_keepalive_connections=10 and keepalive_expiry=2.0, vs httpx's default of unbounded keepalive pool and 5.0s expiry. On macOS/Warp the default 5s window let idle keepalive sockets sit in CLOSE_WAIT long enough for seven persistent adapters (QQ Bot, WeCom, DingTalk, Signal, BlueBubbles, WeCom-callback, plus the transient Feishu helper) to compound to the 256-fd ulimit. Tunable via HERMES_GATEWAY_HTTPX_KEEPALIVE_EXPIRY and HERMES_GATEWAY_HTTPX_MAX_KEEPALIVE env vars. 2. whatsapp.send_typing aiohttp leak. The call was 'await self._http_session.post(...)' with no 'async with' and no variable capture — the ClientResponse went out of scope unclosed, holding its TCP socket in CLOSE_WAIT until GC. Fixed by wrapping in 'async with'. This was the only bare-await aiohttp leak in the gateway/tools/plugins tree per audit; all other aiohttp sites use the context-manager pattern correctly. The underlying reporter also saw Feishu SDK (lark-oapi) connections in CLOSE_WAIT — those are inside the SDK and out of our direct control, but tightening httpx keepalive across adapters reduces the aggregate pool pressure regardless of which individual adapter leaks.
115 lines
5.0 KiB
Python
115 lines
5.0 KiB
Python
"""Tests for the shared httpx.Limits helper that all long-lived platform
|
|
adapters use to tighten their keep-alive pool.
|
|
|
|
Context: #18451 — on macOS behind Cloudflare Warp, httpx's default
|
|
keepalive_expiry=5s let idle CLOSE_WAIT sockets accumulate across
|
|
multiple long-lived gateway adapters (QQ Bot, Feishu, WeCom, DingTalk,
|
|
Signal, BlueBubbles, WeCom-callback) until the process hit the default
|
|
256 fd limit. These tests just verify the helper returns sensibly
|
|
tuned limits and respects env-var overrides; the actual fd-pressure
|
|
behaviour is only observable at runtime under load.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def _clear_env(monkeypatch):
|
|
monkeypatch.delenv("HERMES_GATEWAY_HTTPX_KEEPALIVE_EXPIRY", raising=False)
|
|
monkeypatch.delenv("HERMES_GATEWAY_HTTPX_MAX_KEEPALIVE", raising=False)
|
|
|
|
|
|
def test_returns_none_when_httpx_unavailable(monkeypatch):
|
|
"""If httpx can't be imported, the helper returns None so callers
|
|
fall back to httpx's built-in Limits default without raising."""
|
|
import gateway.platforms._http_client_limits as mod
|
|
monkeypatch.setattr(mod, "httpx", None)
|
|
assert mod.platform_httpx_limits() is None
|
|
|
|
|
|
def test_default_limits_tighten_keepalive_below_httpx_default():
|
|
import httpx
|
|
from gateway.platforms._http_client_limits import platform_httpx_limits
|
|
limits = platform_httpx_limits()
|
|
assert isinstance(limits, httpx.Limits)
|
|
# httpx default keepalive_expiry is 5.0 — ours must be shorter so
|
|
# CLOSE_WAIT sockets drain promptly behind proxies like Warp.
|
|
assert limits.keepalive_expiry is not None
|
|
assert limits.keepalive_expiry < 5.0
|
|
# max_keepalive_connections must be positive and reasonable for a
|
|
# single adapter (platform APIs rarely parallelise beyond ~10).
|
|
assert limits.max_keepalive_connections is not None
|
|
assert 1 <= limits.max_keepalive_connections <= 50
|
|
|
|
|
|
def test_env_override_keepalive_expiry(monkeypatch):
|
|
monkeypatch.setenv("HERMES_GATEWAY_HTTPX_KEEPALIVE_EXPIRY", "7.5")
|
|
from gateway.platforms._http_client_limits import platform_httpx_limits
|
|
limits = platform_httpx_limits()
|
|
assert limits.keepalive_expiry == 7.5
|
|
|
|
|
|
def test_env_override_max_keepalive(monkeypatch):
|
|
monkeypatch.setenv("HERMES_GATEWAY_HTTPX_MAX_KEEPALIVE", "25")
|
|
from gateway.platforms._http_client_limits import platform_httpx_limits
|
|
limits = platform_httpx_limits()
|
|
assert limits.max_keepalive_connections == 25
|
|
|
|
|
|
def test_env_override_rejects_garbage(monkeypatch):
|
|
"""Malformed env values fall back to defaults rather than raising."""
|
|
monkeypatch.setenv("HERMES_GATEWAY_HTTPX_KEEPALIVE_EXPIRY", "not-a-number")
|
|
monkeypatch.setenv("HERMES_GATEWAY_HTTPX_MAX_KEEPALIVE", "-3")
|
|
from gateway.platforms._http_client_limits import platform_httpx_limits
|
|
limits = platform_httpx_limits()
|
|
# Non-positive / non-numeric → fell back to defaults (not the override values)
|
|
assert limits.keepalive_expiry is not None and limits.keepalive_expiry > 0
|
|
assert limits.max_keepalive_connections is not None
|
|
assert limits.max_keepalive_connections > 0
|
|
|
|
|
|
def test_helper_is_importable_from_every_platform_that_uses_it():
|
|
"""Every persistent-httpx-client platform adapter imports this helper.
|
|
If any of those modules fails to import, this test surfaces it before
|
|
the regression shows up as a runtime adapter-startup crash."""
|
|
# Just importing exercises the helper's import path for each adapter.
|
|
import gateway.platforms.qqbot.adapter # noqa: F401
|
|
import gateway.platforms.wecom # noqa: F401
|
|
import gateway.platforms.dingtalk # noqa: F401
|
|
import gateway.platforms.signal # noqa: F401
|
|
import gateway.platforms.bluebubbles # noqa: F401
|
|
import gateway.platforms.wecom_callback # noqa: F401
|
|
|
|
|
|
class TestWhatsappTypingLeakFix:
|
|
"""#18451 — whatsapp.send_typing previously used a bare
|
|
`await self._http_session.post(...)` which leaked the aiohttp
|
|
response object until GC, holding its TCP socket in CLOSE_WAIT.
|
|
Must now wrap the call in `async with` so the response is
|
|
released immediately when the call returns.
|
|
|
|
We verify by inspecting the source text rather than exercising
|
|
the coroutine — the test suite would otherwise need a live
|
|
aiohttp server, and the contract we care about is structural.
|
|
"""
|
|
|
|
def test_bare_await_removed(self):
|
|
import inspect
|
|
import gateway.platforms.whatsapp as mod
|
|
|
|
src = inspect.getsource(mod.WhatsAppAdapter.send_typing)
|
|
# The fix must be structural: the post() call is inside an
|
|
# `async with`, not a bare `await`.
|
|
assert "async with self._http_session.post(" in src, (
|
|
"send_typing must wrap self._http_session.post(...) in "
|
|
"`async with` to release the aiohttp response socket "
|
|
"(#18451). Otherwise the response sits in CLOSE_WAIT "
|
|
"until GC."
|
|
)
|
|
# The old bare-await form must be gone.
|
|
assert "await self._http_session.post(" not in src
|