diff --git a/run_agent.py b/run_agent.py index 2fda4d54e6..325df9beb1 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4395,6 +4395,41 @@ class AIAgent: self._client_log_context(), ) return client + # Inject TCP keepalives so the kernel detects dead provider connections + # instead of letting them sit silently in CLOSE-WAIT (#10324). Without + # this, a peer that drops mid-stream leaves the socket in a state where + # epoll_wait never fires, ``httpx`` read timeout may not trigger, and + # the agent hangs until manually killed. Probes after 30s idle, retry + # every 10s, give up after 3 → dead peer detected within ~60s. + # + # Safety against #10933: the ``client_kwargs = dict(client_kwargs)`` + # above means this injection only lands in the local per-call copy, + # never back into ``self._client_kwargs``. Each ``_create_openai_client`` + # invocation therefore gets its OWN fresh ``httpx.Client`` whose + # lifetime is tied to the OpenAI client it is passed to. When the + # OpenAI client is closed (rebuild, teardown, credential rotation), + # the paired ``httpx.Client`` closes with it, and the next call + # constructs a fresh one — no stale closed transport can be reused. + # Tests in ``tests/run_agent/test_create_openai_client_reuse.py`` and + # ``tests/run_agent/test_sequential_chats_live.py`` pin this invariant. + if "http_client" not in client_kwargs: + try: + import httpx as _httpx + import socket as _socket + _sock_opts = [(_socket.SOL_SOCKET, _socket.SO_KEEPALIVE, 1)] + if hasattr(_socket, "TCP_KEEPIDLE"): + # Linux + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPIDLE, 30)) + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPINTVL, 10)) + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPCNT, 3)) + elif hasattr(_socket, "TCP_KEEPALIVE"): + # macOS (uses TCP_KEEPALIVE instead of TCP_KEEPIDLE) + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPALIVE, 30)) + client_kwargs["http_client"] = _httpx.Client( + transport=_httpx.HTTPTransport(socket_options=_sock_opts), + ) + except Exception: + pass # Fall through to default transport if socket opts fail client = OpenAI(**client_kwargs) logger.info( "OpenAI client created (%s, shared=%s) %s",