plugins/google_meet/audio_bridge.py

"""Virtual audio bridge for feeding generated speech into Chrome's mic.

v2 module. Provisions a platform-specific virtual audio device so the
Meet bot's Chromium instance can be pointed at an input source we
control. The OpenAI Realtime client writes PCM bytes into this device;
Chrome reads them as if they were coming from a microphone.

Linux (primary): uses pactl (PulseAudio) to create a null-sink plus a
virtual source whose master is the null-sink's monitor. Callers set
PULSE_SOURCE=<source_name> in Chrome's env and pass the fake-mic flag.

macOS: requires BlackHole 2ch to be installed. This module only
verifies its presence and returns the device name; routing OS default
input is left to the user (or a future switchaudio-osx integration) to
avoid surprising the user's system audio state.

Windows: not supported in v2.
"""

from __future__ import annotations

import platform
import subprocess
from typing import Optional


# Exact device name of the BlackHole variant this module supports on macOS.
_BLACKHOLE_DEVICE = "BlackHole 2ch"


class AudioBridge:
    """Manages a virtual audio device for Chrome fake-mic input.

    Call ``setup()`` once before launching the Meet bot and
    ``teardown()`` when the session ends. ``teardown()`` is idempotent.

    On Linux the bridge loads two PulseAudio modules through ``pactl`` (a
    null-sink and a virtual source fed by that sink's monitor) and remembers
    their module IDs so ``teardown()`` can unload them. On macOS it only
    checks that the BlackHole 2ch device is listed by ``system_profiler``.
    """

    def __init__(self, name_prefix: str = "hermes_meet") -> None:
        """Create an unprovisioned bridge.

        ``name_prefix`` is used to build the Linux sink and source names
        (``<prefix>_sink`` and ``<prefix>_src``).
        """
        self._name_prefix = name_prefix
        # "linux" or "darwin" once setup() has succeeded, otherwise None.
        self._platform: Optional[str] = None
        # Name of the input device Chrome should read from.
        self._device_name: Optional[str] = None
        # Name of the device audio should be written into.
        self._write_target: Optional[str] = None
        # pactl module IDs in load order (null-sink first); Linux only.
        self._module_ids: list[int] = []
        self._torn_down = False

    # ── public properties ─────────────────────────────────────────────────

    @property
    def device_name(self) -> str:
        """Input device name; raises RuntimeError before a successful setup."""
        if not self._device_name:
            raise RuntimeError("AudioBridge not set up yet")
        return self._device_name

    @property
    def write_target(self) -> str:
        """Output device name; raises RuntimeError before a successful setup."""
        if not self._write_target:
            raise RuntimeError("AudioBridge not set up yet")
        return self._write_target

    # ── lifecycle ─────────────────────────────────────────────────────────

    def setup(self) -> dict:
        """Provision the virtual audio device.

        Returns a dict describing the device. Raises RuntimeError on
        unsupported platforms or when required system tools are missing.
        """
        system = platform.system()
        if system == "Linux":
            return self._setup_linux()
        if system == "Darwin":
            return self._setup_darwin()
        if system == "Windows":
            raise RuntimeError("windows not supported in v2")
        raise RuntimeError(f"unsupported platform: {system}")

    def teardown(self) -> None:
        """Release the virtual audio device. Idempotent.

        Unloading is best-effort: failures of individual ``pactl`` calls are
        swallowed so this method never raises.
        """
        if self._torn_down:
            return
        # Only Linux needs explicit unloading.
        if self._platform == "linux" and self._module_ids:
            # Unload in reverse order (virtual-source before null-sink).
            for mod_id in reversed(self._module_ids):
                self._unload_module(mod_id)
            self._module_ids = []
        self._torn_down = True

    # ── platform impls ────────────────────────────────────────────────────

    def _setup_linux(self) -> dict:
        """Load the null-sink and virtual-source modules via ``pactl``.

        Returns the device description dict. Raises RuntimeError when
        ``pactl`` is missing, a ``load-module`` call fails, or a module ID
        cannot be parsed. If anything goes wrong after the null-sink has been
        loaded, the sink is unloaded again before the error propagates.
        """
        sink_name = f"{self._name_prefix}_sink"
        src_name = f"{self._name_prefix}_src"

        try:
            sink_out = subprocess.run(
                [
                    "pactl",
                    "load-module",
                    "module-null-sink",
                    f"sink_name={sink_name}",
                    "sink_properties=device.description=HermesMeetSink",
                ],
                check=True,
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                "pactl not found — install PulseAudio/pipewire-pulse"
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise RuntimeError(
                f"pactl load-module null-sink failed: {exc.stderr or exc}"
            ) from exc

        sink_mod_id = self._parse_module_id(sink_out.stdout)

        # From here on the null-sink exists, so every failure path must
        # unload it — including an unparseable module ID for the source.
        try:
            src_out = subprocess.run(
                [
                    "pactl",
                    "load-module",
                    "module-virtual-source",
                    f"source_name={src_name}",
                    f"master={sink_name}.monitor",
                ],
                check=True,
                capture_output=True,
                text=True,
            )
            src_mod_id = self._parse_module_id(src_out.stdout)
        except subprocess.CalledProcessError as exc:
            # Roll back the null-sink we just created so we don't leak it.
            self._unload_module(sink_mod_id)
            raise RuntimeError(
                f"pactl load-module virtual-source failed: {exc.stderr or exc}"
            ) from exc
        except Exception:
            # Any other failure (bad stdout, pactl vanished): same rollback,
            # then let the original exception propagate unchanged. The source
            # module's ID is unknown here, so only the sink can be unloaded.
            self._unload_module(sink_mod_id)
            raise

        self._platform = "linux"
        self._device_name = src_name
        self._write_target = sink_name
        self._module_ids = [sink_mod_id, src_mod_id]
        self._torn_down = False

        return {
            "platform": "linux",
            "device_name": src_name,
            "sample_rate": 48000,
            "channels": 2,
            "module_ids": list(self._module_ids),
            "write_target": sink_name,
        }

    def _setup_darwin(self) -> dict:
        """Verify that BlackHole 2ch is installed and describe it.

        Runs ``system_profiler SPAudioDataType`` and looks for the exact
        device name this module returns, so another BlackHole variant alone
        does not pass the check. Raises RuntimeError when the command is
        missing, fails, or does not list the device.
        """
        try:
            out = subprocess.check_output(
                ["system_profiler", "SPAudioDataType"],
                text=True,
                stderr=subprocess.STDOUT,
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                "system_profiler not found (macOS-only command)"
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise RuntimeError(
                f"system_profiler failed: {exc.output}"
            ) from exc

        # Match the full device name: the dict below promises this exact
        # device, so a bare "BlackHole" substring is not enough.
        if _BLACKHOLE_DEVICE not in out:
            raise RuntimeError(
                "BlackHole virtual audio device not installed. "
                "Install via: brew install blackhole-2ch"
            )

        self._platform = "darwin"
        self._device_name = _BLACKHOLE_DEVICE
        self._write_target = _BLACKHOLE_DEVICE
        self._module_ids = []
        self._torn_down = False

        return {
            "platform": "darwin",
            "device_name": _BLACKHOLE_DEVICE,
            "sample_rate": 48000,
            "channels": 2,
            "module_ids": [],
            "write_target": _BLACKHOLE_DEVICE,
        }

    # ── helpers ──────────────────────────────────────────────────────────

    @staticmethod
    def _unload_module(mod_id: int) -> None:
        """Run ``pactl unload-module <mod_id>`` best-effort; never raises."""
        try:
            subprocess.run(
                ["pactl", "unload-module", str(mod_id)],
                check=False,
                capture_output=True,
            )
        except Exception:
            # Cleanup must not mask the caller's error or break teardown.
            pass

    @staticmethod
    def _parse_module_id(stdout: str) -> int:
        """pactl load-module prints the new module ID to stdout.

        Returns that ID as an int. Raises RuntimeError when stdout is empty
        or its first line does not end in an integer.
        """
        text = (stdout or "").strip()
        if not text:
            raise RuntimeError("pactl load-module returned empty stdout")
        # Take the last whitespace-separated token on the first non-empty line.
        first = text.splitlines()[0].strip()
        token = first.split()[-1]
        try:
            return int(token)
        except ValueError as exc:
            raise RuntimeError(
                f"could not parse pactl module id from: {stdout!r}"
            ) from exc

def chrome_fake_audio_flags(bridge_info: dict) -> list[str]:
    """Build the Chrome command-line flags for the fake audio input.

    Chrome is not told which device to use through a flag. On Linux the
    PulseAudio source is chosen with the ``PULSE_SOURCE`` env var, which
    the caller has to place in Chrome's environment before launching:

        env["PULSE_SOURCE"] = bridge_info["device_name"]

    On macOS the caller is responsible for making the returned BlackHole
    device the system default audio input (this module does not change it).

    Raises RuntimeError on Windows and on any other unrecognized platform.
    """
    host = platform.system()

    # Linux and macOS need the same single flag: the fake-ui switch skips
    # the media permission prompt so the bot can use the mic unattended.
    if host in ("Linux", "Darwin"):
        return ["--use-fake-ui-for-media-stream"]

    if host == "Windows":
        reason = "windows not supported in v2"
    else:
        reason = f"unsupported platform: {host}"
    raise RuntimeError(reason)
feat(plugins): google_meet \u2014 join, transcribe, speak, follow up (#16364) * feat(plugins): google_meet — bundled plugin for join+transcribe Meet calls v1 shipping transcribe-only. Spawns headless Chromium via Playwright, joins an explicit https://meet.google.com/ URL, enables live captions, and scrapes them into a transcript file the agent can read across turns. The agent then has the meeting content in context and can do followup work (send recap, file issues, schedule followups) with its regular tools. Surface: - Tools: meet_join, meet_status, meet_transcript, meet_leave, meet_say (meet_say is a v1 stub — returns not-implemented; v2 will wire realtime duplex audio via OpenAI Realtime / Gemini Live + BlackHole / PulseAudio null-sink.) - CLI: hermes meet setup \| auth \| join \| status \| transcript \| stop - Lifecycle: on_session_end auto-leaves any still-running bot. Safety: - URL regex rejects anything that isn't https://meet.google.com/... - No calendar scanning, no auto-dial, no auto-consent announcement. - Single active meeting per install; a second meet_join leaves the first. - Platform-gated to Linux + macOS (Windows audio routing for v2 untested). - Opt-in: standalone plugin, user must add 'google_meet' to plugins.enabled in config.yaml. Zero core changes. Plugin uses existing register_tool / register_cli_command / register_hook surfaces. 21 new unit tests cover the URL safety gate, transcript dedup + status round-trip, process-manager refusals/start/stop paths, tool-handler JSON shape under each branch, session-end cleanup, and platform-gated register(). * feat(plugins/google_meet): v2 realtime audio + v3 remote node host v2 \u2014 agent speaks in-meeting audio_bridge.py: PulseAudio null-sink (Linux) + BlackHole probe (macOS). On Linux we load pactl module-null-sink + module-virtual-source, track module ids for teardown; Chrome gets PULSE_SOURCE=<virt src> env so its fake mic reads what we write to the sink. 
macOS just probes BlackHole 2ch and returns its device name — the plugin refuses to switch the user's default audio input (that would surprise them). realtime/openai_client.py: sync WebSocket client for the OpenAI Realtime API. RealtimeSession.speak(text) sends conversation.item.create + response.create, accumulates response.audio.delta PCM bytes, appends them to a file. RealtimeSpeaker runs a JSONL-queue loop consuming meet_say calls. 'websockets' is an optional dep imported lazily. meet_bot.py: when HERMES_MEET_MODE=realtime, provisions AudioBridge, starts RealtimeSession + speaker thread, spawns paplay to pump PCM into the null-sink, then cleans everything up on SIGTERM. If any realtime setup step fails, falls back cleanly to transcribe mode with an error flagged in status.json. process_manager.enqueue_say(): writes a JSONL line to say_queue.jsonl; refuses when no active meeting or active meeting is transcribe-only. tools.meet_say: real implementation; requires active mode='realtime'. meet_join: adds mode='transcribe'|'realtime' param. v3 — remote node host node/protocol.py: JSON envelope (type, id, token, payload) + validate. node/registry.py: $HERMES_HOME/workspace/meetings/nodes.json, with resolve() auto-selecting the sole registered node when name is None. node/server.py: NodeServer — websockets.serve, bearer-token auth, dispatches start_bot/stop/status/transcript/say/ping onto the local process_manager. Token auto-generated + persisted on first run. node/client.py: NodeClient — short-lived sync WS per RPC, raises RuntimeError on error envelopes, clean API matching the server. node/cli.py: 'hermes meet node {run,list,approve,remove,status,ping}' subtree; wired into the main meet CLI by cli.py so 'hermes meet node' Just Works. tools.py: every meet_* tool accepts node='<name>'|'auto'; when set, routes through NodeClient to the remote bot instead of running locally. Unknown node → clear 'no registered meet node matches ...' error. 
cli.py: 'hermes meet join --node my-mac --mode realtime' and 'hermes meet say "..." --node my-mac' route to the node; 'hermes meet node approve <name> <url> <token>' registers one. Tests 21 v1 tests updated (meet_say is no longer a stub; active-record now carries mode). 20 new audio_bridge + realtime tests. 42 new node tests (protocol/registry/server/client/cli). 17 new v1/v2/v3 integration tests at the plugin level covering enqueue_say edge cases, env var passthrough, mode validation, node routing (known/unknown/auto/ambiguous), and argparse wiring for `hermes meet say` + `hermes meet node` + --mode/--node flags. Total: 100 plugin tests + 58 plugin-system tests = 158 passing. E2E verified on Linux with fresh HERMES_HOME: plugin loads, 5 tools register, on_session_end hook wires, 'hermes meet' CLI tree wires including the node subtree, NodeRegistry round-trips, meet_join routes correctly to NodeClient under node='my-mac' with mode='realtime', enqueue_say accepts realtime/rejects transcribe, argparse parses every new flag cleanly. Zero changes to core. All new code lives under plugins/google_meet/. * feat(plugins/google_meet): auto-install, admission detect, mac PCM pump, barge-in, richer status Ready-for-live-test follow-up on PR #16364. Five additions that matter for the first live run on a real Meet, in priority order: 1. hermes meet install [--realtime] [--yes] pip install playwright websockets + python -m playwright install chromium --realtime: installs platform audio deps (pulseaudio-utils on Linux via sudo apt, blackhole-2ch + ffmpeg on macOS via brew). Prompts before sudo/brew unless --yes. Refuses on Windows. Refuses to auto-flip the macOS default input — user still selects BlackHole in System Settings (deliberate; surprise audio rerouting is worse than a manual step). 2. Admission detection _detect_admission(page): Leave-button visible OR caption region attached OR participants list present → we're in-call. 
_detect_denied(page): 'You can\'t join this video call' / 'You were removed' / 'No one responded to your request' → bail out. HERMES_MEET_LOBBY_TIMEOUT (default 300s) caps how long we sit in the lobby before giving up. in_call stays False until admitted. Status surfaces leaveReason: duration_expired \| lobby_timeout \| denied \| page_closed. 3. macOS PCM pump ffmpeg reads speaker.pcm (24kHz s16le mono) and writes to the BlackHole AVFoundation output via -f audiotoolbox -audio_device_index <N>. _mac_audio_device_index() probes ffmpeg -f avfoundation -list_devices true to resolve 'BlackHole 2ch' → numeric index. Falls back to index 0 on probe failure. Linux paplay pump unchanged. 4. Richer status dict _BotState now tracks realtime, realtimeReady, realtimeDevice, audioBytesOut, lastAudioOutAt, lastBargeInAt, joinAttemptedAt, leaveReason. RealtimeSession.audio_bytes_out / last_audio_out_at counters fold into the status file once a second so meet_status() can show the agent's voice activity in near-real-time. 5. Barge-in RealtimeSession.cancel_response() sends type='response.cancel' over the same WS (lock-guarded so it's safe to call from the caption thread while speak() is reading frames). Handles response.cancelled as a terminal frame type. _looks_like_human_speaker() gates triggers so the bot's own name, 'You', 'Unknown', and blanks don't self-cancel. Called from the caption drain loop: when a new caption arrives attributed to a real participant while rt.session exists, we fire cancel_response() and stamp lastBargeInAt. Tests: 20 new unit tests across _BotState telemetry, barge-in gating, admission/denied probe error handling, cancel_response with and without a connected WS, and `hermes meet install` CLI wiring (flag parsing + end-to-end subprocess.run verification + Linux-already-installed fast path). Total 171 passing across all google_meet test files + the plugin-system regression suite. 
E2E verified on Linux: plugin loads, all 5 tools register, `hermes meet install --realtime --yes` parses, fresh-bot status.json has every new telemetry key, cancel_response on a disconnected session returns False without raising, barge-in helper gates the bot's own name correctly. Still out of scope (for a future PR, not blocking live test): mic → Realtime duplex (the agent listening to meeting audio via WebRTC), node-host TLS/pairing UX, Windows audio, Meet create+Twilio. Docs updated: SKILL.md now lists the installer subcommand, lobby timeout, barge-in caveat, and the full status-dict reference table. README.md quick-start uses hermes meet install. 2026-04-27 06:22:25 -07:00			`"""Virtual audio bridge for feeding generated speech into Chrome's mic.`

			`v2 module. Provisions a platform-specific virtual audio device so the`
			`Meet bot's Chromium instance can be pointed at an input source we`
			`control. The OpenAI Realtime client writes PCM bytes into this device;`
			`Chrome reads them as if they were coming from a microphone.`

			`Linux (primary): uses pactl (PulseAudio) to create a null-sink plus a`
			`virtual source whose master is the null-sink's monitor. Callers set`
			`PULSE_SOURCE=<source_name> in Chrome's env and pass the fake-mic flag.`

			`macOS: requires BlackHole 2ch to be installed. This module only`
			`verifies its presence and returns the device name; routing OS default`
			`input is left to the user (or a future switchaudio-osx integration) to`
			`avoid surprising the user's system audio state.`

			`Windows: not supported in v2.`
			`"""`

			`from __future__ import annotations`

			`import platform`
			`import subprocess`
			`from typing import Optional`


			`_BLACKHOLE_DEVICE = "BlackHole 2ch"`


			`class AudioBridge:`
			`"""Manages a virtual audio device for Chrome fake-mic input.`

			Call ``setup()`` once before launching the Meet bot and
			``teardown()`` when the session ends. ``teardown()`` is idempotent.
			`"""`

			`def __init__(self, name_prefix: str = "hermes_meet") -> None:`
			`self._name_prefix = name_prefix`
			`self._platform: Optional[str] = None`
			`self._device_name: Optional[str] = None`
			`self._write_target: Optional[str] = None`
			`self._module_ids: list[int] = []`
			`self._torn_down = False`

			`# ── public properties ─────────────────────────────────────────────────`

			`@property`
			`def device_name(self) -> str:`
			`if not self._device_name:`
			`raise RuntimeError("AudioBridge not set up yet")`
			`return self._device_name`

			`@property`
			`def write_target(self) -> str:`
			`if not self._write_target:`
			`raise RuntimeError("AudioBridge not set up yet")`
			`return self._write_target`

			`# ── lifecycle ─────────────────────────────────────────────────────────`

			`def setup(self) -> dict:`
			`"""Provision the virtual audio device.`

			`Returns a dict describing the device. Raises RuntimeError on`
			`unsupported platforms or when required system tools are missing.`
			`"""`
			`system = platform.system()`
			`if system == "Linux":`
			`return self._setup_linux()`
			`if system == "Darwin":`
			`return self._setup_darwin()`
			`if system == "Windows":`
			`raise RuntimeError("windows not supported in v2")`
			`raise RuntimeError(f"unsupported platform: {system}")`

			`def teardown(self) -> None:`
			`"""Release the virtual audio device. Idempotent."""`
			`if self._torn_down:`
			`return`
			`# Only Linux needs explicit unloading.`
			`if self._platform == "linux" and self._module_ids:`
			`# Unload in reverse order (virtual-source before null-sink).`
			`for mod_id in reversed(self._module_ids):`
			`try:`
			`subprocess.run(`
			`["pactl", "unload-module", str(mod_id)],`
			`check=False,`
			`capture_output=True,`
			`)`
			`except Exception:`
			`# Best-effort teardown — never raise from here.`
			`pass`
			`self._module_ids = []`
			`self._torn_down = True`

			`# ── platform impls ────────────────────────────────────────────────────`

			`def _setup_linux(self) -> dict:`
			`sink_name = f"{self._name_prefix}_sink"`
			`src_name = f"{self._name_prefix}_src"`

			`try:`
			`sink_out = subprocess.run(`
			`[`
			`"pactl",`
			`"load-module",`
			`"module-null-sink",`
			`f"sink_name={sink_name}",`
			`f"sink_properties=device.description=HermesMeetSink",`
			`],`
			`check=True,`
			`capture_output=True,`
			`text=True,`
			`)`
			`except FileNotFoundError as exc:`
			`raise RuntimeError(`
			`"pactl not found — install PulseAudio/pipewire-pulse"`
			`) from exc`
			`except subprocess.CalledProcessError as exc:`
			`raise RuntimeError(`
			`f"pactl load-module null-sink failed: {exc.stderr or exc}"`
			`) from exc`

			`sink_mod_id = self._parse_module_id(sink_out.stdout)`

			`try:`
			`src_out = subprocess.run(`
			`[`
			`"pactl",`
			`"load-module",`
			`"module-virtual-source",`
			`f"source_name={src_name}",`
			`f"master={sink_name}.monitor",`
			`],`
			`check=True,`
			`capture_output=True,`
			`text=True,`
			`)`
			`except subprocess.CalledProcessError as exc:`
			`# Roll back the null-sink we just created so we don't leak it.`
			`subprocess.run(`
			`["pactl", "unload-module", str(sink_mod_id)],`
			`check=False,`
			`capture_output=True,`
			`)`
			`raise RuntimeError(`
			`f"pactl load-module virtual-source failed: {exc.stderr or exc}"`
			`) from exc`

			`src_mod_id = self._parse_module_id(src_out.stdout)`

			`self._platform = "linux"`
			`self._device_name = src_name`
			`self._write_target = sink_name`
			`self._module_ids = [sink_mod_id, src_mod_id]`
			`self._torn_down = False`

			`return {`
			`"platform": "linux",`
			`"device_name": src_name,`
			`"sample_rate": 48000,`
			`"channels": 2,`
			`"module_ids": list(self._module_ids),`
			`"write_target": sink_name,`
			`}`

			`def _setup_darwin(self) -> dict:`
			`try:`
			`out = subprocess.check_output(`
			`["system_profiler", "SPAudioDataType"],`
			`text=True,`
			`stderr=subprocess.STDOUT,`
			`)`
			`except FileNotFoundError as exc:`
			`raise RuntimeError(`
			`"system_profiler not found (macOS-only command)"`
			`) from exc`
			`except subprocess.CalledProcessError as exc:`
			`raise RuntimeError(`
			`f"system_profiler failed: {exc.output}"`
			`) from exc`

			`if "BlackHole" not in out:`
			`raise RuntimeError(`
			`"BlackHole virtual audio device not installed. "`
			`"Install via: brew install blackhole-2ch"`
			`)`

			`self._platform = "darwin"`
			`self._device_name = _BLACKHOLE_DEVICE`
			`self._write_target = _BLACKHOLE_DEVICE`
			`self._module_ids = []`
			`self._torn_down = False`

			`return {`
			`"platform": "darwin",`
			`"device_name": _BLACKHOLE_DEVICE,`
			`"sample_rate": 48000,`
			`"channels": 2,`
			`"module_ids": [],`
			`"write_target": _BLACKHOLE_DEVICE,`
			`}`

			`# ── helpers ──────────────────────────────────────────────────────────`

			`@staticmethod`
			`def _parse_module_id(stdout: str) -> int:`
			`"""pactl load-module prints the new module ID to stdout."""`
			`text = (stdout or "").strip()`
			`if not text:`
			`raise RuntimeError("pactl load-module returned empty stdout")`
			`# Take the last whitespace-separated token on the first non-empty line.`
			`first = text.splitlines()[0].strip()`
			`token = first.split()[-1]`
			`try:`
			`return int(token)`
			`except ValueError as exc:`
			`raise RuntimeError(`
			`f"could not parse pactl module id from: {stdout!r}"`
			`) from exc`


			`def chrome_fake_audio_flags(bridge_info: dict) -> list[str]:`
			`"""Return Chrome flags for using the fake audio input.`

			The PulseAudio source is selected via the ``PULSE_SOURCE`` env var,
			`which callers must set in Chrome's environment before launch:`

			`env["PULSE_SOURCE"] = bridge_info["device_name"]`

			`On macOS the caller must ensure the system default audio input is`
			`set to the returned BlackHole device (we do not flip that switch).`
			`"""`
			`system = platform.system()`
			`if system == "Linux":`
			`# Chromium on Linux picks up the PulseAudio source selected via`
			`# PULSE_SOURCE env var; the fake-ui flag skips the permission`
			`# prompt so the bot can pick "use my mic" without user input.`
			`return ["--use-fake-ui-for-media-stream"]`
			`if system == "Darwin":`
			`return ["--use-fake-ui-for-media-stream"]`
			`if system == "Windows":`
			`raise RuntimeError("windows not supported in v2")`
			`raise RuntimeError(f"unsupported platform: {system}")`