diff --git a/plugins/google_meet/README.md b/plugins/google_meet/README.md new file mode 100644 index 0000000000..53049a5846 --- /dev/null +++ b/plugins/google_meet/README.md @@ -0,0 +1,131 @@ +# google_meet plugin + +Let the hermes agent join a Google Meet call, transcribe it, optionally speak +in it, and do the followup work afterwards. + +## What ships + +| Version | What | Status | +|---|---|---| +| v1 | Transcribe-only: Playwright joins Meet, scrapes captions to transcript file | ✓ ships by default | +| v2 | Realtime duplex audio: bot speaks in-call via OpenAI Realtime + BlackHole/PulseAudio null-sink | ✓ opt in with `mode='realtime'` | +| v3 | Remote node host: run the bot on a different machine than the gateway | ✓ opt in with `node=''` | + +## Architecture + +``` +┌─ gateway (Linux box, where hermes runs) ────────────────────────────┐ +│ │ +│ agent → meet_join(url, mode='realtime', node='my-mac') │ +│ │ │ +│ └─ NodeClient ─── ws ────┐ │ +│ │ │ +└──────────────────────────────────┼───────────────────────────────────┘ + │ wss (token auth) + ▼ +┌─ node host (user's Mac, signed-in Chrome lives here) ───────────────┐ +│ │ +│ NodeServer (from `hermes meet node run`) │ +│ │ │ +│ ├─ start_bot → process_manager.start() → spawns meet_bot │ +│ │ │ +│ └─ meet_bot (Playwright) │ +│ ├─ Chromium → meet.google.com │ +│ ├─ caption scraper → transcript.txt │ +│ └─ (realtime mode only) RealtimeSpeaker thread │ +│ ↓ │ +│ OpenAI Realtime WS → speaker.pcm │ +│ ↓ │ +│ paplay → null-sink ← Chrome fake mic │ +│ │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +Without v3: the whole right column runs on the gateway machine. +Without v2: the "realtime" path is skipped; transcribe runs alone. 
+ +## Files + +| Path | Purpose | +|---|---| +| `plugin.yaml` | manifest | +| `__init__.py` | `register(ctx)` — registers 5 tools + `on_session_end` hook + `hermes meet` CLI | +| `meet_bot.py` | Playwright bot subprocess (standalone, `python -m plugins.google_meet.meet_bot`) | +| `process_manager.py` | local bot lifecycle + `enqueue_say` | +| `tools.py` | agent-facing tools + node-routing helper | +| `cli.py` | `hermes meet setup / auth / join / status / transcript / say / stop / node ...` | +| `audio_bridge.py` | v2: PulseAudio null-sink (Linux) + BlackHole probe (macOS) | +| `realtime/openai_client.py` | v2: `RealtimeSession` + `RealtimeSpeaker` (file-queue → OpenAI Realtime WS → PCM) | +| `node/protocol.py` | v3: message envelope + validation | +| `node/registry.py` | v3: `$HERMES_HOME/workspace/meetings/nodes.json` | +| `node/server.py` | v3: `NodeServer` (runs on host machine) | +| `node/client.py` | v3: `NodeClient` (used by tool handlers + CLI on gateway) | +| `node/cli.py` | v3: `hermes meet node {run,list,approve,remove,status,ping}` | +| `SKILL.md` | agent usage guide | + +## Local quick start + +```bash +hermes plugins enable google_meet +hermes meet install # pip + Chromium +hermes meet setup # preflight +hermes meet auth # optional +hermes meet join https://meet.google.com/abc-defg-hij # transcribe +``` + +## Realtime mode + +Linux (preferred, most automated): +```bash +hermes meet install --realtime # installs pulseaudio-utils +echo 'OPENAI_API_KEY=sk-...' >> ~/.hermes/.env +hermes meet join https://meet.google.com/abc-defg-hij --mode realtime +# then from the agent or CLI: +hermes meet say "Good morning everyone, I'm the note-taker bot." +``` + +macOS: +```bash +hermes meet install --realtime # runs: brew install blackhole-2ch ffmpeg +# then — manually! — open System Settings → Sound → Input → BlackHole 2ch +echo 'OPENAI_API_KEY=sk-...' 
>> ~/.hermes/.env +hermes meet join https://meet.google.com/abc-defg-hij --mode realtime +``` + +On macOS, hermes will **not** switch your system audio input automatically — the +user has to do it. This is deliberate: switching default input on a whim would +be a surprising side effect. + +## Remote node host + +On the node machine (e.g. user's Mac with a signed-in Chrome): +```bash +pip install playwright websockets +python -m playwright install chromium +hermes plugins enable google_meet +hermes meet node run --display-name my-mac --host 0.0.0.0 --port 18789 +# prints the bearer token on first run; copy it +``` + +On the gateway: +```bash +hermes meet node approve my-mac ws://:18789 +hermes meet node ping my-mac +# now any meet_* tool call accepts node='my-mac' (or 'auto') +``` + +## Safety + +- URL gate: only `https://meet.google.com/abc-defg-hij`, `/new`, `/lookup/`. +- No calendar scanning, no auto-dial, no auto-consent announcement. +- Node server uses bearer-token auth; no key exchange, no TLS termination + built in — run it on a LAN or behind a reverse proxy you trust. +- One active meeting per (gateway, node) pair. A second `meet_join` leaves the first. +- `meet_say` refuses unless the active meeting was started with `mode='realtime'`. + +## Out of scope + +- **Calendar scanning** — deliberately not implemented. Join URLs must be explicit. +- **Multi-tenant node sharing** — a node serves one gateway at a time. +- **Windows** — audio bridging isn't tested; `register()` no-ops on Windows. +- **System audio input switching on macOS** — user responsibility, not the bot's. diff --git a/plugins/google_meet/SKILL.md b/plugins/google_meet/SKILL.md new file mode 100644 index 0000000000..4f009f9d1e --- /dev/null +++ b/plugins/google_meet/SKILL.md @@ -0,0 +1,148 @@ +--- +name: google_meet +description: Join a Google Meet call, transcribe live captions, optionally speak in realtime, and do the followup work afterwards. 
Use when the user asks the agent to sit in on a meeting, take notes, summarize, respond in-call, or extract action items from it. +version: 0.2.0 +platforms: + - linux + - macos +metadata: + hermes: + tags: [meetings, google-meet, transcription, realtime-voice] +--- + +# google_meet + +## When to use + +The user says any of: + +- "join my Meet at <url>" +- "take notes on this meeting" +- "summarize the meeting and send followups" +- "sit in on my standup" +- "be a bot in this call and speak up when X" + +## Two modes + +| Mode | What the bot does | +|---|---| +| `transcribe` (default) | Joins, enables captions, scrapes a transcript. Listen-only. | +| `realtime` | Same as transcribe PLUS speaks into the meeting via OpenAI Realtime. The agent calls `meet_say(text)` and the bot's voice comes out of the call. | + +Pick `realtime` only when the user actually wants the agent to speak. It costs real money (OpenAI Realtime is pay-per-audio-minute) and requires a virtual audio device set up on the machine running the bot. + +## Two locations + +| Location | When | +|---|---| +| Local (default) | Gateway machine runs the Playwright bot directly. | +| Remote node (`node="<name>"`) | Bot runs on a different machine that has a signed-in Chrome and (for realtime) a configured audio bridge. Useful when the gateway runs on a headless Linux box but the user's real signed-in Chrome lives on their Mac. | + +## Prerequisites the user must handle once + +Easiest path — run the built-in installer: + +```bash +hermes plugins enable google_meet +hermes meet install # pip deps + Chromium (transcribe only) +hermes meet install --realtime # + pulseaudio-utils / brew blackhole+ffmpeg +hermes meet auth # optional; skips guest-lobby wait +hermes meet setup # preflight checks +``` + +`hermes meet install --realtime` prompts before running `sudo apt-get` (Linux) +or `brew install` (macOS). Pass `--yes` to skip the prompt.
It will NOT touch +your macOS default-input setting — you have to select BlackHole 2ch in +System Settings yourself before starting a realtime meeting. + +Or do it manually: +```bash +pip install playwright websockets && python -m playwright install chromium + +# For realtime mode, additionally: +# Linux: sudo apt install pulseaudio-utils +# macOS: brew install blackhole-2ch ffmpeg +# → System Settings → Sound → Input → BlackHole 2ch +# Then set OPENAI_API_KEY or HERMES_MEET_REALTIME_KEY in ~/.hermes/.env +``` + +For a remote node: +```bash +# on the user's Mac (where Chrome is signed in): +pip install playwright websockets && python -m playwright install chromium +hermes plugins enable google_meet +hermes meet node run --display-name my-mac # persistent server +# copy the printed token + +# on the gateway: +hermes meet node approve my-mac ws://<node-host>:18789 +hermes meet node ping my-mac # confirm reachable +``` + +Run `hermes meet setup` to preflight local prereqs. + +## Flow + +1. **Join** — call `meet_join(url=..., mode=..., node=...)`. Returns immediately. +2. **Announce yourself** — no auto-consent. Say (in whatever channel the user is watching): "A Hermes agent bot is in this call taking notes." +3. **Poll** — `meet_status()` for liveness, `meet_transcript(last=20)` for recent captions. Don't re-read the whole transcript every turn. +4. **Speak (realtime only)** — `meet_say(text="...")` queues text for TTS. The speech lags by ~2s. Don't spam it. +5. **Leave** — `meet_leave()` when done, or set `duration="30m"` on `meet_join` for auto-leave. +6. **Follow up** — read `meet_transcript()` in full, summarize, and use regular tools to send the recap, file issues, schedule followups.
+ +## Tool reference + +| Tool | Parameters | Use | +|---|---|---| +| `meet_join` | `url`, `mode?`, `guest_name?`, `duration?`, `headed?`, `node?` | Start bot | +| `meet_status` | `node?` | Liveness + progress | +| `meet_transcript` | `last?`, `node?` | Read captions | +| `meet_leave` | `node?` | Close bot | +| `meet_say` | `text`, `node?` | Speak in realtime meeting | + +`node?` on all tools: pass a registered node name (or `"auto"` for the sole node) to operate a remote bot instead of a local one. Omit for local. + +## Important limits + +- Captions are only as good as Google Meet's live captions. English-biased, lossy on overlapping speakers. +- Guest mode sits in the lobby until a host admits. Warn the user; `hermes meet auth` avoids this. +- **Lobby timeout**: if the host doesn't admit the bot within 5 minutes (configurable via `HERMES_MEET_LOBBY_TIMEOUT` env), the bot leaves and `meet_status` reports `leaveReason: "lobby_timeout"`. +- **One active meeting per install per location.** A second `meet_join` leaves the first. +- **Windows not supported.** +- Realtime mode needs a virtual audio device. If the audio bridge setup fails, the bot falls back to transcribe mode and flags it in `meet_status().error`. +- `meet_say` requires `mode='realtime'` on the originating `meet_join`. Calling it against a transcribe-mode meeting returns a clear error. +- **Barge-in is best-effort.** When a caption arrives attributed to a real participant while the bot is generating audio, the bot sends `response.cancel` to OpenAI Realtime. Captions take ~500ms to show up, so the bot will talk over the first second or so of a human interruption. + +## Status dict reference + +`meet_status()` returns (subset shown, there are more): + +| Key | Meaning | +|---|---| +| `inCall` | Past the lobby. False while waiting for admission. | +| `lobbyWaiting` | Clicked "Ask to join", waiting on host. | +| `joinAttemptedAt` / `joinedAt` | Timestamps for lobby-click and actual admission. 
| +| `captioning` | Caption observer is installed. | +| `transcriptLines` / `lastCaptionAt` | Transcript progress. | +| `realtime` / `realtimeReady` | Realtime mode provisioned / WS connected. | +| `realtimeDevice` | Audio device name the bot is feeding (e.g. `hermes_meet_src`). | +| `audioBytesOut` / `lastAudioOutAt` | How much PCM the OpenAI session has produced. | +| `lastBargeInAt` | Timestamp of the most recent `response.cancel` sent. | +| `leaveReason` | `duration_expired`, `lobby_timeout`, `denied`, `page_closed`, or null. | +| `error` | Last error (soft — bot may still be running). | + +## Transcript location + +Local: +``` +$HERMES_HOME/workspace/meetings//transcript.txt +``` + +Remote node: transcript lives on the node host's disk. Use `meet_transcript(node=...)` to read it over RPC. + +## Safety + +- URL regex: only `https://meet.google.com/...` URLs pass. +- No calendar scanning. No auto-dial. +- Remote nodes use bearer-token auth; tokens are generated on the node (32 hex chars, persisted in `$HERMES_HOME/workspace/meetings/node_token.json`) and must be copied to the gateway via `hermes meet node approve`. +- `meet_say` text is rate-limited by the OpenAI Realtime session; spam-protection is the bot's problem, not yours, but still — don't queue hundreds of lines. diff --git a/plugins/google_meet/__init__.py b/plugins/google_meet/__init__.py new file mode 100644 index 0000000000..feca75667b --- /dev/null +++ b/plugins/google_meet/__init__.py @@ -0,0 +1,103 @@ +"""google_meet plugin — let the agent join a Meet call, transcribe it, follow up. + +v1: transcribe-only. Spawns a headless Chromium via Playwright, joins the Meet +URL, enables live captions, scrapes them into a transcript file. The agent then +has the transcript in its workspace and can do whatever followup work it needs +using its regular tools. 
def _on_session_end(**kwargs) -> None:
    """Leave any still-running meeting when the agent session ends.

    Registered as the ``on_session_end`` hook. Does nothing when the process
    manager reports no live bot. Every exception is caught and logged at
    debug level, because a cleanup hiccup must never fail session shutdown
    and an orphaned headless Chromium is the thing we are trying to avoid.
    """
    try:
        bot_state = pm.status()
        bot_is_live = bot_state.get("ok") and bot_state.get("alive")
        if bot_is_live:
            pm.stop(reason="session ended")
    except Exception as exc:  # pragma: no cover — defensive
        logger.debug("google_meet on_session_end cleanup failed: %s", exc)


def register(ctx) -> None:
    """Wire the plugin's tools, ``hermes meet`` CLI and session hook into *ctx*.

    Called once by the plugin loader when the plugin is enabled via
    ``plugins.enabled`` in config.yaml. On anything other than Linux or
    macOS the function logs the reason and registers nothing.
    """
    # Only Linux and macOS have a tested path (audio routing and guest-join
    # Chromium are both unproven on Windows), so bail out entirely rather
    # than expose a half-working tool surface.
    host_os = platform.system().lower()
    is_supported = host_os in ("linux", "darwin")
    if not is_supported:
        logger.info(
            "google_meet plugin: platform=%s not supported (linux/macos only)",
            host_os,
        )
        return

    # Every tool shares the same toolset name and requirements check.
    for tool_name, tool_schema, tool_handler, tool_emoji in _TOOLS:
        ctx.register_tool(
            name=tool_name,
            toolset="google_meet",
            schema=tool_schema,
            handler=tool_handler,
            check_fn=check_meet_requirements,
            emoji=tool_emoji,
        )

    cli_description = (
        "Let the hermes agent join a Google Meet call and scrape live "
        "captions into a transcript. See: hermes meet setup"
    )
    ctx.register_cli_command(
        name="meet",
        help="Google Meet bot (join, transcribe, follow up)",
        setup_fn=_register_meet_cli,
        handler_fn=_meet_command,
        description=cli_description,
    )

    ctx.register_hook("on_session_end", _on_session_end)
_BLACKHOLE_DEVICE = "BlackHole 2ch"


class AudioBridge:
    """Manages a virtual audio device for Chrome fake-mic input.

    Call ``setup()`` once before launching the Meet bot and
    ``teardown()`` when the session ends. ``teardown()`` is idempotent.

    On Linux the bridge owns two PulseAudio modules (a null-sink and a
    virtual source fed by the sink's monitor). On macOS it only verifies
    that a BlackHole device is present and owns nothing.
    """

    def __init__(self, name_prefix: str = "hermes_meet") -> None:
        """Create an un-provisioned bridge.

        ``name_prefix`` is used to derive the PulseAudio sink/source names
        (``<prefix>_sink`` / ``<prefix>_src``) on Linux.
        """
        self._name_prefix = name_prefix
        self._platform: Optional[str] = None
        self._device_name: Optional[str] = None
        self._write_target: Optional[str] = None
        # PulseAudio module ids in load order; empty on macOS.
        self._module_ids: list[int] = []
        self._torn_down = False

    # ── public properties ─────────────────────────────────────────────────

    @property
    def device_name(self) -> str:
        """Name of the input device Chrome should read from.

        Raises RuntimeError if ``setup()`` has not succeeded yet.
        """
        if not self._device_name:
            raise RuntimeError("AudioBridge not set up yet")
        return self._device_name

    @property
    def write_target(self) -> str:
        """Name of the device generated audio should be written to.

        Raises RuntimeError if ``setup()`` has not succeeded yet.
        """
        if not self._write_target:
            raise RuntimeError("AudioBridge not set up yet")
        return self._write_target

    # ── lifecycle ─────────────────────────────────────────────────────────

    def setup(self) -> dict:
        """Provision the virtual audio device.

        Returns a dict describing the device (``platform``, ``device_name``,
        ``sample_rate``, ``channels``, ``module_ids``, ``write_target``).
        Raises RuntimeError on unsupported platforms or when required
        system tools are missing.
        """
        system = platform.system()
        if system == "Linux":
            return self._setup_linux()
        if system == "Darwin":
            return self._setup_darwin()
        if system == "Windows":
            raise RuntimeError("windows not supported in v2")
        raise RuntimeError(f"unsupported platform: {system}")

    def teardown(self) -> None:
        """Release the virtual audio device. Idempotent and never raises."""
        if self._torn_down:
            return
        # Only Linux needs explicit unloading.
        if self._platform == "linux" and self._module_ids:
            # Unload in reverse order (virtual-source before null-sink).
            for mod_id in reversed(self._module_ids):
                self._unload_module(mod_id)
            self._module_ids = []
        self._torn_down = True

    # ── platform impls ────────────────────────────────────────────────────

    def _setup_linux(self) -> dict:
        """Load the null-sink and the virtual source that monitors it.

        If the second step fails for any reason the null-sink loaded by
        the first step is unloaded again, so a failed setup does not leave
        a stray sink behind.
        """
        sink_name = f"{self._name_prefix}_sink"
        src_name = f"{self._name_prefix}_src"

        sink_mod_id = self._load_module(
            [
                "module-null-sink",
                f"sink_name={sink_name}",
                "sink_properties=device.description=HermesMeetSink",
            ],
            "null-sink",
        )

        try:
            src_mod_id = self._load_module(
                [
                    "module-virtual-source",
                    f"source_name={src_name}",
                    f"master={sink_name}.monitor",
                ],
                "virtual-source",
            )
        except Exception:
            # Roll back the null-sink we just created so we don't leak it —
            # not only on a pactl error but also when its output could not
            # be parsed or pactl vanished between the two calls.
            self._unload_module(sink_mod_id)
            raise

        self._platform = "linux"
        self._device_name = src_name
        self._write_target = sink_name
        self._module_ids = [sink_mod_id, src_mod_id]
        self._torn_down = False

        return {
            "platform": "linux",
            "device_name": src_name,
            "sample_rate": 48000,
            "channels": 2,
            "module_ids": list(self._module_ids),
            "write_target": sink_name,
        }

    def _setup_darwin(self) -> dict:
        """Verify that BlackHole is installed; nothing is created or routed."""
        try:
            out = subprocess.check_output(
                ["system_profiler", "SPAudioDataType"],
                text=True,
                stderr=subprocess.STDOUT,
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                "system_profiler not found (macOS-only command)"
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise RuntimeError(
                f"system_profiler failed: {exc.output}"
            ) from exc

        # NOTE(review): this matches any BlackHole variant (e.g. 16ch) while
        # the device name reported below is always "BlackHole 2ch" — confirm
        # whether the check should look for the exact device name.
        if "BlackHole" not in out:
            raise RuntimeError(
                "BlackHole virtual audio device not installed. "
                "Install via: brew install blackhole-2ch"
            )

        self._platform = "darwin"
        self._device_name = _BLACKHOLE_DEVICE
        self._write_target = _BLACKHOLE_DEVICE
        self._module_ids = []
        self._torn_down = False

        return {
            "platform": "darwin",
            "device_name": _BLACKHOLE_DEVICE,
            "sample_rate": 48000,
            "channels": 2,
            "module_ids": [],
            "write_target": _BLACKHOLE_DEVICE,
        }

    # ── helpers ──────────────────────────────────────────────────────────

    def _load_module(self, module_args: list[str], label: str) -> int:
        """Run ``pactl load-module <module_args>`` and return the module id.

        ``label`` only appears in the error message. Raises RuntimeError
        when pactl is missing, exits non-zero, or prints no usable id.
        """
        try:
            result = subprocess.run(
                ["pactl", "load-module", *module_args],
                check=True,
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                "pactl not found — install PulseAudio/pipewire-pulse"
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise RuntimeError(
                f"pactl load-module {label} failed: {exc.stderr or exc}"
            ) from exc
        return self._parse_module_id(result.stdout)

    @staticmethod
    def _unload_module(mod_id: int) -> None:
        """Best-effort ``pactl unload-module``; failures are ignored."""
        try:
            subprocess.run(
                ["pactl", "unload-module", str(mod_id)],
                check=False,
                capture_output=True,
            )
        except Exception:
            # Cleanup must never raise (and must not mask the error that
            # triggered a rollback).
            pass

    @staticmethod
    def _parse_module_id(stdout: str) -> int:
        """Extract the module id that ``pactl load-module`` prints to stdout.

        Raises RuntimeError when the output is empty or not an integer.
        """
        text = (stdout or "").strip()
        if not text:
            raise RuntimeError("pactl load-module returned empty stdout")
        # Take the last whitespace-separated token on the first non-empty line.
        first = text.splitlines()[0].strip()
        token = first.split()[-1]
        try:
            return int(token)
        except ValueError as exc:
            raise RuntimeError(
                f"could not parse pactl module id from: {stdout!r}"
            ) from exc


def chrome_fake_audio_flags(bridge_info: dict) -> list[str]:
    """Return Chrome flags for using the fake audio input.

    The PulseAudio source is selected via the ``PULSE_SOURCE`` env var,
    which callers must set in Chrome's environment before launch:

        env["PULSE_SOURCE"] = bridge_info["device_name"]

    On macOS the caller must ensure the system default audio input is
    set to the returned BlackHole device (we do not flip that switch).

    ``bridge_info`` is accepted for interface symmetry with
    ``AudioBridge.setup()``; the flags currently depend only on the
    platform. Raises RuntimeError on unsupported platforms.
    """
    system = platform.system()
    if system in ("Linux", "Darwin"):
        # The fake-ui flag skips the media permission prompt so the bot can
        # pick "use my mic" without user input. Device selection itself
        # happens outside Chrome (PULSE_SOURCE on Linux, the system default
        # input on macOS).
        return ["--use-fake-ui-for-media-stream"]
    if system == "Windows":
        raise RuntimeError("windows not supported in v2")
    raise RuntimeError(f"unsupported platform: {system}")
def _auth_state_path() -> Path:
    """Return the path of the saved Google sign-in storage state file."""
    return Path(get_hermes_home()) / "workspace" / "meetings" / "auth.json"


# ---------------------------------------------------------------------------
# argparse wiring
# ---------------------------------------------------------------------------

def register_cli(subparser: argparse.ArgumentParser) -> None:
    """Build the ``hermes meet`` argparse tree.

    Called by :func:`_register_cli_commands` at plugin load time. Adds one
    sub-parser per subcommand and sets ``func=meet_command`` as the default
    dispatcher. The ``node`` subcommand is delegated to the node CLI module;
    if that module cannot be imported, ``node`` stays registered but its
    handler just reports why it is unavailable and returns 1.
    """
    subs = subparser.add_subparsers(dest="meet_command")

    subs.add_parser("setup", help="Preflight: playwright, chromium, auth")

    inst_p = subs.add_parser(
        "install",
        help="Install prerequisites (pip deps, Chromium, platform audio tools)",
    )
    inst_p.add_argument(
        "--realtime", action="store_true",
        help="Also install realtime audio tools (pulseaudio-utils on Linux, BlackHole+ffmpeg on macOS). Uses sudo/brew, prompts before invoking either.",
    )
    inst_p.add_argument(
        "--yes", "-y", action="store_true",
        help="Answer yes to all prompts (use with care; will run sudo apt-get or brew without asking).",
    )

    subs.add_parser("auth", help="Sign in to Google and save session state")

    join_p = subs.add_parser("join", help="Join a Meet URL")
    join_p.add_argument("url", help="https://meet.google.com/...")
    join_p.add_argument("--guest-name", default="Hermes Agent")
    join_p.add_argument("--duration", default=None, help="e.g. 30m, 2h, 90s")
    join_p.add_argument("--headed", action="store_true", help="show browser")
    join_p.add_argument(
        "--mode", choices=("transcribe", "realtime"), default="transcribe",
        help="transcribe (default, listen-only) or realtime (speak via OpenAI Realtime)"
    )
    join_p.add_argument(
        "--node", default=None,
        help="remote node name, or 'auto' to use the sole registered node"
    )

    subs.add_parser("status", help="Print current Meet bot state")

    tr_p = subs.add_parser("transcript", help="Print the scraped transcript")
    tr_p.add_argument("--last", type=int, default=None)

    say_p = subs.add_parser("say", help="Speak text in an active realtime meeting")
    say_p.add_argument("text", help="what to say")
    say_p.add_argument("--node", default=None)

    subs.add_parser("stop", help="Leave the current meeting")

    # v3: remote node host management.
    node_p = subs.add_parser(
        "node",
        help="Manage remote meet node hosts (run/list/approve/remove/status/ping)",
    )
    try:
        from plugins.google_meet.node.cli import register_cli as _register_node_cli
        _register_node_cli(node_p)
    except Exception as e:  # pragma: no cover — defensive
        # If the node module fails to import for any reason (optional dep
        # missing at import time etc.), leave the subparser present but
        # flag it. The argparse dispatch will surface a clear error.
        #
        # Capture the message NOW: Python unbinds ``e`` when the except
        # clause ends, so a closure that reads ``e`` later would raise
        # NameError instead of printing the reason.
        unavailable_reason = str(e)

        def _node_unavailable(args):
            print(f"hermes meet node: module unavailable ({unavailable_reason})")
            return 1

        node_p.set_defaults(func=_node_unavailable)

    subparser.set_defaults(func=meet_command)


# ---------------------------------------------------------------------------
# Dispatch
# ---------------------------------------------------------------------------

def meet_command(args: argparse.Namespace) -> int:
    """Dispatch a parsed ``hermes meet`` invocation to its handler.

    Reads ``args.meet_command`` (set by the sub-parser ``dest``) and calls
    the matching ``_cmd_*`` function. Returns that handler's exit code, or
    2 when no subcommand or an unknown subcommand was given.
    """
    sub = getattr(args, "meet_command", None)
    if not sub:
        print(
            "usage: hermes meet "
            "{setup,install,auth,join,status,transcript,say,stop,node}"
        )
        return 2
    if sub == "setup":
        return _cmd_setup()
    if sub == "install":
        return _cmd_install(
            realtime=bool(getattr(args, "realtime", False)),
            assume_yes=bool(getattr(args, "yes", False)),
        )
    if sub == "auth":
        return _cmd_auth()
    if sub == "join":
        return _cmd_join(
            url=args.url,
            guest_name=args.guest_name,
            duration=args.duration,
            headed=args.headed,
            mode=getattr(args, "mode", "transcribe"),
            node=getattr(args, "node", None),
        )
    if sub == "status":
        return _cmd_status()
    if sub == "transcript":
        return _cmd_transcript(last=args.last)
    if sub == "say":
        return _cmd_say(text=args.text, node=getattr(args, "node", None))
    if sub == "stop":
        return _cmd_stop()
    if sub == "node":
        # Dispatch was set by the node cli's register_cli; fall through to
        # whatever its subparsers wired. ``func`` still pointing at this
        # function means no node sub-subcommand was selected.
        fn = getattr(args, "func", None)
        if fn is None or fn is meet_command:
            print("usage: hermes meet node {run,list,approve,remove,status,ping}")
            return 2
        return fn(args)
    print(f"unknown subcommand: {sub}")
    return 2
+ fn = getattr(args, "func", None) + if fn is None or fn is meet_command: + print("usage: hermes meet node {run,list,approve,remove,status,ping}") + return 2 + return fn(args) + print(f"unknown subcommand: {sub}") + return 2 + + +# --------------------------------------------------------------------------- +# Subcommand handlers +# --------------------------------------------------------------------------- + +def _cmd_setup() -> int: + import platform as _p + + print("google_meet preflight") + print("---------------------") + + system = _p.system() + system_ok = system in ("Linux", "Darwin") + print(f" platform : {system} [{'ok' if system_ok else 'unsupported'}]") + + try: + import playwright # noqa: F401 + pw_ok = True + pw_msg = "installed" + except ImportError: + pw_ok = False + pw_msg = "NOT installed — run: pip install playwright" + print(f" playwright : {pw_msg}") + + chromium_ok = False + chromium_msg = "unknown" + if pw_ok: + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + try: + exe = p.chromium.executable_path + if exe and Path(exe).exists(): + chromium_ok = True + chromium_msg = f"ok ({exe})" + else: + chromium_msg = ( + "not installed — run: " + "python -m playwright install chromium" + ) + except Exception as e: + chromium_msg = f"probe failed: {e}" + except Exception as e: + chromium_msg = f"probe failed: {e}" + print(f" chromium : {chromium_msg}") + + auth_path = _auth_state_path() + auth_ok = auth_path.is_file() + print( + " google auth : " + + (f"ok ({auth_path})" if auth_ok else "not saved — run: hermes meet auth") + ) + + print() + all_ok = system_ok and pw_ok and chromium_ok + if all_ok: + print( + "ready. Join a meeting: " + "hermes meet join https://meet.google.com/abc-defg-hij" + ) + else: + print("not ready yet — fix the items above.") + return 0 if all_ok else 1 + + +def _cmd_install(*, realtime: bool, assume_yes: bool) -> int: + """Install the plugin's prerequisites. 
+ + Always: pip install playwright + websockets, then + ``python -m playwright install chromium``. + + With ``--realtime``: also install the platform audio bridge deps. + Linux : ``sudo apt-get install -y pulseaudio-utils`` + macOS : ``brew install blackhole-2ch ffmpeg`` (+ remind the user + to select BlackHole as the default input device manually) + + Prompts before every package-manager invocation unless ``--yes``. + Refuses to run on Windows. + """ + import platform as _p + import shutil as _shutil + import subprocess as _sp + + system = _p.system() + if system not in ("Linux", "Darwin"): + print(f"google_meet install: {system} is not supported (linux/macos only)") + return 1 + + def _confirm(prompt: str) -> bool: + if assume_yes: + return True + try: + ans = input(f"{prompt} [y/N] ").strip().lower() + except EOFError: + return False + return ans in ("y", "yes") + + print("google_meet install") + print("-------------------") + + # 1) pip deps — always safe, venv-scoped. + pip_pkgs = ["playwright", "websockets"] + print(f"\n[1/3] pip install: {' '.join(pip_pkgs)}") + try: + res = _sp.run( + [sys.executable, "-m", "pip", "install", "--upgrade", *pip_pkgs], + check=False, + ) + if res.returncode != 0: + print(" pip install failed") + return 1 + except Exception as e: + print(f" pip install failed: {e}") + return 1 + + # 2) Playwright browsers — pulls chromium (~300MB first run). + print("\n[2/3] python -m playwright install chromium") + try: + res = _sp.run( + [sys.executable, "-m", "playwright", "install", "chromium"], + check=False, + ) + if res.returncode != 0: + print(" playwright install failed (may already be installed)") + except Exception as e: + print(f" playwright install failed: {e}") + return 1 + + # 3) Platform audio deps for realtime mode. 
+ if realtime: + print("\n[3/3] realtime audio deps") + if system == "Linux": + if _shutil.which("paplay") and _shutil.which("pactl"): + print(" pulseaudio-utils already installed.") + else: + if not _confirm( + " install pulseaudio-utils? this runs `sudo apt-get install -y pulseaudio-utils`" + ): + print(" skipped (you can run it manually later)") + else: + cmd = ["sudo", "apt-get", "install", "-y", "pulseaudio-utils"] + print(f" $ {' '.join(cmd)}") + res = _sp.run(cmd, check=False) + if res.returncode != 0: + print(" apt install failed — install pulseaudio-utils manually") + elif system == "Darwin": + have_bh = False + try: + out = _sp.check_output(["system_profiler", "SPAudioDataType"], text=True) + have_bh = "BlackHole" in out + except Exception: + pass + have_ffmpeg = bool(_shutil.which("ffmpeg")) + needs = [] + if not have_bh: + needs.append("blackhole-2ch") + if not have_ffmpeg: + needs.append("ffmpeg") + if not needs: + print(" BlackHole and ffmpeg already installed.") + elif not _shutil.which("brew"): + print( + " missing: " + ", ".join(needs) + "\n" + " install Homebrew first (https://brew.sh) or install the packages manually." + ) + else: + if not _confirm(f" install via brew: {' '.join(needs)}?"): + print(" skipped (you can run it manually later)") + else: + cmd = ["brew", "install", *needs] + print(f" $ {' '.join(cmd)}") + res = _sp.run(cmd, check=False) + if res.returncode != 0: + print(" brew install failed — install them manually") + print( + "\n NOTE: macOS does not auto-route audio. Open\n" + " System Settings → Sound → Input\n" + " and select 'BlackHole 2ch' before starting a realtime meeting.\n" + " hermes will not switch your default input for you." + ) + else: + print("\n[3/3] skipped (pass --realtime to install audio tooling too)") + + print("\ndone. 
def _cmd_auth() -> int:
    """Open a headed Chromium, let the user sign in, and save ``storage_state``.

    The browser is pointed at ``https://accounts.google.com/`` and the command
    blocks on stdin until the user presses Enter (EOF is treated the same as
    Enter).  The context's storage state is then written to the path returned
    by ``_auth_state_path()``.

    Returns:
        0 when the storage state was saved, 1 when Playwright is not installed
        or anything in the browser session failed.
    """
    import os as _os

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(
            "playwright is not installed. run:\n"
            " pip install playwright && python -m playwright install chromium"
        )
        return 1

    path = _auth_state_path()
    path.parent.mkdir(parents=True, exist_ok=True)

    print("opening Chromium — sign in to Google, then return here and press Enter.")
    print(f"saving storage state to: {path}")
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=False)
            try:
                context = browser.new_context()
                page = context.new_page()
                page.goto("https://accounts.google.com/", wait_until="domcontentloaded")
                try:
                    input("press Enter after you've signed in ... ")
                except EOFError:
                    # Non-interactive stdin: save whatever state exists now.
                    pass
                context.storage_state(path=str(path))
            finally:
                # Close the window even when saving the state failed.
                browser.close()
    except Exception as e:
        print(f"auth failed: {e}")
        return 1

    # The storage state contains live Google session cookies, so keep it
    # readable by the owning user only.  Best-effort: a filesystem that does
    # not support chmod should not turn a successful sign-in into a failure.
    try:
        _os.chmod(str(path), 0o600)
    except OSError as e:
        print(f"warning: could not restrict permissions on {path}: {e}")

    print("saved. you can now run: hermes meet join ")
    return 0
def _cmd_say(text: str, node: Optional[str] = None) -> int:
    """Queue *text* to be spoken in the meeting, locally or on a remote node.

    Without *node* the text goes to the local process manager via
    ``pm.enqueue_say``.  With *node* the registry entry is resolved (the
    literal ``"auto"`` is passed to the registry as ``None``) and the request
    is sent through ``NodeClient.say``.  The JSON result is printed either way.

    Returns:
        2 for empty/blank text, 0 when the result has a truthy ``"ok"``,
        otherwise 1 (including import, lookup and RPC failures).
    """
    if (text or "").strip() == "":
        print("refusing: empty text")
        return 2

    if not node:
        # Local path: hand the line to the bot running on this machine.
        outcome = pm.enqueue_say(text)
        print(json.dumps(outcome, indent=2))
        if outcome.get("ok"):
            return 0
        return 1

    try:
        from plugins.google_meet.node.registry import NodeRegistry
        from plugins.google_meet.node.client import NodeClient
    except ImportError as exc:
        print(f"node module unavailable: {exc}")
        return 1

    wanted = None if node == "auto" else node
    entry = NodeRegistry().resolve(wanted)
    if entry is None:
        print(f"no registered node matches {node!r}")
        return 1

    remote = NodeClient(url=entry["url"], token=entry["token"])
    try:
        outcome = remote.say(text)
    except Exception as exc:
        print(f"remote say failed: {exc}")
        return 1

    print(json.dumps({"node": entry.get("name"), **outcome}, indent=2))
    if outcome.get("ok"):
        return 0
    return 1
def _cmd_stop() -> int:
    """Stop the local bot via ``pm.stop`` and print the JSON result.

    Returns 0 when the result carries a truthy ``"ok"``, otherwise 1.
    """
    outcome = pm.stop(reason="hermes meet stop")
    print(json.dumps(outcome, indent=2))
    if outcome.get("ok"):
        return 0
    return 1
# Accepts ``https://meet.google.com/`` followed by one of:
#   * a three-segment meeting code such as ``abc-defg-hij``,
#   * ``lookup/<something>``,
#   * ``new``,
# optionally followed by a path, query string or fragment.  Everything else
# is rejected.
MEET_URL_RE = re.compile(
    r"^https://meet\.google\.com/("
    r"[a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,}"
    r"|lookup/[^/?#]+"
    r"|new"
    r")(?:[/?#].*)?$"
)


# Filenames the bot reads/writes in ``HERMES_MEET_OUT_DIR``.
SAY_QUEUE_FILENAME = "say_queue.jsonl"
SAY_PCM_FILENAME = "speaker.pcm"


def _is_safe_meet_url(url: str) -> bool:
    """Tell whether *url* is a Google Meet URL the bot may navigate to.

    Non-string input is rejected outright; strings are stripped of
    surrounding whitespace and matched against ``MEET_URL_RE``.
    """
    if isinstance(url, str):
        return MEET_URL_RE.match(url.strip()) is not None
    return False
class _BotState:
    """Single-process mutable state, flushed to ``status.json`` on each change.

    The constructor creates *out_dir* if needed and writes an initial
    ``status.json``.  Captions are appended to ``transcript.txt`` in the same
    directory.
    """

    def __init__(self, out_dir: Path, meeting_id: str, url: str):
        self.out_dir = out_dir
        self.meeting_id = meeting_id
        self.url = url
        self.in_call = False
        self.captioning = False
        self.captions_enabled_attempted = False
        self.lobby_waiting = False
        self.join_attempted_at: Optional[float] = None
        self.joined_at: Optional[float] = None
        self.last_caption_at: Optional[float] = None
        self.transcript_lines = 0
        self.error: Optional[str] = None
        self.exited = False
        # v2 realtime fields.
        self.realtime = False
        self.realtime_ready = False
        self.realtime_device: Optional[str] = None
        self.audio_bytes_out: int = 0
        self.last_audio_out_at: Optional[float] = None
        self.last_barge_in_at: Optional[float] = None
        self.leave_reason: Optional[str] = None
        # (speaker, text) pairs already written to the transcript; used only
        # to drop exact repeats.  Grows for the lifetime of the process.
        self._seen: set = set()
        out_dir.mkdir(parents=True, exist_ok=True)
        self.transcript_path = out_dir / "transcript.txt"
        self.status_path = out_dir / "status.json"
        self._flush()

    # -------- transcript ------------------------------------------------

    def record_caption(self, speaker: str, text: str) -> None:
        """Append a caption line unless this exact (speaker, text) was seen.

        Blank speakers are recorded as ``"Unknown"``; blank text is ignored.
        Each accepted caption bumps ``transcript_lines``, updates
        ``last_caption_at`` and rewrites ``status.json``.
        """
        speaker = (speaker or "").strip() or "Unknown"
        text = (text or "").strip()
        if not text:
            return
        # A tuple key keeps distinct pairs distinct even when either part
        # contains the "|" character (a joined-string key would collide for
        # ("a|b", "c") and ("a", "b|c")).
        key = (speaker, text)
        if key in self._seen:
            return
        self._seen.add(key)
        self.transcript_lines += 1
        self.last_caption_at = time.time()
        ts = time.strftime("%H:%M:%S", time.localtime(self.last_caption_at))
        line = f"[{ts}] {speaker}: {text}\n"
        # Atomic-ish append — good enough for a single-writer.
        with self.transcript_path.open("a", encoding="utf-8") as f:
            f.write(line)
        self._flush()

    # -------- status file ----------------------------------------------

    def _flush(self) -> None:
        """Serialize the current state to ``status.json``.

        Writes to a sibling ``.json.tmp`` file first and then renames it over
        the real path so readers never observe a half-written file.
        """
        data = {
            "meetingId": self.meeting_id,
            "url": self.url,
            "inCall": self.in_call,
            "captioning": self.captioning,
            "captionsEnabledAttempted": self.captions_enabled_attempted,
            "lobbyWaiting": self.lobby_waiting,
            "joinAttemptedAt": self.join_attempted_at,
            "joinedAt": self.joined_at,
            "lastCaptionAt": self.last_caption_at,
            "transcriptLines": self.transcript_lines,
            "transcriptPath": str(self.transcript_path),
            "error": self.error,
            "exited": self.exited,
            "pid": os.getpid(),
            # v2 realtime telemetry.
            "realtime": self.realtime,
            "realtimeReady": self.realtime_ready,
            "realtimeDevice": self.realtime_device,
            "audioBytesOut": self.audio_bytes_out,
            "lastAudioOutAt": self.last_audio_out_at,
            "lastBargeInAt": self.last_barge_in_at,
            "leaveReason": self.leave_reason,
        }
        tmp = self.status_path.with_suffix(".json.tmp")
        tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
        tmp.replace(self.status_path)

    def set(self, **kwargs) -> None:
        """Assign each keyword as an attribute, then rewrite ``status.json``.

        Names are not validated: an unknown keyword becomes a new attribute
        that ``_flush`` does not serialize.
        """
        for k, v in kwargs.items():
            setattr(self, k, v)
        self._flush()
+ const rows = root.querySelectorAll('div[jsname="dsyhDe"], div.CNusmb, div.TBMuR'); + if (rows.length) { + rows.forEach((row) => { + const spkEl = row.querySelector('div.KcIKyf, div.zs7s8d, span[jsname="YSxPC"]'); + const txtEl = row.querySelector('div.bh44bd, span[jsname="tgaKEf"], div.iTTPOb'); + const speaker = spkEl ? spkEl.innerText : ''; + const text = txtEl ? txtEl.innerText : row.innerText; + pushEntry(speaker, text); + }); + return; + } + // Fallback: treat the whole region's innerText as one anonymous line. + const text = (root.innerText || '').split('\n').filter(Boolean).pop(); + pushEntry('', text); + } + + function attach() { + const el = document.querySelector(captionSelector); + if (!el) return false; + const obs = new MutationObserver(() => scan(el)); + obs.observe(el, { childList: true, subtree: true, characterData: true }); + scan(el); + return true; + } + + // Try now and retry on interval — the caption region only appears after + // captions are enabled and someone speaks. + if (!attach()) { + const iv = setInterval(() => { if (attach()) clearInterval(iv); }, 1500); + } + + window.__hermesMeetDrain = () => { + const out = window.__hermesMeetQueue.slice(); + window.__hermesMeetQueue = []; + return out; + }; +})(); +""" + + +def _enable_captions_js() -> str: + """Return a small JS snippet that tries to click the 'Turn on captions' button. + + Best-effort — Meet's caption toggle is keyboard-accessible via ``c``. We + dispatch that keystroke as a cheap fallback. Real click targeting is too + brittle to rely on. 
def _start_realtime_speaker(
    *,
    rt: dict,
    out_dir: Path,
    bridge_info: dict,
    api_key: str,
    model: str,
    voice: str,
    instructions: str,
    stop_flag: dict,
    state: "_BotState",
) -> None:
    """Wire up the OpenAI Realtime session, the speaker thread and the PCM pump.

    Steps, in order:

    1. Import ``RealtimeSession`` / ``RealtimeSpeaker``; on failure record the
       error on *state* and return.
    2. Truncate ``speaker.pcm`` and make sure ``say_queue.jsonl`` exists in
       *out_dir*.
    3. Create and connect the session (24 kHz, sink = ``speaker.pcm``); on
       failure record the error on *state* and return.
    4. Start a daemon thread running ``RealtimeSpeaker.run_until_stopped``.
    5. Start a platform-specific subprocess that plays ``speaker.pcm`` into
       the audio device named by ``bridge_info["write_target"]``: ``paplay``
       on Linux, ``ffmpeg`` on macOS.

    Handles are published through *rt* under ``"session"``,
    ``"speaker_stop"``, ``"speaker_thread"`` and ``"pcm_pump"`` so the caller
    can tear them down.  Nothing is raised; every failure is reported via
    ``state.set(error=...)``.

    Args:
        rt: Mutable dict that receives the handles listed above.
        out_dir: Directory holding the queue and PCM files.
        bridge_info: Audio bridge description; ``"platform"`` selects the
            pump and ``"write_target"`` names the device.  May be ``None``.
        api_key, model, voice, instructions: Forwarded to ``RealtimeSession``.
        stop_flag: Shared dict whose ``"stop"`` entry ends the speaker loop.
        state: Bot state used for error reporting.
    """
    try:
        from plugins.google_meet.realtime.openai_client import (
            RealtimeSession,
            RealtimeSpeaker,
        )
    except Exception as e:
        state.set(error=f"realtime import failed: {e}")
        return

    pcm_path = out_dir / SAY_PCM_FILENAME
    queue_path = out_dir / SAY_QUEUE_FILENAME
    processed_path = out_dir / "say_processed.jsonl"
    # Reset the sink file so we start clean each session.
    pcm_path.write_bytes(b"")
    # Make sure the queue exists so the speaker poller doesn't error on
    # first iteration.
    queue_path.touch()

    try:
        session = RealtimeSession(
            api_key=api_key,
            model=model,
            voice=voice,
            instructions=instructions,
            audio_sink_path=pcm_path,
            sample_rate=24000,
        )
        session.connect()
    except Exception as e:
        state.set(error=f"realtime connect failed: {e}")
        return

    rt["session"] = session

    def _stop_fn():
        return stop_flag.get("stop", False)

    def _request_stop():
        # Must actually raise the flag: the speaker loop polls _stop_fn, and
        # the main loop can end (duration expiry, lobby timeout, denial)
        # without any signal having set it.
        stop_flag["stop"] = True

    rt["speaker_stop"] = _request_stop

    speaker = RealtimeSpeaker(
        session=session,
        queue_path=queue_path,
        processed_path=processed_path,
    )

    def _speaker_loop():
        try:
            speaker.run_until_stopped(_stop_fn)
        except Exception as e:
            state.set(error=f"realtime speaker crashed: {e}")

    t_speaker = threading.Thread(target=_speaker_loop, name="meet-speaker", daemon=True)
    t_speaker.start()
    rt["speaker_thread"] = t_speaker

    # PCM pump: feeds speaker.pcm (24kHz s16le mono) into the OS audio
    # device that Chrome's fake mic reads from.  The tool differs per
    # platform but the input format is the same.
    platform_tag = (bridge_info or {}).get("platform")
    if platform_tag == "linux":
        import subprocess as _sp

        sink = (bridge_info or {}).get("write_target") or "hermes_meet_sink"
        try:
            proc = _sp.Popen(
                [
                    "paplay",
                    "--raw",
                    "--rate=24000",
                    "--format=s16le",
                    "--channels=1",
                    f"--device={sink}",
                    str(pcm_path),
                ],
                stdin=_sp.DEVNULL,
                stdout=_sp.DEVNULL,
                stderr=_sp.DEVNULL,
            )
            rt["pcm_pump"] = proc
        except FileNotFoundError:
            state.set(error="paplay not found — install pulseaudio-utils for realtime on Linux")
    elif platform_tag == "darwin":
        # macOS: ffmpeg reads speaker.pcm as raw s16le/24kHz/mono at native
        # rate (-re) and writes it through the audiotoolbox output to the
        # device index resolved from the device name.  The user must have
        # that device (BlackHole by default) selected as the input in
        # System Settings → Sound for Chrome to pick it up.  There is no
        # fallback player: without ffmpeg an error is recorded instead.
        import shutil as _shutil
        import subprocess as _sp

        device_name = (bridge_info or {}).get("write_target") or "BlackHole 2ch"
        if _shutil.which("ffmpeg"):
            try:
                proc = _sp.Popen(
                    [
                        "ffmpeg",
                        "-nostdin", "-hide_banner", "-loglevel", "error",
                        "-re",
                        "-f", "s16le", "-ar", "24000", "-ac", "1",
                        "-i", str(pcm_path),
                        "-f", "audiotoolbox",
                        "-audio_device_index", _mac_audio_device_index(device_name),
                        "-",
                    ],
                    stdin=_sp.DEVNULL,
                    stdout=_sp.DEVNULL,
                    stderr=_sp.DEVNULL,
                )
                rt["pcm_pump"] = proc
            except FileNotFoundError:
                # ffmpeg vanished between the which() probe and the spawn.
                state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
            except Exception as e:
                state.set(error=f"macOS pcm pump failed to start: {e}")
        else:
            state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
def _teardown_realtime(rt: dict, stop_flag: dict) -> None:
    """Release every realtime resource recorded in *rt*; safe to call twice.

    Order: raise the stop flag and call the speaker's stop hook, join the
    speaker thread (5 s cap), terminate the PCM pump subprocess, close the
    realtime session, tear down the audio bridge.  Each handle is cleared
    from *rt* before it is released so a second call is a no-op, and each
    step is best-effort so one failure cannot block the rest.
    """
    # Raise the flag directly as well: the stop hook is optional and the
    # main loop may have ended without any signal setting it.
    stop_flag["stop"] = True

    speaker_stop = rt.get("speaker_stop")
    rt["speaker_stop"] = None
    if speaker_stop:
        try:
            speaker_stop()
        except Exception:
            pass

    speaker_thread = rt.get("speaker_thread")
    rt["speaker_thread"] = None
    if speaker_thread is not None:
        try:
            speaker_thread.join(timeout=5.0)
        except Exception:
            pass

    pump = rt.get("pcm_pump")
    rt["pcm_pump"] = None
    if pump is not None:
        try:
            pump.terminate()
            pump.wait(timeout=5.0)
        except Exception:
            # Did not exit in time (or terminate failed): force it.
            try:
                pump.kill()
            except Exception:
                pass

    session = rt.get("session")
    rt["session"] = None
    if session is not None:
        try:
            session.close()
        except Exception:
            pass

    bridge = rt.get("bridge")
    rt["bridge"] = None
    if bridge is not None:
        try:
            bridge.teardown()
        except Exception:
            pass


def run_bot() -> int:  # noqa: C901 — orchestration, explicit branches
    """Join the meeting named by the environment and scrape its captions.

    Configuration comes entirely from ``HERMES_MEET_*`` environment variables
    (``URL`` and ``OUT_DIR`` are required).  Progress is reported through the
    ``_BotState`` status file in the output directory.  SIGTERM/SIGINT set a
    stop flag so the loop ends and the browser is closed normally.

    Realtime resources (audio bridge, session, speaker thread, PCM pump) are
    released on every exit path once they may have been created.

    Returns:
        0 after a normal run (including lobby timeout, denial and duration
        expiry, which are reported via ``leave_reason``/``error``);
        1 on an unhandled exception; 2 when the URL or output directory is
        missing/invalid; 3 when Playwright is not installed; 4 when the
        initial navigation fails.
    """
    url = os.environ.get("HERMES_MEET_URL", "").strip()
    out_dir_env = os.environ.get("HERMES_MEET_OUT_DIR", "").strip()

    if not url or not _is_safe_meet_url(url):
        sys.stderr.write(
            "google_meet bot: refusing to launch — HERMES_MEET_URL must be a "
            "meet.google.com URL. got: %r\n" % url
        )
        return 2
    if not out_dir_env:
        sys.stderr.write("google_meet bot: HERMES_MEET_OUT_DIR is required\n")
        return 2

    headed = os.environ.get("HERMES_MEET_HEADED", "").lower() in ("1", "true", "yes")
    auth_state = os.environ.get("HERMES_MEET_AUTH_STATE", "").strip()
    guest_name = os.environ.get("HERMES_MEET_GUEST_NAME", "Hermes Agent")
    duration_s = _parse_duration(os.environ.get("HERMES_MEET_DURATION", ""))
    # v2: optional realtime mode. Enabled when HERMES_MEET_MODE=realtime.
    mode = os.environ.get("HERMES_MEET_MODE", "transcribe").strip().lower()
    realtime_model = os.environ.get("HERMES_MEET_REALTIME_MODEL", "gpt-realtime")
    realtime_voice = os.environ.get("HERMES_MEET_REALTIME_VOICE", "alloy")
    realtime_instructions = os.environ.get("HERMES_MEET_REALTIME_INSTRUCTIONS", "")
    realtime_api_key = os.environ.get("HERMES_MEET_REALTIME_KEY") or os.environ.get("OPENAI_API_KEY", "")

    out_dir = Path(out_dir_env)
    meeting_id = _meeting_id_from_url(url)
    state = _BotState(out_dir=out_dir, meeting_id=meeting_id, url=url)

    # SIGTERM → exit cleanly so the parent gets a finalized transcript. We
    # set a flag instead of raising so the normal shutdown sequence runs.
    stop_flag = {"stop": False}

    def _on_signal(_sig, _frame):
        stop_flag["stop"] = True

    signal.signal(signal.SIGTERM, _on_signal)
    signal.signal(signal.SIGINT, _on_signal)

    # v2 realtime: provision virtual audio device + start speaker thread.
    # Handles live in a dict so _teardown_realtime can release them however
    # we exit. If realtime setup fails we fall back to transcribe mode and
    # leave the reason in the status error field.
    rt = {
        "enabled": mode == "realtime",
        "bridge": None,          # AudioBridge | None
        "bridge_info": None,     # dict | None
        "session": None,         # RealtimeSession | None
        "speaker_thread": None,  # threading.Thread | None
        "speaker_stop": None,    # callable | None
        "pcm_pump": None,        # subprocess.Popen | None
    }
    if rt["enabled"]:
        if not realtime_api_key:
            state.set(error="realtime mode requested but no API key in HERMES_MEET_REALTIME_KEY/OPENAI_API_KEY — falling back to transcribe")
            rt["enabled"] = False
        else:
            try:
                from plugins.google_meet.audio_bridge import AudioBridge
                bridge = AudioBridge()
                rt["bridge_info"] = bridge.setup()
                rt["bridge"] = bridge
                state.set(realtime=True, realtime_device=rt["bridge_info"].get("device_name"))
            except Exception as e:
                state.set(error=f"audio bridge setup failed: {e} — falling back to transcribe")
                rt["enabled"] = False

    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        state.set(error=f"playwright not installed: {e}", exited=True)
        sys.stderr.write(
            "google_meet bot: playwright is not installed. Run "
            "`pip install playwright && python -m playwright install chromium`\n"
        )
        _teardown_realtime(rt, stop_flag)
        return 3

    # Chrome env: if realtime is live on Linux, point PULSE_SOURCE at the
    # virtual source so Chrome's fake mic reads the audio we generate.
    chrome_env = os.environ.copy()
    chrome_args = [
        "--use-fake-ui-for-media-stream",
        "--disable-blink-features=AutomationControlled",
    ]
    if not rt["enabled"]:
        # v1-style fake device (silence) — we don't care about mic content
        # when we're not speaking.
        chrome_args.insert(1, "--use-fake-device-for-media-stream")
    elif rt["bridge_info"] and rt["bridge_info"].get("platform") == "linux":
        chrome_env["PULSE_SOURCE"] = rt["bridge_info"].get("device_name", "")

    try:
        with sync_playwright() as pw:
            # launch() is called without an env argument, so export the
            # prepared environment into this process for Chrome to inherit.
            os.environ.update(chrome_env)
            browser = pw.chromium.launch(
                headless=not headed,
                args=chrome_args,
            )
            context_args = {
                "viewport": {"width": 1280, "height": 800},
                "user_agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
                ),
                "permissions": ["microphone", "camera"],
            }
            if auth_state and Path(auth_state).is_file():
                context_args["storage_state"] = auth_state
            context = browser.new_context(**context_args)
            page = context.new_page()

            try:
                page.goto(url, wait_until="domcontentloaded", timeout=30_000)
            except Exception as e:
                state.set(error=f"navigate failed: {e}", exited=True)
                return 4

            # Guest-mode: Meet shows a name field before "Ask to join". When
            # we're authed, we instead see "Join now".
            _try_guest_name(page, guest_name)
            _click_join(page, state)

            # Install caption observer and attempt to enable captions.
            try:
                page.evaluate(_enable_captions_js())
                state.set(captions_enabled_attempted=True)
            except Exception:
                pass
            try:
                page.evaluate(_CAPTION_OBSERVER_JS)
            except Exception as e:
                state.set(error=f"caption observer install failed: {e}")

            # Note: in_call stays False until admission is confirmed by
            # _detect_admission in the loop below.
            state.set(captioning=True, join_attempted_at=time.time())

            # v2 realtime: start the speaker thread and PCM pump.
            if rt["enabled"]:
                _start_realtime_speaker(
                    rt=rt,
                    out_dir=out_dir,
                    bridge_info=rt["bridge_info"],
                    api_key=realtime_api_key,
                    model=realtime_model,
                    voice=realtime_voice,
                    instructions=realtime_instructions,
                    stop_flag=stop_flag,
                    state=state,
                )
                if rt["session"] is not None:
                    state.set(realtime_ready=True)

            # Admission + drain loop. Runs until the stop flag is set, the
            # duration expires, the lobby times out, admission is denied or
            # the page closes. Each pass:
            #   * checks admission (every ~3s until admitted),
            #   * drains scraped captions into the transcript,
            #   * cancels in-flight realtime audio when a human speaks,
            #   * copies realtime counters into status.json.
            deadline = (time.time() + duration_s) if duration_s else None
            try:
                lobby_timeout = float(os.environ.get("HERMES_MEET_LOBBY_TIMEOUT", "300"))
            except ValueError:
                # A malformed override must not abort an already-joined run.
                lobby_timeout = 300.0
            lobby_deadline = time.time() + lobby_timeout
            last_admission_check = 0.0
            while not stop_flag["stop"]:
                now = time.time()
                if deadline and now > deadline:
                    state.set(leave_reason="duration_expired")
                    break

                # Admission detection every ~3s until admitted.
                if not state.in_call and (now - last_admission_check) > 3.0:
                    last_admission_check = now
                    admitted = _detect_admission(page)
                    if admitted:
                        state.set(
                            in_call=True,
                            lobby_waiting=False,
                            joined_at=now,
                        )
                    elif now > lobby_deadline:
                        state.set(
                            error=(
                                "lobby timeout — host never admitted the bot "
                                f"within {int(lobby_deadline - state.join_attempted_at) if state.join_attempted_at else 0}s"
                            ),
                            leave_reason="lobby_timeout",
                        )
                        break
                    elif _detect_denied(page):
                        state.set(
                            error="host denied admission",
                            leave_reason="denied",
                        )
                        break

                try:
                    queued = page.evaluate("window.__hermesMeetDrain && window.__hermesMeetDrain()")
                    if isinstance(queued, list):
                        for entry in queued:
                            if not isinstance(entry, dict):
                                continue
                            speaker = str(entry.get("speaker", ""))
                            text = str(entry.get("text", ""))
                            state.record_caption(speaker=speaker, text=text)
                            # Barge-in: when a caption is attributed to
                            # someone other than the bot, cancel the
                            # in-flight response so we don't talk over them.
                            if rt["enabled"] and rt["session"] is not None:
                                if _looks_like_human_speaker(speaker, guest_name):
                                    try:
                                        cancelled = rt["session"].cancel_response()
                                        if cancelled:
                                            state.set(last_barge_in_at=now)
                                    except Exception:
                                        pass
                except Exception:
                    # Meet reloaded or we got booted — exit when the page is
                    # gone rather than spinning; otherwise retry next pass.
                    if page.is_closed():
                        state.set(leave_reason="page_closed")
                        break

                # Fold the realtime session's byte/timestamp counters into
                # the status file.
                if rt["session"] is not None:
                    state.set(
                        audio_bytes_out=getattr(rt["session"], "audio_bytes_out", 0),
                        last_audio_out_at=getattr(rt["session"], "last_audio_out_at", None),
                    )

                time.sleep(1.0)

            # Try to leave cleanly — click "Leave call" button if present.
            try:
                page.evaluate(
                    "() => { const b = document.querySelector('button[aria-label*=\"eave call\"]');"
                    " if (b) b.click(); }"
                )
            except Exception:
                pass

            context.close()
            browser.close()
            # Release realtime resources before reporting exited=True so a
            # reader of status.json never sees "exited" with audio still up.
            _teardown_realtime(rt, stop_flag)
            state.set(in_call=False, captioning=False, exited=True)
            return 0

    except Exception as e:
        state.set(error=f"unhandled: {e}", exited=True)
        return 1
    finally:
        # Safety net for the early-return and exception paths; a no-op when
        # the normal path above already ran it.
        _teardown_realtime(rt, stop_flag)
def _detect_denied(page) -> bool:
    """Report whether the page text shows a denied/removed/no-response notice.

    Evaluates a small script in the page that tests ``document.body``'s text
    against three fixed English phrases.  Any evaluation error is treated as
    "not denied".
    """
    probe = r"""
    (() => {
      const text = document.body ? document.body.innerText || '' : '';
      // English only — matches what shows up when the host denies or
      // removes a guest.
      if (/You can't join this video call/i.test(text)) return true;
      if (/You were removed from the meeting/i.test(text)) return true;
      if (/No one responded to your request to join/i.test(text)) return true;
      return false;
    })();
    """
    try:
        verdict = page.evaluate(probe)
    except Exception:
        return False
    return bool(verdict)
+ """ + if not speaker or not speaker.strip(): + return False + spk = speaker.strip().lower() + if spk in ("unknown", "you", bot_guest_name.strip().lower()): + return False + return True + + +def _click_join(page, state: _BotState) -> None: + """Click 'Join now' or 'Ask to join' if either button is visible. + + Flags ``lobby_waiting`` when we hit the "waiting for host to admit you" + state so the agent can surface that in status. + """ + for label in ("Join now", "Ask to join"): + try: + btn = page.get_by_role("button", name=label, exact=False).first + if btn.count() and btn.is_visible(): + btn.click(timeout=3_000) + if label == "Ask to join": + state.set(lobby_waiting=True) + break + except Exception: + continue + + +def _parse_duration(raw: str) -> Optional[float]: + """Parse ``30m`` / ``2h`` / ``90`` (seconds) → float seconds, or None.""" + if not raw: + return None + raw = raw.strip().lower() + try: + if raw.endswith("h"): + return float(raw[:-1]) * 3600 + if raw.endswith("m"): + return float(raw[:-1]) * 60 + if raw.endswith("s"): + return float(raw[:-1]) + return float(raw) + except ValueError: + return None + + +if __name__ == "__main__": # pragma: no cover — subprocess entry point + sys.exit(run_bot()) diff --git a/plugins/google_meet/node/__init__.py b/plugins/google_meet/node/__init__.py new file mode 100644 index 0000000000..338203b329 --- /dev/null +++ b/plugins/google_meet/node/__init__.py @@ -0,0 +1,54 @@ +"""Remote 'node host' primitive for the google_meet plugin. + +Lets the Meet bot (Playwright + Chrome) run on a different machine than +the hermes-agent gateway. The gateway speaks a small JSON-over-WebSocket +RPC protocol to the remote node; the node wraps the existing +``plugins.google_meet.process_manager`` API. + +Topology +-------- + gateway (Linux) ── ws://mac.local:18789 ──▶ node server (Mac) + └─ process_manager + └─ meet_bot (Playwright) + +Why: Google sign-in + Chrome profile live on the user's laptop. 
Running +the bot there reuses that profile without shipping credentials to the +server. + +Public surface +-------------- + NodeClient — gateway-side RPC client (short-lived sync WS per call) + NodeServer — long-running server that hosts the bot + NodeRegistry — local JSON registry of approved nodes (name → url+token) + protocol — message envelope helpers (make_request, encode, decode, ...) +""" + +from __future__ import annotations + +from plugins.google_meet.node import protocol +from plugins.google_meet.node.client import NodeClient +from plugins.google_meet.node.protocol import ( + VALID_REQUEST_TYPES, + decode, + encode, + make_error, + make_request, + make_response, + validate_request, +) +from plugins.google_meet.node.registry import NodeRegistry +from plugins.google_meet.node.server import NodeServer + +__all__ = [ + "NodeClient", + "NodeServer", + "NodeRegistry", + "protocol", + "make_request", + "make_response", + "make_error", + "encode", + "decode", + "validate_request", + "VALID_REQUEST_TYPES", +] diff --git a/plugins/google_meet/node/cli.py b/plugins/google_meet/node/cli.py new file mode 100644 index 0000000000..4e10161e0c --- /dev/null +++ b/plugins/google_meet/node/cli.py @@ -0,0 +1,125 @@ +"""`hermes meet node ...` subcommand tree. + +Wired into the existing ``hermes meet`` parser by the plugin's top-level +CLI. This module only defines the subparsers and their dispatch — it +does not mutate the existing cli.py. +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +from typing import Any + +from plugins.google_meet.node.client import NodeClient +from plugins.google_meet.node.registry import NodeRegistry +from plugins.google_meet.node.server import NodeServer + + +def register_cli(subparser: argparse.ArgumentParser) -> None: + """Add ``run / list / approve / remove / status / ping`` subparsers. 
def node_command(args: argparse.Namespace) -> int:
    """Dispatch for ``hermes meet node ...``.

    Reads ``args.node_cmd`` and runs the matching action:

    * ``run`` — start a :class:`NodeServer` and block until interrupted.
    * ``list`` / ``approve`` / ``remove`` — read or edit the local
      :class:`NodeRegistry`.
    * ``status`` / ``ping`` — ping a registered node and print a JSON line.

    Returns:
        A process exit code: 0 on success, 1 when the named node is
        missing, unreachable or cannot be registered, 2 for an unknown
        command or a server start-up failure. Output goes to
        stdout/stderr.
    """
    cmd = getattr(args, "node_cmd", None)

    # Reject unknown commands before touching the registry or the network.
    if cmd not in ("run", "list", "approve", "remove", "status", "ping"):
        print(f"unknown node command: {cmd!r}", file=sys.stderr)
        return 2

    if cmd == "run":
        server = NodeServer(
            host=args.host,
            port=args.port,
            display_name=args.display_name,
        )
        token = server.ensure_token()
        print(f"[meet-node] display_name={server.display_name}")
        print(f"[meet-node] listening on ws://{args.host}:{args.port}")
        print(f"[meet-node] token (copy to gateway): {token}")
        print("[meet-node] approve with:")
        # `approve` takes three positionals (name, url, token); the hint
        # must show all of them, with placeholders the operator fills in.
        print(
            f"  hermes meet node approve <name> ws://<host>:{args.port} {token}"
        )
        try:
            asyncio.run(server.serve())
        except KeyboardInterrupt:
            return 0
        except (RuntimeError, OSError) as exc:
            # RuntimeError: missing 'websockets'. OSError: bind failure
            # (port already in use, bad host).
            print(f"[meet-node] error: {exc}", file=sys.stderr)
            return 2
        return 0

    reg = NodeRegistry()

    if cmd == "list":
        nodes = reg.list_all()
        if not nodes:
            print("no nodes registered")
            return 0
        for n in nodes:
            # Show only a token prefix; tolerate hand-edited entries that
            # lack a field instead of crashing with KeyError.
            token_hint = str(n.get("token", ""))[:6]
            print(f"{n.get('name')}\t{n.get('url')}\ttoken={token_hint}…")
        return 0

    if cmd == "approve":
        try:
            reg.add(args.name, args.url, args.token)
        except ValueError as exc:
            # The registry rejects empty name/url/token.
            print(f"cannot approve node: {exc}", file=sys.stderr)
            return 1
        print(f"approved node {args.name!r} at {args.url}")
        return 0

    if cmd == "remove":
        ok = reg.remove(args.name)
        print(f"removed {args.name!r}" if ok else f"no such node: {args.name!r}")
        return 0 if ok else 1

    # Remaining commands: "status" and its alias "ping".
    entry = reg.get(args.name)
    if entry is None:
        print(f"no such node: {args.name!r}", file=sys.stderr)
        return 1
    client = NodeClient(entry["url"], entry["token"])
    try:
        result = client.ping()
    except Exception as exc:  # noqa: BLE001 — surface any connection error
        print(json.dumps({"ok": False, "error": str(exc)}))
        return 1
    print(json.dumps({"ok": True, "node": args.name, **_coerce_dict(result)}))
    return 0
class NodeClient:
    """Synchronous, one-shot WebSocket client for a remote meet node.

    Every public method opens a connection, sends one request envelope,
    reads one reply and closes the connection again.
    """

    def __init__(self, url: str, token: str, timeout: float = 10.0) -> None:
        """Store connection settings; *url* and *token* must be non-empty strings."""
        for label, value in (("url", url), ("token", token)):
            if not (isinstance(value, str) and value):
                raise ValueError(f"{label} must be a non-empty string")
        self.url = url
        self.token = token
        self.timeout = float(timeout)

    # ----- core RPC -----------------------------------------------------

    def _rpc(self, type: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Perform one request/response round trip and return the reply payload.

        Raises:
            RuntimeError: ``websockets`` is not installed, the node replied
                with an ``error`` envelope, the reply id differs from the
                request id, or the reply carries no payload dict.
        """
        try:
            from websockets.sync.client import connect  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "NodeClient requires the 'websockets' package. "
                "Install it with: pip install websockets"
            ) from exc

        request = _proto.make_request(type, self.token, payload)
        wire = _proto.encode(request)

        # The same timeout bounds connecting, receiving and closing.
        with connect(
            self.url,
            open_timeout=self.timeout,
            close_timeout=self.timeout,
        ) as sock:
            sock.send(wire)
            answer = sock.recv(timeout=self.timeout)

        text = (
            answer.decode("utf-8")
            if isinstance(answer, (bytes, bytearray))
            else answer
        )
        reply = _proto.decode(text)

        if reply.get("type") == "error":
            raise RuntimeError(f"node error: {reply.get('error', '')}")
        sent_id = request["id"]
        if reply.get("id") != sent_id:
            raise RuntimeError(
                f"response id mismatch: sent {sent_id}, got {reply.get('id')!r}"
            )
        body = reply.get("payload")
        if isinstance(body, dict):
            return body
        raise RuntimeError("response missing payload dict")

    # ----- convenience methods -----------------------------------------

    def start_bot(
        self,
        url: str,
        guest_name: str = "Hermes Agent",
        duration: Optional[str] = None,
        headed: bool = False,
        mode: str = "transcribe",
    ) -> Dict[str, Any]:
        """Send ``start_bot``; ``duration`` is included only when given."""
        optional: Dict[str, Any] = {} if duration is None else {"duration": duration}
        return self._rpc(
            "start_bot",
            {
                "url": url,
                "guest_name": guest_name,
                "headed": bool(headed),
                "mode": mode,
                **optional,
            },
        )

    def stop(self) -> Dict[str, Any]:
        """Send ``stop`` with an empty payload."""
        return self._rpc("stop", {})

    def status(self) -> Dict[str, Any]:
        """Send ``status`` with an empty payload."""
        return self._rpc("status", {})

    def transcript(self, last: Optional[int] = None) -> Dict[str, Any]:
        """Send ``transcript``; ``last`` is coerced to int when given."""
        request_body: Dict[str, Any] = (
            {"last": int(last)} if last is not None else {}
        )
        return self._rpc("transcript", request_body)

    def say(self, text: str) -> Dict[str, Any]:
        """Send ``say`` with *text* coerced to ``str``."""
        return self._rpc("say", {"text": str(text)})

    def ping(self) -> Dict[str, Any]:
        """Send ``ping`` with an empty payload."""
        return self._rpc("ping", {})
def make_response(req_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """Build a success response envelope for the request *req_id*.

    The envelope type is always the literal string ``"response"``: this
    function does not receive the request type, so it does not emit a
    per-request ``<type>_res`` name. Callers that need to match a reply
    to its request should compare ``id``.

    Args:
        req_id: The ``id`` of the request being answered; copied verbatim
            into the envelope.
        payload: Response body.

    Returns:
        ``{"type": "response", "id": req_id, "payload": payload}``.

    Raises:
        ValueError: *payload* is not a dict.
    """
    if not isinstance(payload, dict):
        raise ValueError("payload must be a dict")
    return {"type": "response", "id": req_id, "payload": payload}
class NodeRegistry:
    """Simple file-backed registry of approved nodes (name → url + token).

    Not concurrent-safe across processes — single writer assumed (the
    gateway CLI). The file stores bearer tokens, so it is written with
    owner-only permissions.
    """

    def __init__(self, path: Optional[Path] = None) -> None:
        """Use *path* as the registry file, or the default location when None."""
        self.path = Path(path) if path is not None else _default_path()

    # ----- storage ------------------------------------------------------

    def _load(self) -> Dict[str, Any]:
        """Read the registry file; any unreadable or malformed file is
        treated as an empty registry."""
        if not self.path.is_file():
            return {"nodes": {}}
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            return {"nodes": {}}
        if not isinstance(data, dict) or not isinstance(data.get("nodes"), dict):
            return {"nodes": {}}
        return data

    def _save(self, data: Dict[str, Any]) -> None:
        """Write *data* via a temp file + rename so readers never see a
        half-written registry."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix(".json.tmp")
        # Create the temp file owner-only *before* any token is written to
        # it; chmod also covers a leftover temp file from an earlier crash.
        tmp.touch(mode=0o600, exist_ok=True)
        try:
            tmp.chmod(0o600)
        except OSError:
            pass  # best effort on filesystems without POSIX modes
        tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
        tmp.replace(self.path)

    # ----- public API ---------------------------------------------------

    def get(self, name: str) -> Optional[Dict[str, Any]]:
        """Return the entry for *name* with a ``name`` key added, or None
        when it is absent or not a JSON object."""
        entry = self._load()["nodes"].get(name)
        if not isinstance(entry, dict):
            return None
        return {"name": name, **entry}

    def add(self, name: str, url: str, token: str) -> None:
        """Insert or overwrite the node *name*.

        Raises:
            ValueError: any of *name*, *url*, *token* is not a non-empty
                string.
        """
        if not isinstance(name, str) or not name:
            raise ValueError("node name must be a non-empty string")
        if not isinstance(url, str) or not url:
            raise ValueError("url must be a non-empty string")
        if not isinstance(token, str) or not token:
            raise ValueError("token must be a non-empty string")
        data = self._load()
        data["nodes"][name] = {
            "url": url,
            "token": token,
            "added_at": time.time(),
        }
        self._save(data)

    def remove(self, name: str) -> bool:
        """Delete the node *name*; return whether it existed."""
        data = self._load()
        if name not in data["nodes"]:
            return False
        del data["nodes"][name]
        self._save(data)
        return True

    def list_all(self) -> List[Dict[str, Any]]:
        """Return every well-formed entry, sorted by node name."""
        nodes = self._load()["nodes"]
        # Skip entries that are not JSON objects (hand-edited file) rather
        # than failing on the ``**entry`` unpacking.
        return [
            {"name": name, **nodes[name]}
            for name in sorted(nodes)
            if isinstance(nodes[name], dict)
        ]

    def resolve(self, chrome_node: Optional[str]) -> Optional[Dict[str, Any]]:
        """Resolve a node name to its entry.

        If ``chrome_node`` is provided, return that named node (or None).
        If ``chrome_node`` is None or empty, return the sole registered
        node when exactly one is registered; otherwise return None
        (ambiguous or empty).
        """
        if chrome_node:
            return self.get(chrome_node)
        nodes = self.list_all()
        return nodes[0] if len(nodes) == 1 else None
class NodeServer:
    """WebSocket server that executes meet bot RPCs locally.

    Each incoming message is decoded, checked against the shared token and
    dispatched to ``plugins.google_meet.process_manager``.
    """

    def __init__(
        self,
        host: str = "0.0.0.0",
        port: int = 18789,
        token_path: Optional[Path] = None,
        display_name: str = "hermes-meet-node",
    ) -> None:
        """Store listen address, display name and token file location."""
        self.host = host
        self.port = port
        self.display_name = display_name
        self.token_path = Path(token_path) if token_path is not None else _default_token_path()
        self._token: Optional[str] = None

    # ----- token management --------------------------------------------

    def _read_stored_token(self) -> Optional[str]:
        """Return the token persisted on disk, or None when the file is
        missing, unreadable or does not hold a non-empty string token."""
        if not self.token_path.is_file():
            return None
        try:
            data = json.loads(self.token_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            return None
        tok = data.get("token") if isinstance(data, dict) else None
        return tok if isinstance(tok, str) and tok else None

    def ensure_token(self) -> str:
        """Return the persisted shared secret, generating one on first use."""
        if self._token:
            return self._token
        stored = self._read_stored_token()
        if stored:
            self._token = stored
            return stored
        tok = secrets.token_hex(16)  # 32 hex chars
        self.token_path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.token_path.with_suffix(".json.tmp")
        # The file holds a bearer secret: make it owner-only before the
        # secret is written, then rename it into place.
        tmp.touch(mode=0o600, exist_ok=True)
        try:
            tmp.chmod(0o600)
        except OSError:
            pass  # best effort on filesystems without POSIX modes
        tmp.write_text(
            json.dumps({"token": tok, "generated_at": time.time()}, indent=2),
            encoding="utf-8",
        )
        tmp.replace(self.token_path)
        self._token = tok
        return tok

    def get_token(self) -> str:
        """Alias for :meth:`ensure_token`; does not mutate on subsequent calls."""
        return self.ensure_token()

    # ----- dispatch -----------------------------------------------------

    async def _handle_request(self, msg: Dict[str, Any]) -> Dict[str, Any]:
        """Validate + dispatch a single decoded request envelope.

        Returns a response envelope for every request that passes or fails
        validation; exceptions raised while dispatching are converted into
        an error envelope. Failures reported by process_manager itself
        travel inside the response payload (its ``ok``/``error`` keys).
        """
        expected = self.ensure_token()
        ok, reason = _proto.validate_request(msg, expected)
        if not ok:
            return _proto.make_error(str(msg.get("id") or ""), reason)

        req_id = msg["id"]
        t = msg["type"]
        payload = msg["payload"]

        # Import lazily so test mocks can monkeypatch freely.
        from plugins.google_meet import process_manager as pm

        try:
            if t == "ping":
                return {"type": "pong", "id": req_id,
                        "payload": {"display_name": self.display_name,
                                    "ts": time.time()}}
            if t == "start_bot":
                # Whitelist kwargs we pass through to pm.start. "mode" must
                # be forwarded: the client sends it and without it a
                # remote bot always starts in transcribe mode.
                kwargs = {
                    k: payload[k]
                    for k in ("url", "guest_name", "duration", "headed",
                              "auth_state", "session_id", "out_dir", "mode")
                    if k in payload
                }
                if "url" not in kwargs:
                    return _proto.make_error(req_id, "missing 'url' in payload")
                if kwargs.get("out_dir") is not None:
                    # JSON delivers a string; pm.start calls Path methods.
                    kwargs["out_dir"] = Path(kwargs["out_dir"])
                else:
                    kwargs.pop("out_dir", None)
                result = pm.start(**kwargs)
                return _proto.make_response(req_id, result)
            if t == "stop":
                reason_arg = payload.get("reason", "requested")
                result = pm.stop(reason=reason_arg)
                return _proto.make_response(req_id, result)
            if t == "status":
                return _proto.make_response(req_id, pm.status())
            if t == "transcript":
                last = payload.get("last")
                result = pm.transcript(last=last)
                return _proto.make_response(req_id, result)
            if t == "say":
                # Append the text to say_queue.jsonl inside the active
                # meeting's out_dir when there is one; "enqueued" tells the
                # client whether the line was actually written.
                text = payload.get("text", "")
                active = pm._read_active()  # type: ignore[attr-defined]
                enqueued = False
                if active and active.get("out_dir"):
                    queue = Path(active["out_dir"]) / "say_queue.jsonl"
                    try:
                        queue.parent.mkdir(parents=True, exist_ok=True)
                        with queue.open("a", encoding="utf-8") as fh:
                            fh.write(json.dumps({"text": text, "ts": time.time()}) + "\n")
                        enqueued = True
                    except OSError:
                        enqueued = False
                return _proto.make_response(
                    req_id,
                    {"ok": True, "enqueued": enqueued, "text": text},
                )
        except Exception as exc:  # noqa: BLE001 — surface any pm crash to client
            return _proto.make_error(req_id, f"{type(exc).__name__}: {exc}")

        return _proto.make_error(req_id, f"unhandled type: {t!r}")

    # ----- server loop --------------------------------------------------

    async def serve(self) -> None:
        """Run the WebSocket server until cancelled.

        Blocks forever. Callers typically wrap this in ``asyncio.run``.

        Raises:
            RuntimeError: the ``websockets`` package is not installed.
        """
        try:
            import websockets  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "NodeServer.serve requires the 'websockets' package. "
                "Install it with: pip install websockets"
            ) from exc

        self.ensure_token()

        async def _handler(ws):
            # One reply per incoming message; a message that fails to
            # decode gets an error envelope with an empty id.
            async for raw in ws:
                try:
                    msg = _proto.decode(raw if isinstance(raw, str) else raw.decode("utf-8"))
                except ValueError as exc:
                    await ws.send(_proto.encode(_proto.make_error("", f"decode: {exc}")))
                    continue
                reply = await self._handle_request(msg)
                await ws.send(_proto.encode(reply))

        async with websockets.serve(_handler, self.host, self.port):
            # Run until cancelled.
            import asyncio
            await asyncio.Future()
v1 transcribe-only is the default; v2 realtime duplex audio via OpenAI Realtime + BlackHole/PulseAudio ships with mode='realtime'; v3 remote node host lets the bot run on a different machine than the gateway (gateway on Linux, Chrome+signed-in profile on the user's Mac). Explicit-by-design: only joins meet.google.com URLs passed in \u2014 no calendar scanning, no auto-dial." +author: NousResearch +kind: standalone +platforms: + - linux + - macos +provides_tools: + - meet_join + - meet_leave + - meet_status + - meet_transcript + - meet_say +hooks: + - on_session_end diff --git a/plugins/google_meet/process_manager.py b/plugins/google_meet/process_manager.py new file mode 100644 index 0000000000..a5da48b83b --- /dev/null +++ b/plugins/google_meet/process_manager.py @@ -0,0 +1,326 @@ +"""Subprocess lifecycle manager for the google_meet bot. + +Single active meeting at a time. Stores the running pid + out_dir in a +session-scoped state file under ``$HERMES_HOME/workspace/meetings/.active.json`` +so tool calls across turns can find the bot, and ``on_session_end`` can clean +it up. + +The bot runs as a detached subprocess — we don't hold file descriptors open, +so the parent agent loop can't block on it. We communicate via files only. 
def _read_active() -> Optional[Dict[str, Any]]:
    """Load the active-meeting pointer file.

    Returns:
        The parsed JSON object, or ``None`` when the file is missing,
        unreadable, not valid JSON, or valid JSON that is not an object.
        Callers call ``.get`` on the result, so any other JSON value is
        treated the same as a corrupt file.
    """
    p = _active_file()
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: bad JSON or bad
        # UTF-8 (JSONDecodeError and UnicodeDecodeError both derive from it).
        return None
    return data if isinstance(data, dict) else None
def start(
    url: str,
    *,
    out_dir: Optional[Path] = None,
    headed: bool = False,
    auth_state: Optional[str] = None,
    guest_name: str = "Hermes Agent",
    duration: Optional[str] = None,
    session_id: Optional[str] = None,
    mode: str = "transcribe",
    realtime_model: Optional[str] = None,
    realtime_voice: Optional[str] = None,
    realtime_instructions: Optional[str] = None,
    realtime_api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """Spawn the meet_bot subprocess for *url*.

    If a bot is already running for this hermes install, leave it first —
    we enforce single-active-meeting semantics.

    Args:
        url: Meeting URL; rejected unless ``_is_safe_meet_url`` accepts it.
        out_dir: Directory for the bot's files. A ``Path`` or a plain
            string (as delivered by the remote-node RPC); defaults to
            ``<meetings root>/<meeting id>``.
        headed, auth_state, guest_name, duration, mode, realtime_*:
            Passed to the bot through ``HERMES_MEET_*`` environment
            variables; unset/falsy values are not exported.
        session_id: Stored in the active-meeting record.

    Returns:
        ``{"ok": True, ...record}`` describing the started bot, or
        ``{"ok": False, "error": ...}`` when the URL is refused.
    """
    from plugins.google_meet.meet_bot import _is_safe_meet_url, _meeting_id_from_url

    if not _is_safe_meet_url(url):
        return {
            "ok": False,
            "error": (
                "refusing: only https://meet.google.com/ URLs are allowed. "
                "got: " + repr(url)
            ),
        }

    existing = _read_active()
    if existing:
        try:
            existing_pid = int(existing.get("pid", 0))
        except (TypeError, ValueError):
            existing_pid = 0
        # Only probe real pids: os.kill(0, ...) addresses our own process
        # group and negative pids address groups / every process.
        if existing_pid > 0 and _pid_alive(existing_pid):
            stop(reason="replaced by new meet_join")

    meeting_id = _meeting_id_from_url(url)
    # out_dir may arrive as a str (JSON payload from a remote gateway).
    out = Path(out_dir) if out_dir else (_root() / meeting_id)
    out.mkdir(parents=True, exist_ok=True)

    # Wipe any stale transcript/status files from a previous run of this
    # meeting id so polling isn't confused.
    for name in ("transcript.txt", "status.json"):
        try:
            (out / name).unlink()
        except OSError:
            pass  # missing file or not removable — best effort

    env = os.environ.copy()
    env["HERMES_MEET_URL"] = url
    env["HERMES_MEET_OUT_DIR"] = str(out)
    env["HERMES_MEET_GUEST_NAME"] = guest_name
    if headed:
        env["HERMES_MEET_HEADED"] = "1"
    # Optional settings: exported only when set. HERMES_MEET_MODE left
    # unset means the bot falls back to its own default mode.
    optional_env = {
        "HERMES_MEET_AUTH_STATE": auth_state,
        "HERMES_MEET_DURATION": duration,
        "HERMES_MEET_MODE": mode,
        "HERMES_MEET_REALTIME_MODEL": realtime_model,
        "HERMES_MEET_REALTIME_VOICE": realtime_voice,
        "HERMES_MEET_REALTIME_INSTRUCTIONS": realtime_instructions,
        "HERMES_MEET_REALTIME_KEY": realtime_api_key,
    }
    for key, value in optional_env.items():
        if value:
            env[key] = value

    log_path = out / "bot.log"
    # Detach: stdin=devnull, stdout/stderr → log file, new session so parent
    # signals don't propagate. The child inherits the log fd, so our handle
    # can be closed as soon as Popen returns (or fails).
    with open(log_path, "ab", buffering=0) as log_fh:
        proc = subprocess.Popen(
            [sys.executable, "-m", "plugins.google_meet.meet_bot"],
            stdin=subprocess.DEVNULL,
            stdout=log_fh,
            stderr=subprocess.STDOUT,
            env=env,
            start_new_session=True,
            close_fds=True,
        )

    record = {
        "pid": proc.pid,
        "meeting_id": meeting_id,
        "out_dir": str(out),
        "url": url,
        "started_at": time.time(),
        "session_id": session_id,
        "log_path": str(log_path),
        "mode": mode,
    }
    _write_active(record)
    return {"ok": True, **record}
def stop(*, reason: str = "requested") -> Dict[str, Any]:
    """Signal the active bot to leave cleanly, then clear the active pointer.

    Sends SIGTERM and polls for up to 10s (20 polls, 0.5s apart) for the
    bot to exit, falling back to SIGKILL if it is still running. The active
    pointer is cleared even when signalling fails.

    Args:
        reason: Free-form text echoed back in the result.

    Returns:
        ``{"ok": False, "reason": "no active meeting"}`` when nothing is
        active, otherwise ``{"ok": True, "reason", "meetingId",
        "transcriptPath"}``.
    """
    active = _read_active()
    if not active:
        return {"ok": False, "reason": "no active meeting"}

    # ``or 0`` also covers a record whose pid is present but null.
    pid = int(active.get("pid") or 0)
    out_dir = active.get("out_dir")
    transcript_path = Path(out_dir) / "transcript.txt" if out_dir else None

    def _signal(sig: int) -> None:
        # ProcessLookupError (already gone) and PermissionError (pid now
        # belongs to someone else) are both OSError; neither may prevent
        # the active pointer from being cleared below.
        try:
            os.kill(pid, sig)
        except OSError:
            pass

    def _reaped() -> bool:
        # Collect the exit status if the bot is a child of this process.
        # NOTE(review): assumes _pid_alive is a signal-0 style probe, which
        # would keep reporting an exited-but-unreaped child as alive —
        # confirm against its implementation.
        try:
            done_pid, _status = os.waitpid(pid, os.WNOHANG)
        except OSError:
            # Not our child (ChildProcessError) or already collected.
            return False
        return done_pid == pid

    # pid must be strictly positive: os.kill() treats 0 and negative values
    # as "process group" / "every process" selectors.
    if pid > 0 and _pid_alive(pid):
        _signal(signal.SIGTERM)
        exited = False
        for _ in range(20):
            if _reaped() or not _pid_alive(pid):
                exited = True
                break
            time.sleep(0.5)
        if not exited and not _reaped() and _pid_alive(pid):
            _signal(signal.SIGKILL)

    _clear_active()
    return {
        "ok": True,
        "reason": reason,
        "meetingId": active.get("meeting_id"),
        "transcriptPath": str(transcript_path) if transcript_path else None,
    }
The ``websockets`` package is imported lazily so +that importing this module never fails just because the optional dep +is missing. +""" + +from __future__ import annotations + +import base64 +import json +import time +import uuid +from pathlib import Path +from typing import Any, Callable, Optional + + +REALTIME_URL = "wss://api.openai.com/v1/realtime" + + +def _require_websockets(): + """Import ``websockets.sync.client.connect`` or raise with hint.""" + try: + from websockets.sync.client import connect as _connect # type: ignore + except ImportError as exc: # pragma: no cover - exercised via test + raise RuntimeError( + "websockets package is required for OpenAI Realtime; " + "install with: pip install websockets" + ) from exc + return _connect + + +class RealtimeSession: + """Minimal sync client for the OpenAI Realtime WebSocket API. + + Usage: + sess = RealtimeSession(api_key=..., audio_sink_path=Path("out.pcm")) + sess.connect() + sess.speak("Hello team.") + sess.close() + + Thread safety: ``speak`` and ``cancel_response`` may be called from + different threads; a lock serializes WebSocket writes. + """ + + def __init__( + self, + api_key: str, + model: str = "gpt-realtime", + voice: str = "alloy", + instructions: str = "", + audio_sink_path: Optional[Path] = None, + sample_rate: int = 24000, + ) -> None: + import threading as _threading + self.api_key = api_key + self.model = model + self.voice = voice + self.instructions = instructions + self.audio_sink_path = Path(audio_sink_path) if audio_sink_path else None + self.sample_rate = sample_rate + self._ws: Any = None + self._send_lock = _threading.Lock() + self._last_response_id: Optional[str] = None + # Public counters for status reporting. 
def connect(self) -> None:
    """Open the Realtime WebSocket and push the initial session settings.

    The socket is stored on ``self._ws``; a ``session.update`` event with
    the configured voice, instructions, modalities and pcm16 audio formats
    is sent immediately afterwards.

    Raises:
        RuntimeError: the ``websockets`` package is not installed.
    """
    ws_connect = _require_websockets()
    endpoint = f"{REALTIME_URL}?model={self.model}"
    auth_headers = [
        ("Authorization", f"Bearer {self.api_key}"),
        ("OpenAI-Beta", "realtime=v1"),
    ]
    # The keyword for custom headers differs between websockets releases:
    # newer ones take additional_headers=, others extra_headers=. Try the
    # newer spelling and fall back when it is rejected as an unknown kwarg.
    try:
        sock = ws_connect(endpoint, additional_headers=auth_headers)
    except TypeError:
        sock = ws_connect(endpoint, extra_headers=auth_headers)
    self._ws = sock

    session_settings = {
        "voice": self.voice,
        "instructions": self.instructions,
        "modalities": ["audio", "text"],
        "output_audio_format": "pcm16",
        "input_audio_format": "pcm16",
    }
    self._send_json({"type": "session.update", "session": session_settings})
+ """ + if self._ws is None: + raise RuntimeError("RealtimeSession.connect() must be called first") + + start = time.monotonic() + + self._send_json( + { + "type": "conversation.item.create", + "item": { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": text}], + }, + } + ) + self._send_json( + { + "type": "response.create", + "response": {"modalities": ["audio"]}, + } + ) + + bytes_written = 0 + sink_fp = None + if self.audio_sink_path is not None: + self.audio_sink_path.parent.mkdir(parents=True, exist_ok=True) + sink_fp = open(self.audio_sink_path, "ab") + + try: + while True: + remaining = timeout - (time.monotonic() - start) + if remaining <= 0: + raise TimeoutError( + f"realtime response did not complete within {timeout}s" + ) + raw = self._recv(timeout=remaining) + if raw is None: + # Connection closed by peer. + break + try: + frame = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw + except (TypeError, ValueError): + continue + if not isinstance(frame, dict): + continue + ftype = frame.get("type") + if ftype == "response.audio.delta": + b64 = frame.get("delta") or frame.get("audio") or "" + if b64 and sink_fp is not None: + try: + chunk = base64.b64decode(b64) + except (ValueError, TypeError): + chunk = b"" + if chunk: + sink_fp.write(chunk) + sink_fp.flush() + bytes_written += len(chunk) + self.audio_bytes_out += len(chunk) + self.last_audio_out_at = time.time() + elif ftype == "response.created": + rid = (frame.get("response") or {}).get("id") + if rid: + self._last_response_id = rid + elif ftype in ("response.done", "response.completed", "response.cancelled"): + break + elif ftype == "error": + err = frame.get("error") or frame + raise RuntimeError(f"realtime error: {err}") + # All other frames (response.created, response.output_item.*, + # response.audio_transcript.delta, rate_limits.updated, ...) + # are ignored for v2. 
class RealtimeSpeaker:
    """File-based JSONL queue wrapper around :class:`RealtimeSession`.

    Each line in ``queue_path`` is a JSON object of the form
    ``{"id": "...", "text": "..."}``. Processed lines are appended to
    ``processed_path`` (if set) and then removed from the queue; if
    ``processed_path`` is ``None``, processed lines are simply dropped.

    Entries written without an ``id`` receive one derived from the line's
    content, so the same line resolves to the same id on every read and can
    be matched again after it has been spoken.
    """

    def __init__(
        self,
        session: "RealtimeSession",
        queue_path: Path,
        processed_path: Optional[Path] = None,
    ) -> None:
        self.session = session
        self.queue_path = Path(queue_path)
        self.processed_path = Path(processed_path) if processed_path else None

    # ── helpers ──────────────────────────────────────────────────────────

    def _read_queue(self) -> list[dict]:
        """Return the queued entries in file order; ``[]`` if no queue file.

        Blank lines, lines that are not valid JSON and JSON values that are
        not objects are skipped.
        """
        try:
            raw = self.queue_path.read_text(encoding="utf-8", errors="replace")
        except FileNotFoundError:
            return []
        entries: list[dict] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except ValueError:
                continue
            if not isinstance(entry, dict):
                continue
            if "id" not in entry:
                # The id must be stable across reads: a random id would
                # differ on the post-speak re-read, the entry would never
                # be matched for removal, and it would be spoken again.
                entry["id"] = str(uuid.uuid5(uuid.NAMESPACE_OID, line))
            entries.append(entry)
        return entries

    def _rewrite_queue(self, remaining: list[dict]) -> None:
        """Replace the queue file's content with *remaining*."""
        if not remaining:
            # Keep the file but empty — consumers may be watching for
            # new writes via mtime, and delete-then-recreate is a race.
            self.queue_path.write_text("", encoding="utf-8")
            return
        self.queue_path.write_text(
            "\n".join(json.dumps(e) for e in remaining) + "\n",
            encoding="utf-8",
        )

    def _append_processed(self, entry: dict, result: dict) -> None:
        """Append one ``{"id", "text", "result"}`` record to the processed log."""
        if self.processed_path is None:
            return
        self.processed_path.parent.mkdir(parents=True, exist_ok=True)
        record = {"id": entry.get("id"), "text": entry.get("text", ""), "result": result}
        with open(self.processed_path, "a", encoding="utf-8") as fp:
            fp.write(json.dumps(record) + "\n")

    # ── main loop ────────────────────────────────────────────────────────

    def run_until_stopped(
        self,
        stop_fn: Callable[[], bool],
        poll_interval: float = 0.5,
    ) -> None:
        """Speak queued entries one at a time until ``stop_fn()`` is true.

        Args:
            stop_fn: Polled before each iteration; a true value ends the loop.
            poll_interval: Seconds to sleep when the queue is empty.
        """
        while not stop_fn():
            entries = self._read_queue()
            if not entries:
                time.sleep(poll_interval)
                continue
            # Process one at a time; re-check the queue file after each
            # speak() call because new entries may have arrived.
            head = entries[0]
            raw_text = head.get("text")
            if raw_text is not None and not isinstance(raw_text, str):
                # A malformed entry must not kill the speaker loop; record
                # the problem and drop the entry like any processed one.
                result = {"ok": False, "error": "text must be a string"}
            elif raw_text and raw_text.strip():
                try:
                    result = self.session.speak(raw_text.strip())
                except Exception as exc:
                    # Deliberately broad: one failed utterance is logged to
                    # the processed file and the loop keeps serving the queue.
                    result = {"ok": False, "error": str(exc)}
            else:
                result = {"ok": True, "bytes_written": 0, "duration_ms": 0.0}
            self._append_processed(head, result)

            # Re-read the queue from disk in case it was appended to while
            # we were speaking, then drop the first entry carrying the
            # head's id (normally the front of the queue).
            # NOTE: an entry appended between this re-read and the rewrite
            # below is lost; the two steps are kept adjacent to keep that
            # window small.
            latest = self._read_queue()
            head_id = head.get("id")
            for idx, queued in enumerate(latest):
                if queued.get("id") == head_id:
                    del latest[idx]
                    break
            self._rewrite_queue(latest)
+ + Gates on: + * Python ``playwright`` package importable + * the plugin being on a supported platform (Linux or macOS) + + Note: remote-node operation (``node=``) only needs the + ``websockets`` dep on the gateway side — Chromium lives on the node. + But the plugin-level gate keeps the v1 semantics; individual tool + handlers relax the requirement when a node is addressed. + """ + import platform as _p + if _p.system().lower() not in ("linux", "darwin"): + return False + try: + import playwright # noqa: F401 + except ImportError: + return False + return True + + +# --------------------------------------------------------------------------- +# Node client helper +# --------------------------------------------------------------------------- + +def _resolve_node_client(node: Optional[str]): + """Return (NodeClient, node_name) for *node*, or (None, None) to run local. + + Raises RuntimeError with a readable message if the node is named but + unresolvable, so the handler can surface a clear error to the agent. + """ + if node is None or node == "": + return None, None + from plugins.google_meet.node.registry import NodeRegistry + from plugins.google_meet.node.client import NodeClient + + reg = NodeRegistry() + entry = reg.resolve(node if node != "auto" else None) + if entry is None: + raise RuntimeError( + f"no registered meet node matches {node!r} — " + "run `hermes meet node approve ` first" + ) + client = NodeClient(url=entry["url"], token=entry["token"]) + return client, entry.get("name") + + +# --------------------------------------------------------------------------- +# Schemas +# --------------------------------------------------------------------------- + +MEET_JOIN_SCHEMA: Dict[str, Any] = { + "name": "meet_join", + "description": ( + "Join a Google Meet call and start scraping live captions into a " + "transcript file. Only meet.google.com URLs are accepted; no calendar " + "scanning, no auto-dial. 
Spawns a headless Chromium subprocess that " + "runs in parallel with the agent loop — returns immediately. Poll " + "with meet_status and read captions with meet_transcript. Reminder " + "to the agent: you should announce yourself in the meeting (there is " + "no automatic consent announcement)." + ), + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": ( + "Full https://meet.google.com/... URL. Required." + ), + }, + "mode": { + "type": "string", + "enum": ["transcribe", "realtime"], + "description": ( + "transcribe (default): listen-only, scrape captions. " + "realtime: also enable agent speech via meet_say " + "(requires OpenAI Realtime key + platform audio bridge)." + ), + }, + "guest_name": { + "type": "string", + "description": ( + "Display name to use when joining as guest. Defaults to " + "'Hermes Agent'." + ), + }, + "duration": { + "type": "string", + "description": ( + "Optional max duration before auto-leave (e.g. '30m', " + "'2h', '90s'). Omit to stay until meet_leave is called." + ), + }, + "headed": { + "type": "boolean", + "description": ( + "Run Chromium headed instead of headless (debug only). " + "Default false." + ), + }, + "node": { + "type": "string", + "description": ( + "Name of a registered remote node to run the bot on " + "(useful when the gateway runs on a headless Linux box " + "but the user's Chrome with a signed-in Google profile " + "lives on their Mac). Pass 'auto' to use the single " + "registered node. Default: run locally. Nodes are " + "approved via `hermes meet node approve`." + ), + }, + }, + "required": ["url"], + "additionalProperties": False, + }, +} + +MEET_STATUS_SCHEMA: Dict[str, Any] = { + "name": "meet_status", + "description": ( + "Report the current Meet session state — whether the bot is alive, " + "has joined, is sitting in the lobby, number of transcript lines " + "captured, and last-caption timestamp." 
# Tool schema for meet_transcript. ``node`` is documented so the model knows
# what the parameter selects, matching the description in MEET_JOIN_SCHEMA.
MEET_TRANSCRIPT_SCHEMA: Dict[str, Any] = {
    "name": "meet_transcript",
    "description": (
        "Read the scraped transcript for the active Meet session. Returns "
        "full transcript unless 'last' is set, in which case returns the last "
        "N lines only."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "last": {
                "type": "integer",
                "description": (
                    "Optional: return only the last N caption lines. Useful "
                    "for polling during a meeting without re-reading the "
                    "whole transcript."
                ),
                "minimum": 1,
            },
            "node": {
                "type": "string",
                "description": (
                    "Name of the registered remote node whose bot holds the "
                    "transcript. Pass 'auto' to use the single registered "
                    "node. Omit to read from the local bot."
                ),
            },
        },
        "additionalProperties": False,
    },
}

# Tool schema for meet_leave; ``node`` is documented the same way.
MEET_LEAVE_SCHEMA: Dict[str, Any] = {
    "name": "meet_leave",
    "description": (
        "Leave the active Meet call cleanly, stop caption scraping, and "
        "finalize the transcript file. Safe to call when no meeting is "
        "active — returns ok=false with a reason."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "node": {
                "type": "string",
                "description": (
                    "Name of the registered remote node whose bot should "
                    "leave. Pass 'auto' to use the single registered node. "
                    "Omit to stop the local bot."
                ),
            },
        },
        "additionalProperties": False,
    },
}
def _err(msg: str, **extra) -> str:
    """Build the JSON failure envelope returned by the tool handlers.

    The envelope starts as ``{"success": False, "error": msg}``; keyword
    arguments are merged in afterwards, so they add fields (and would
    replace either default if given the same key).
    """
    payload: Dict[str, Any] = {"success": False, "error": msg}
    payload.update(extra)
    return _json(payload)
def handle_meet_transcript(args: Dict[str, Any], **_kw) -> str:
    """Handle the ``meet_transcript`` tool call and return a JSON string.

    ``args["last"]`` is coerced to an int; anything that cannot be coerced,
    or is below 1, is treated as "no limit". When ``args["node"]`` resolves
    to a remote node the request is delegated to it, otherwise the local
    process manager is read.
    """
    requested = args.get("last")
    tail: Optional[int] = None
    if requested is not None:
        try:
            parsed = int(requested)
        except (TypeError, ValueError):
            parsed = None
        if parsed is not None and parsed >= 1:
            tail = parsed

    try:
        client, node_name = _resolve_node_client(args.get("node"))
    except RuntimeError as e:
        return _err(str(e))

    if client is None:
        # Local path.
        res = pm.transcript(last=tail)
        return _json({"success": bool(res.get("ok")), **res})

    # Remote path: any failure talking to the node becomes an error envelope.
    try:
        res = client.transcript(last=tail)
        return _json({"success": bool(res.get("ok")), "node": node_name, **res})
    except Exception as e:
        return _err(f"remote node transcript failed: {e}", node=node_name)
handle_meet_say(args: Dict[str, Any], **_kw) -> str: + text = (args.get("text") or "").strip() + if not text: + return _err("text is required") + try: + client, node_name = _resolve_node_client(args.get("node")) + except RuntimeError as e: + return _err(str(e)) + if client is not None: + try: + res = client.say(text) + return _json({"success": bool(res.get("ok")), "node": node_name, **res}) + except Exception as e: + return _err(f"remote node say failed: {e}", node=node_name) + res = pm.enqueue_say(text) + return _json({"success": bool(res.get("ok")), **res}) diff --git a/tests/plugins/test_google_meet_audio.py b/tests/plugins/test_google_meet_audio.py new file mode 100644 index 0000000000..9af0f76f81 --- /dev/null +++ b/tests/plugins/test_google_meet_audio.py @@ -0,0 +1,266 @@ +"""Tests for plugins.google_meet.audio_bridge (v2). + +Covers the platform gating and pactl / system_profiler plumbing +without actually invoking those tools on the host. +""" + +from __future__ import annotations + +import subprocess +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture(autouse=True) +def _isolate_home(tmp_path, monkeypatch): + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + yield hermes_home + + +# --------------------------------------------------------------------------- +# Linux setup / teardown +# --------------------------------------------------------------------------- + + +def _linux_pactl_result(stdout: str) -> MagicMock: + """Build a fake CompletedProcess-ish object for subprocess.run.""" + m = MagicMock() + m.stdout = stdout + m.stderr = "" + m.returncode = 0 + return m + + +def test_setup_linux_loads_null_sink_and_virtual_source(): + from plugins.google_meet.audio_bridge import AudioBridge + + calls: list[list[str]] = [] + + def _fake_run(argv, **kwargs): + calls.append(list(argv)) + # First call = null-sink → module id 42 + # Second call = virtual-source → module id 
43 + if "module-null-sink" in argv: + return _linux_pactl_result("42\n") + if "module-virtual-source" in argv: + return _linux_pactl_result("43\n") + raise AssertionError(f"unexpected pactl invocation: {argv}") + + with patch("plugins.google_meet.audio_bridge.platform.system", + return_value="Linux"), \ + patch("plugins.google_meet.audio_bridge.subprocess.run", + side_effect=_fake_run): + br = AudioBridge() + info = br.setup() + + # Two pactl load-module calls, in order. + assert len(calls) == 2 + assert calls[0][0] == "pactl" and calls[0][1] == "load-module" + assert "module-null-sink" in calls[0] + assert any(a.startswith("sink_name=hermes_meet_sink") for a in calls[0]) + assert calls[1][0] == "pactl" and calls[1][1] == "load-module" + assert "module-virtual-source" in calls[1] + assert any(a.startswith("source_name=hermes_meet_src") for a in calls[1]) + assert any("master=hermes_meet_sink.monitor" in a for a in calls[1]) + + # Dict shape. + assert info["platform"] == "linux" + assert info["device_name"] == "hermes_meet_src" + assert info["write_target"] == "hermes_meet_sink" + assert info["sample_rate"] == 48000 + assert info["channels"] == 2 + assert info["module_ids"] == [42, 43] + + # Properties. 
+ assert br.device_name == "hermes_meet_src" + assert br.write_target == "hermes_meet_sink" + + +def test_teardown_linux_unloads_modules_in_reverse_order(): + from plugins.google_meet.audio_bridge import AudioBridge + + def _setup_run(argv, **kwargs): + if "module-null-sink" in argv: + return _linux_pactl_result("42\n") + return _linux_pactl_result("43\n") + + with patch("plugins.google_meet.audio_bridge.platform.system", + return_value="Linux"), \ + patch("plugins.google_meet.audio_bridge.subprocess.run", + side_effect=_setup_run): + br = AudioBridge() + br.setup() + + unload_calls: list[list[str]] = [] + + def _teardown_run(argv, **kwargs): + unload_calls.append(list(argv)) + return _linux_pactl_result("") + + with patch("plugins.google_meet.audio_bridge.subprocess.run", + side_effect=_teardown_run): + br.teardown() + + # Two unload calls, in reverse order: 43 (virtual-source) then 42 (sink). + assert [c[1] for c in unload_calls] == ["unload-module", "unload-module"] + assert unload_calls[0][2] == "43" + assert unload_calls[1][2] == "42" + + # Second teardown is a no-op. 
def test_setup_linux_parses_module_id_from_multi_line_output():
    """Module ids are parsed even when pactl's stdout has trailing whitespace.

    The null-sink call answers ``"42 \\n"`` (trailing space before the
    newline); every other call answers ``"43\\n"``.
    """
    from plugins.google_meet.audio_bridge import AudioBridge

    def _fake_run(argv, **kwargs):
        stdout = "42 \n" if "module-null-sink" in argv else "43\n"
        return _linux_pactl_result(stdout)

    system_patch = patch(
        "plugins.google_meet.audio_bridge.platform.system",
        return_value="Linux",
    )
    run_patch = patch(
        "plugins.google_meet.audio_bridge.subprocess.run",
        side_effect=_fake_run,
    )
    with system_patch, run_patch:
        br = AudioBridge()
        info = br.setup()

    assert info["module_ids"] == [42, 43]
"SPAudioDataType" in argv + + assert info["platform"] == "darwin" + assert info["device_name"] == "BlackHole 2ch" + assert info["write_target"] == "BlackHole 2ch" + assert info["module_ids"] == [] + assert info["sample_rate"] == 48000 + assert info["channels"] == 2 + + # teardown is a no-op on darwin (no modules to unload). + with patch("plugins.google_meet.audio_bridge.subprocess.run") as run_mock: + br.teardown() + run_mock.assert_not_called() + + +def test_setup_darwin_raises_when_blackhole_missing(): + from plugins.google_meet.audio_bridge import AudioBridge + + with patch("plugins.google_meet.audio_bridge.platform.system", + return_value="Darwin"), \ + patch("plugins.google_meet.audio_bridge.subprocess.check_output", + return_value=_BH_ABSENT): + br = AudioBridge() + with pytest.raises(RuntimeError, match="BlackHole"): + br.setup() + + +# --------------------------------------------------------------------------- +# Windows / unsupported +# --------------------------------------------------------------------------- + + +def test_setup_windows_raises(): + from plugins.google_meet.audio_bridge import AudioBridge + + with patch("plugins.google_meet.audio_bridge.platform.system", + return_value="Windows"): + br = AudioBridge() + with pytest.raises(RuntimeError, match="not supported"): + br.setup() + + +# --------------------------------------------------------------------------- +# chrome_fake_audio_flags +# --------------------------------------------------------------------------- + + +def test_chrome_fake_audio_flags_linux(): + from plugins.google_meet.audio_bridge import chrome_fake_audio_flags + + with patch("plugins.google_meet.audio_bridge.platform.system", + return_value="Linux"): + flags = chrome_fake_audio_flags( + {"platform": "linux", "device_name": "hermes_meet_src"} + ) + assert "--use-fake-ui-for-media-stream" in flags + + +def test_chrome_fake_audio_flags_darwin(): + from plugins.google_meet.audio_bridge import chrome_fake_audio_flags + + with 
def test_property_access_before_setup_raises():
    """Reading ``device_name`` or ``write_target`` before ``setup()`` raises."""
    from plugins.google_meet.audio_bridge import AudioBridge

    bridge = AudioBridge()
    for prop in ("device_name", "write_target"):
        with pytest.raises(RuntimeError):
            getattr(bridge, prop)
+""" + +from __future__ import annotations + +import argparse +import asyncio +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture(autouse=True) +def _isolate_home(tmp_path, monkeypatch): + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + yield hermes_home + + +# --------------------------------------------------------------------------- +# protocol.py +# --------------------------------------------------------------------------- + +def test_protocol_encode_decode_roundtrip(): + from plugins.google_meet.node import protocol + + msg = protocol.make_request("ping", "tok", {"x": 1}, req_id="abc") + raw = protocol.encode(msg) + out = protocol.decode(raw) + assert out == msg + assert out["type"] == "ping" + assert out["id"] == "abc" + assert out["token"] == "tok" + assert out["payload"] == {"x": 1} + + +def test_protocol_make_request_autogenerates_id(): + from plugins.google_meet.node import protocol + + a = protocol.make_request("ping", "tok", {}) + b = protocol.make_request("ping", "tok", {}) + assert a["id"] != b["id"] + assert len(a["id"]) >= 16 # uuid4 hex + + +def test_protocol_make_request_rejects_bad_input(): + from plugins.google_meet.node import protocol + + with pytest.raises(ValueError): + protocol.make_request("", "tok", {}) + with pytest.raises(ValueError): + protocol.make_request("unknown_type", "tok", {}) + with pytest.raises(ValueError): + protocol.make_request("ping", "tok", "not a dict") # type: ignore[arg-type] + + +def test_protocol_decode_raises_on_malformed(): + from plugins.google_meet.node import protocol + + with pytest.raises(ValueError): + protocol.decode("not json at all") + with pytest.raises(ValueError): + protocol.decode("[]") # list, not object + with pytest.raises(ValueError): + protocol.decode(json.dumps({"id": "x"})) # missing type + with pytest.raises(ValueError): + protocol.decode(json.dumps({"type": 
"ping"})) # missing id + + +def test_protocol_validate_request_happy_path(): + from plugins.google_meet.node import protocol + + msg = protocol.make_request("status", "secret", {}) + ok, reason = protocol.validate_request(msg, "secret") + assert ok is True + assert reason == "" + + +def test_protocol_validate_request_rejects_bad_token(): + from plugins.google_meet.node import protocol + + msg = protocol.make_request("status", "wrong", {}) + ok, reason = protocol.validate_request(msg, "right") + assert ok is False + assert "token" in reason.lower() + + +def test_protocol_validate_request_rejects_unknown_type(): + from plugins.google_meet.node import protocol + + raw = {"type": "nope", "id": "1", "token": "t", "payload": {}} + ok, reason = protocol.validate_request(raw, "t") + assert ok is False + assert "unknown" in reason.lower() + + +def test_protocol_validate_request_rejects_missing_id(): + from plugins.google_meet.node import protocol + + raw = {"type": "ping", "token": "t", "payload": {}} + ok, reason = protocol.validate_request(raw, "t") + assert ok is False + assert "id" in reason.lower() + + +def test_protocol_validate_request_rejects_non_dict_payload(): + from plugins.google_meet.node import protocol + + raw = {"type": "ping", "id": "1", "token": "t", "payload": "oops"} + ok, reason = protocol.validate_request(raw, "t") + assert ok is False + + +def test_protocol_error_envelope_shape(): + from plugins.google_meet.node import protocol + + err = protocol.make_error("abc", "nope") + assert err == {"type": "error", "id": "abc", "error": "nope"} + + +# --------------------------------------------------------------------------- +# registry.py +# --------------------------------------------------------------------------- + +def test_registry_add_get_roundtrip_persists(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + p = tmp_path / "nodes.json" + r = NodeRegistry(path=p) + r.add("mac", "ws://mac.local:18789", "deadbeef") + + # Second 
instance sees it. + r2 = NodeRegistry(path=p) + entry = r2.get("mac") + assert entry is not None + assert entry["name"] == "mac" + assert entry["url"] == "ws://mac.local:18789" + assert entry["token"] == "deadbeef" + assert "added_at" in entry + + +def test_registry_get_returns_none_when_missing(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + assert r.get("ghost") is None + + +def test_registry_remove(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + r.add("a", "ws://a", "t") + assert r.remove("a") is True + assert r.get("a") is None + assert r.remove("a") is False # idempotent + + +def test_registry_list_all_sorted(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + r.add("zeta", "ws://z", "t1") + r.add("alpha", "ws://a", "t2") + names = [n["name"] for n in r.list_all()] + assert names == ["alpha", "zeta"] + + +def test_registry_resolve_auto_picks_single(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + r.add("mac", "ws://mac", "t") + picked = r.resolve(None) + assert picked is not None + assert picked["name"] == "mac" + + +def test_registry_resolve_ambiguous_returns_none(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + r.add("a", "ws://a", "t") + r.add("b", "ws://b", "t") + assert r.resolve(None) is None + + +def test_registry_resolve_empty_returns_none(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + assert r.resolve(None) is None + + +def test_registry_resolve_by_name(tmp_path): + from plugins.google_meet.node.registry import NodeRegistry + + r = NodeRegistry(path=tmp_path / "n.json") + r.add("a", "ws://a", "t") + r.add("b", "ws://b", "t") + 
picked = r.resolve("b") + assert picked is not None + assert picked["name"] == "b" + assert r.resolve("ghost") is None + + +def test_registry_defaults_to_hermes_home(tmp_path, monkeypatch): + from plugins.google_meet.node.registry import NodeRegistry + + # _isolate_home already set HERMES_HOME to tmp_path/.hermes; the + # registry default path must live inside that tree. + r = NodeRegistry() + r.add("x", "ws://x", "t") + expected = Path(tmp_path) / ".hermes" / "workspace" / "meetings" / "nodes.json" + assert expected.is_file() + + +# --------------------------------------------------------------------------- +# server.py — token + dispatch +# --------------------------------------------------------------------------- + +def test_server_ensure_token_generates_and_persists(tmp_path): + from plugins.google_meet.node.server import NodeServer + + p = tmp_path / "tok.json" + s1 = NodeServer(token_path=p) + t1 = s1.ensure_token() + assert isinstance(t1, str) and len(t1) == 32 + + # Reuse on a fresh instance. 
+ s2 = NodeServer(token_path=p) + t2 = s2.ensure_token() + assert t1 == t2 + + data = json.loads(p.read_text(encoding="utf-8")) + assert data["token"] == t1 + assert "generated_at" in data + + +def test_server_get_token_is_idempotent(tmp_path): + from plugins.google_meet.node.server import NodeServer + + s = NodeServer(token_path=tmp_path / "t.json") + assert s.get_token() == s.get_token() + + +def _run(coro): + return asyncio.new_event_loop().run_until_complete(coro) if False else asyncio.run(coro) + + +def test_server_handle_request_rejects_bad_token(tmp_path): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + + s = NodeServer(token_path=tmp_path / "t.json") + s.ensure_token() + bad = protocol.make_request("ping", "not-the-token", {}) + resp = asyncio.run(s._handle_request(bad)) + assert resp["type"] == "error" + assert "token" in resp["error"].lower() + + +def test_server_handle_request_ping(tmp_path): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + + s = NodeServer(token_path=tmp_path / "t.json", display_name="node-x") + tok = s.ensure_token() + req = protocol.make_request("ping", tok, {}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "pong" + assert resp["id"] == req["id"] + assert resp["payload"]["display_name"] == "node-x" + + +def test_server_handle_request_status_dispatches_to_pm(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + monkeypatch.setattr(pm, "status", + lambda: {"ok": True, "alive": True, "meetingId": "abc"}) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("status", tok, {}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "response" + assert resp["id"] == req["id"] + assert resp["payload"] == 
{"ok": True, "alive": True, "meetingId": "abc"} + + +def test_server_handle_request_start_bot_dispatches(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + captured = {} + + def fake_start(**kwargs): + captured.update(kwargs) + return {"ok": True, "pid": 42, "meeting_id": "abc-defg-hij"} + + monkeypatch.setattr(pm, "start", fake_start) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("start_bot", tok, { + "url": "https://meet.google.com/abc-defg-hij", + "guest_name": "Bot", + "duration": "30m", + }) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "response" + assert resp["payload"]["ok"] is True + assert captured["url"] == "https://meet.google.com/abc-defg-hij" + assert captured["guest_name"] == "Bot" + assert captured["duration"] == "30m" + + +def test_server_handle_request_start_bot_missing_url(tmp_path): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("start_bot", tok, {"guest_name": "x"}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "error" + assert "url" in resp["error"] + + +def test_server_handle_request_stop_dispatches(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + got = {} + + def fake_stop(*, reason="requested"): + got["reason"] = reason + return {"ok": True, "reason": reason} + + monkeypatch.setattr(pm, "stop", fake_stop) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("stop", tok, {"reason": "user-cancel"}) + resp = asyncio.run(s._handle_request(req)) + assert 
resp["type"] == "response" + assert got["reason"] == "user-cancel" + + +def test_server_handle_request_transcript(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + got = {} + + def fake_transcript(last=None): + got["last"] = last + return {"ok": True, "lines": ["a", "b"], "total": 2} + + monkeypatch.setattr(pm, "transcript", fake_transcript) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("transcript", tok, {"last": 5}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "response" + assert resp["payload"]["lines"] == ["a", "b"] + assert got["last"] == 5 + + +def test_server_handle_request_say_enqueues_when_active(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + out = tmp_path / "meet-out" + out.mkdir() + monkeypatch.setattr(pm, "_read_active", + lambda: {"pid": 1, "meeting_id": "m", "out_dir": str(out)}) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("say", tok, {"text": "hello"}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "response" + assert resp["payload"]["ok"] is True + assert resp["payload"]["enqueued"] is True + q = (out / "say_queue.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert len(q) == 1 + assert json.loads(q[0])["text"] == "hello" + + +def test_server_handle_request_say_without_active_still_ok(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + monkeypatch.setattr(pm, "_read_active", lambda: None) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = 
s.ensure_token() + req = protocol.make_request("say", tok, {"text": "hi"}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "response" + assert resp["payload"]["ok"] is True + assert resp["payload"]["enqueued"] is False + + +def test_server_handle_request_wraps_pm_exceptions(tmp_path, monkeypatch): + from plugins.google_meet.node.server import NodeServer + from plugins.google_meet.node import protocol + from plugins.google_meet import process_manager as pm + + def boom(): + raise ValueError("kaboom") + + monkeypatch.setattr(pm, "status", boom) + + s = NodeServer(token_path=tmp_path / "t.json") + tok = s.ensure_token() + req = protocol.make_request("status", tok, {}) + resp = asyncio.run(s._handle_request(req)) + assert resp["type"] == "error" + assert "kaboom" in resp["error"] + + +# --------------------------------------------------------------------------- +# client.py +# --------------------------------------------------------------------------- + +class _FakeWS: + """Minimal context-manager stand-in for websockets.sync.client.connect.""" + + def __init__(self, reply_builder): + self._reply_builder = reply_builder + self.sent = [] + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def send(self, raw): + self.sent.append(raw) + + def recv(self, timeout=None): + return self._reply_builder(self.sent[-1]) + + +def _install_fake_ws(monkeypatch, reply_builder): + fake_ws_holder = {} + + def _connect(url, **kwargs): + ws = _FakeWS(reply_builder) + fake_ws_holder["ws"] = ws + fake_ws_holder["url"] = url + fake_ws_holder["kwargs"] = kwargs + return ws + + # Patch the concrete import site inside client._rpc + import websockets.sync.client as wsc # type: ignore + monkeypatch.setattr(wsc, "connect", _connect) + return fake_ws_holder + + +def test_client_rpc_sends_correct_envelope_and_parses_response(monkeypatch): + from plugins.google_meet.node.client import NodeClient + from plugins.google_meet.node 
import protocol + + def reply(raw_out): + req = protocol.decode(raw_out) + return protocol.encode(protocol.make_response(req["id"], {"ok": True, "echo": req["type"]})) + + holder = _install_fake_ws(monkeypatch, reply) + + c = NodeClient("ws://remote:1", "tok123") + out = c._rpc("ping", {"hello": 1}) + assert out == {"ok": True, "echo": "ping"} + + sent = json.loads(holder["ws"].sent[0]) + assert sent["type"] == "ping" + assert sent["token"] == "tok123" + assert sent["payload"] == {"hello": 1} + assert sent["id"] # non-empty + assert holder["url"] == "ws://remote:1" + + +def test_client_rpc_raises_on_error_envelope(monkeypatch): + from plugins.google_meet.node.client import NodeClient + from plugins.google_meet.node import protocol + + def reply(raw_out): + req = protocol.decode(raw_out) + return protocol.encode(protocol.make_error(req["id"], "nope")) + + _install_fake_ws(monkeypatch, reply) + + c = NodeClient("ws://x", "t") + with pytest.raises(RuntimeError, match="nope"): + c._rpc("ping", {}) + + +def test_client_rpc_raises_on_id_mismatch(monkeypatch): + from plugins.google_meet.node.client import NodeClient + from plugins.google_meet.node import protocol + + def reply(raw_out): + return protocol.encode(protocol.make_response("different-id", {"ok": True})) + + _install_fake_ws(monkeypatch, reply) + + c = NodeClient("ws://x", "t") + with pytest.raises(RuntimeError, match="mismatch"): + c._rpc("ping", {}) + + +def test_client_convenience_methods_hit_correct_types(monkeypatch): + from plugins.google_meet.node.client import NodeClient + from plugins.google_meet.node import protocol + + seen = [] + + def reply(raw_out): + req = protocol.decode(raw_out) + seen.append((req["type"], req["payload"])) + return protocol.encode(protocol.make_response(req["id"], {"ok": True})) + + _install_fake_ws(monkeypatch, reply) + + c = NodeClient("ws://x", "t") + c.start_bot("https://meet.google.com/a-b-c", guest_name="G", duration="10m") + c.stop() + c.status() + c.transcript(last=3) + 
c.say("hi") + c.ping() + + types = [t for t, _ in seen] + assert types == ["start_bot", "stop", "status", "transcript", "say", "ping"] + # Check specific payload routing + assert seen[0][1]["url"] == "https://meet.google.com/a-b-c" + assert seen[0][1]["guest_name"] == "G" + assert seen[0][1]["duration"] == "10m" + assert seen[3][1]["last"] == 3 + assert seen[4][1]["text"] == "hi" + + +def test_client_init_rejects_bad_args(): + from plugins.google_meet.node.client import NodeClient + + with pytest.raises(ValueError): + NodeClient("", "t") + with pytest.raises(ValueError): + NodeClient("ws://x", "") + + +# --------------------------------------------------------------------------- +# cli.py +# --------------------------------------------------------------------------- + +def _build_parser(): + from plugins.google_meet.node.cli import register_cli + + parser = argparse.ArgumentParser(prog="meet-node-test") + register_cli(parser) + return parser + + +def test_cli_approve_list_remove(capsys): + from plugins.google_meet.node.registry import NodeRegistry + + p = _build_parser() + + args = p.parse_args(["approve", "mac", "ws://mac:1", "tok"]) + rc = args.func(args) + assert rc == 0 + assert NodeRegistry().get("mac") is not None + + args = p.parse_args(["list"]) + rc = args.func(args) + assert rc == 0 + out = capsys.readouterr().out + assert "mac" in out + assert "ws://mac:1" in out + + args = p.parse_args(["remove", "mac"]) + rc = args.func(args) + assert rc == 0 + assert NodeRegistry().get("mac") is None + + +def test_cli_list_empty(capsys): + p = _build_parser() + args = p.parse_args(["list"]) + rc = args.func(args) + assert rc == 0 + assert "no nodes" in capsys.readouterr().out + + +def test_cli_remove_missing_returns_nonzero(): + p = _build_parser() + args = p.parse_args(["remove", "ghost"]) + rc = args.func(args) + assert rc == 1 + + +def test_cli_status_pings_via_node_client(capsys, monkeypatch): + from plugins.google_meet.node.registry import NodeRegistry + from 
plugins.google_meet.node import cli as node_cli + + NodeRegistry().add("mac", "ws://mac:1", "tok") + + class _FakeClient: + def __init__(self, url, token): + assert url == "ws://mac:1" + assert token == "tok" + + def ping(self): + return {"type": "pong", "display_name": "hermes-meet-node"} + + monkeypatch.setattr(node_cli, "NodeClient", _FakeClient) + + p = _build_parser() + args = p.parse_args(["status", "mac"]) + rc = args.func(args) + assert rc == 0 + out = capsys.readouterr().out.strip() + data = json.loads(out) + assert data["ok"] is True + assert data["node"] == "mac" + + +def test_cli_status_unknown_node_fails(capsys): + p = _build_parser() + args = p.parse_args(["status", "ghost"]) + rc = args.func(args) + assert rc == 1 + + +def test_cli_status_reports_client_error(capsys, monkeypatch): + from plugins.google_meet.node.registry import NodeRegistry + from plugins.google_meet.node import cli as node_cli + + NodeRegistry().add("mac", "ws://mac:1", "tok") + + class _FakeClient: + def __init__(self, url, token): + pass + + def ping(self): + raise RuntimeError("connection refused") + + monkeypatch.setattr(node_cli, "NodeClient", _FakeClient) + + p = _build_parser() + args = p.parse_args(["status", "mac"]) + rc = args.func(args) + assert rc == 1 + data = json.loads(capsys.readouterr().out.strip()) + assert data["ok"] is False + assert "connection refused" in data["error"] diff --git a/tests/plugins/test_google_meet_plugin.py b/tests/plugins/test_google_meet_plugin.py new file mode 100644 index 0000000000..c8dacc81d2 --- /dev/null +++ b/tests/plugins/test_google_meet_plugin.py @@ -0,0 +1,814 @@ +"""Tests for the google_meet plugin. 
+ +Covers the safety-gated pieces that don't require Playwright: + + * URL regex — only ``https://meet.google.com/`` URLs pass + * Meeting-id extraction from Meet URLs + * Status / transcript writes round-trip through the file-backed state + * Tool handlers return well-formed JSON under all branches + * Process manager refuses unsafe URLs and clears stale state cleanly + * ``_on_session_end`` hook is defensive (no-ops when no bot active) + +Does NOT spawn a real Chromium — we mock ``subprocess.Popen`` where needed. +""" + +from __future__ import annotations + +import json +import os +import signal +from pathlib import Path +from unittest.mock import patch + +import pytest + + +@pytest.fixture(autouse=True) +def _isolate_home(tmp_path, monkeypatch): + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + yield hermes_home + + +# --------------------------------------------------------------------------- +# URL safety gate +# --------------------------------------------------------------------------- + +def test_is_safe_meet_url_accepts_standard_meet_codes(): + from plugins.google_meet.meet_bot import _is_safe_meet_url + + assert _is_safe_meet_url("https://meet.google.com/abc-defg-hij") + assert _is_safe_meet_url("https://meet.google.com/abc-defg-hij?pli=1") + assert _is_safe_meet_url("https://meet.google.com/new") + assert _is_safe_meet_url("https://meet.google.com/lookup/ABC123") + + +def test_is_safe_meet_url_rejects_non_meet_urls(): + from plugins.google_meet.meet_bot import _is_safe_meet_url + + # wrong host + assert not _is_safe_meet_url("https://evil.example.com/abc-defg-hij") + # wrong scheme + assert not _is_safe_meet_url("http://meet.google.com/abc-defg-hij") + # malformed code + assert not _is_safe_meet_url("https://meet.google.com/not-a-meet-code") + # subdomain hijack attempts + assert not _is_safe_meet_url("https://meet.google.com.evil.com/abc-defg-hij") + assert not 
_is_safe_meet_url("https://notmeet.google.com/abc-defg-hij") + # empty / wrong type + assert not _is_safe_meet_url("") + assert not _is_safe_meet_url(None) # type: ignore[arg-type] + assert not _is_safe_meet_url(123) # type: ignore[arg-type] + + +def test_meeting_id_extraction(): + from plugins.google_meet.meet_bot import _meeting_id_from_url + + assert _meeting_id_from_url("https://meet.google.com/abc-defg-hij") == "abc-defg-hij" + assert _meeting_id_from_url("https://meet.google.com/abc-defg-hij?pli=1") == "abc-defg-hij" + # fallback for codes we can't parse (e.g. /new before redirect) + fallback = _meeting_id_from_url("https://meet.google.com/new") + assert fallback.startswith("meet-") + + +# --------------------------------------------------------------------------- +# _BotState — transcript + status file round-trip +# --------------------------------------------------------------------------- + +def test_bot_state_dedupes_captions_and_flushes_status(tmp_path): + from plugins.google_meet.meet_bot import _BotState + + out = tmp_path / "session" + state = _BotState(out_dir=out, meeting_id="abc-defg-hij", + url="https://meet.google.com/abc-defg-hij") + + state.record_caption("Alice", "Hey everyone") + state.record_caption("Alice", "Hey everyone") # dup — ignored + state.record_caption("Bob", "Let's start") + + transcript = (out / "transcript.txt").read_text() + assert "Alice: Hey everyone" in transcript + assert "Bob: Let's start" in transcript + # dedup — Alice line appears exactly once + assert transcript.count("Alice: Hey everyone") == 1 + + status = json.loads((out / "status.json").read_text()) + assert status["meetingId"] == "abc-defg-hij" + assert status["transcriptLines"] == 2 + assert status["transcriptPath"].endswith("transcript.txt") + + +def test_bot_state_ignores_blank_text(tmp_path): + from plugins.google_meet.meet_bot import _BotState + + state = _BotState(out_dir=tmp_path / "s", meeting_id="x-y-z", + url="https://meet.google.com/x-y-z") + 
state.record_caption("Alice", "") + state.record_caption("Alice", " ") + state.record_caption("", "text but no speaker") + + status = json.loads((tmp_path / "s" / "status.json").read_text()) + assert status["transcriptLines"] == 1 + # blank-speaker falls back to "Unknown" + assert "Unknown: text but no speaker" in (tmp_path / "s" / "transcript.txt").read_text() + + +def test_parse_duration(): + from plugins.google_meet.meet_bot import _parse_duration + + assert _parse_duration("30m") == 30 * 60 + assert _parse_duration("2h") == 2 * 3600 + assert _parse_duration("90s") == 90 + assert _parse_duration("90") == 90 + assert _parse_duration("") is None + assert _parse_duration("bogus") is None + + +# --------------------------------------------------------------------------- +# process_manager — refuses unsafe URLs, manages active pointer +# --------------------------------------------------------------------------- + +def test_start_refuses_unsafe_url(): + from plugins.google_meet import process_manager as pm + + res = pm.start("https://evil.example.com/abc-defg-hij") + assert res["ok"] is False + assert "refusing" in res["error"] + + +def test_status_reports_no_active_meeting(): + from plugins.google_meet import process_manager as pm + + assert pm.status() == {"ok": False, "reason": "no active meeting"} + assert pm.transcript() == {"ok": False, "reason": "no active meeting"} + assert pm.stop() == {"ok": False, "reason": "no active meeting"} + + +def test_start_spawns_subprocess_and_writes_active_pointer(tmp_path): + """Verify start() wires env vars correctly and records the pid.""" + from plugins.google_meet import process_manager as pm + + class _FakeProc: + def __init__(self, pid): + self.pid = pid + + captured_env = {} + captured_argv = [] + + def _fake_popen(argv, **kwargs): + captured_argv.extend(argv) + captured_env.update(kwargs.get("env") or {}) + return _FakeProc(99999) + + with patch.object(pm.subprocess, "Popen", side_effect=_fake_popen): + # Also prevent 
pid liveness probe from stomping on our real pids + with patch.object(pm, "_pid_alive", return_value=False): + res = pm.start( + "https://meet.google.com/abc-defg-hij", + guest_name="Test Bot", + duration="15m", + ) + + assert res["ok"] is True + assert res["meeting_id"] == "abc-defg-hij" + assert res["pid"] == 99999 + assert captured_env["HERMES_MEET_URL"] == "https://meet.google.com/abc-defg-hij" + assert captured_env["HERMES_MEET_GUEST_NAME"] == "Test Bot" + assert captured_env["HERMES_MEET_DURATION"] == "15m" + # python -m plugins.google_meet.meet_bot + assert any("plugins.google_meet.meet_bot" in a for a in captured_argv) + + # .active.json points at the bot + active = pm._read_active() + assert active is not None + assert active["pid"] == 99999 + assert active["meeting_id"] == "abc-defg-hij" + + +def test_transcript_reads_last_n_lines(tmp_path): + from plugins.google_meet import process_manager as pm + + meeting_dir = Path(os.environ["HERMES_HOME"]) / "workspace" / "meetings" / "abc-defg-hij" + meeting_dir.mkdir(parents=True) + (meeting_dir / "transcript.txt").write_text( + "[10:00:00] Alice: one\n" + "[10:00:01] Bob: two\n" + "[10:00:02] Alice: three\n" + ) + pm._write_active({ + "pid": 0, "meeting_id": "abc-defg-hij", + "out_dir": str(meeting_dir), + "url": "https://meet.google.com/abc-defg-hij", + "started_at": 0, + }) + + res = pm.transcript(last=2) + assert res["ok"] is True + assert res["total"] == 3 + assert len(res["lines"]) == 2 + assert res["lines"][-1].endswith("Alice: three") + + +def test_stop_signals_process_and_clears_pointer(tmp_path): + from plugins.google_meet import process_manager as pm + + pm._write_active({ + "pid": 11111, "meeting_id": "x-y-z", + "out_dir": str(tmp_path / "x-y-z"), + "url": "https://meet.google.com/x-y-z", + "started_at": 0, + }) + + alive_seq = iter([True, True, False]) # alive at first, gone after SIGTERM + def _alive(pid): + try: + return next(alive_seq) + except StopIteration: + return False + + sent = [] + def 
_kill(pid, sig): + sent.append((pid, sig)) + + with patch.object(pm, "_pid_alive", side_effect=_alive), \ + patch.object(pm.os, "kill", side_effect=_kill), \ + patch.object(pm.time, "sleep", lambda _s: None): + res = pm.stop() + + assert res["ok"] is True + assert (11111, signal.SIGTERM) in sent + # .active.json cleared + assert pm._read_active() is None + + +# --------------------------------------------------------------------------- +# Tool handlers — JSON shape + safety gates +# --------------------------------------------------------------------------- + +def test_meet_join_handler_missing_url_returns_error(): + from plugins.google_meet.tools import handle_meet_join + + out = json.loads(handle_meet_join({})) + assert out["success"] is False + assert "url is required" in out["error"] + + +def test_meet_join_handler_respects_safety_gate(): + from plugins.google_meet.tools import handle_meet_join + + with patch("plugins.google_meet.tools.check_meet_requirements", return_value=True): + out = json.loads(handle_meet_join({"url": "https://evil.example.com/foo"})) + assert out["success"] is False + assert "refusing" in out["error"] + + +def test_meet_join_handler_returns_error_when_playwright_missing(): + from plugins.google_meet.tools import handle_meet_join + + with patch("plugins.google_meet.tools.check_meet_requirements", return_value=False): + out = json.loads(handle_meet_join({"url": "https://meet.google.com/abc-defg-hij"})) + assert out["success"] is False + assert "prerequisites missing" in out["error"] + + +def test_meet_say_requires_text(): + from plugins.google_meet.tools import handle_meet_say + + out = json.loads(handle_meet_say({})) + assert out["success"] is False + assert "text is required" in out["error"] + + +def test_meet_say_no_active_meeting(): + from plugins.google_meet.tools import handle_meet_say + + out = json.loads(handle_meet_say({"text": "hello everyone"})) + assert out["success"] is False + # Falls through to pm.enqueue_say which reports 
def test_register_refuses_on_windows():
    """register() must register nothing when platform.system() is Windows."""
    import plugins.google_meet as plugin

    recorded = {"tools": [], "cli": [], "hooks": []}

    class _Ctx:
        """Recording stand-in for the context object handed to register()."""

        def register_tool(self, **kw):
            recorded["tools"].append(kw["name"])

        def register_cli_command(self, **kw):
            recorded["cli"].append(kw["name"])

        def register_hook(self, name, fn):
            recorded["hooks"].append(name)

    with patch.object(plugin.platform, "system", return_value="Windows"):
        plugin.register(_Ctx())

    # Every bucket must still be empty: no tool, CLI command or hook was added.
    assert recorded == {"tools": [], "cli": [], "hooks": []}
"workspace" / "meetings" / "abc-defg-hij" + out_dir.mkdir(parents=True) + pm._write_active({ + "pid": 0, "meeting_id": "abc-defg-hij", + "out_dir": str(out_dir), "url": "https://meet.google.com/abc-defg-hij", + "started_at": 0, "mode": "realtime", + }) + res = pm.enqueue_say("hello everyone") + assert res["ok"] is True + assert "enqueued_id" in res + + queue = out_dir / "say_queue.jsonl" + assert queue.is_file() + lines = [json.loads(ln) for ln in queue.read_text().splitlines() if ln.strip()] + assert len(lines) == 1 + assert lines[0]["text"] == "hello everyone" + + +def test_start_passes_mode_into_active_record(): + from plugins.google_meet import process_manager as pm + + class _FakeProc: + def __init__(self, pid): self.pid = pid + + with patch.object(pm.subprocess, "Popen", return_value=_FakeProc(12345)), \ + patch.object(pm, "_pid_alive", return_value=False): + res = pm.start( + "https://meet.google.com/abc-defg-hij", + mode="realtime", + ) + assert res["ok"] is True + assert res["mode"] == "realtime" + assert pm._read_active()["mode"] == "realtime" + + +def test_start_realtime_env_vars_threaded_through(): + from plugins.google_meet import process_manager as pm + + class _FakeProc: + def __init__(self, pid): self.pid = pid + + captured_env = {} + def _fake_popen(argv, **kwargs): + captured_env.update(kwargs.get("env") or {}) + return _FakeProc(11111) + + with patch.object(pm.subprocess, "Popen", side_effect=_fake_popen), \ + patch.object(pm, "_pid_alive", return_value=False): + pm.start( + "https://meet.google.com/abc-defg-hij", + mode="realtime", + realtime_model="gpt-realtime", + realtime_voice="alloy", + realtime_instructions="Be brief.", + realtime_api_key="sk-test", + ) + assert captured_env["HERMES_MEET_MODE"] == "realtime" + assert captured_env["HERMES_MEET_REALTIME_MODEL"] == "gpt-realtime" + assert captured_env["HERMES_MEET_REALTIME_VOICE"] == "alloy" + assert captured_env["HERMES_MEET_REALTIME_INSTRUCTIONS"] == "Be brief." 
def test_meet_join_accepts_realtime_mode():
    """mode='realtime' is accepted and forwarded to pm.start as a keyword."""
    from plugins.google_meet.tools import handle_meet_join

    join_args = {
        "url": "https://meet.google.com/abc-defg-hij",
        "mode": "realtime",
    }
    fake_start_result = {"ok": True, "meeting_id": "x-y-z"}

    # Both collaborators are stubbed: the prerequisite check always passes and
    # pm.start is replaced by a mock whose call arguments are inspected below.
    with patch("plugins.google_meet.tools.check_meet_requirements", return_value=True):
        with patch(
            "plugins.google_meet.tools.pm.start",
            return_value=fake_start_result,
        ) as start_mock:
            response = json.loads(handle_meet_join(join_args))

    assert response["success"] is True
    assert start_mock.call_args.kwargs["mode"] == "realtime"
def test_meet_join_auto_node_ambiguous_returns_error():
    """node='auto' fails when two nodes are registered."""
    from plugins.google_meet.node.registry import NodeRegistry
    from plugins.google_meet.tools import handle_meet_join

    # Register two nodes so that "auto" has more than one candidate.
    registry = NodeRegistry()
    for node_name, node_url in (
        ("a", "ws://1.2.3.4:18789"),
        ("b", "ws://5.6.7.8:18789"),
    ):
        registry.add(node_name, node_url, "tok")

    join_args = {
        "url": "https://meet.google.com/abc-defg-hij",
        "node": "auto",
    }
    response = json.loads(handle_meet_join(join_args))

    assert response["success"] is False
    assert "no registered meet node" in response["error"]
def test_cli_join_accepts_mode_and_node_flags():
    """`join` parses --mode and --node into ns.mode / ns.node."""
    import argparse
    from plugins.google_meet.cli import register_cli

    cli_parser = argparse.ArgumentParser(prog="hermes meet")
    register_cli(cli_parser)

    argv = [
        "join",
        "https://meet.google.com/abc-defg-hij",
        "--mode",
        "realtime",
        "--node",
        "my-mac",
    ]
    parsed = cli_parser.parse_args(argv)

    assert parsed.mode == "realtime"
    assert parsed.node == "my-mac"
def test_detect_admission_returns_false_on_error():
    """_detect_admission returns False when page.evaluate raises."""
    from plugins.google_meet.meet_bot import _detect_admission

    class _FakePage:
        """Page double whose evaluate() always fails."""

        def evaluate(self, _js):
            raise RuntimeError("boom")

    broken_page = _FakePage()
    admitted = _detect_admission(broken_page)
    assert admitted is False
def test_cmd_install_refuses_windows(capsys):
    """_cmd_install returns 1 and mentions Windows when run on Windows.

    ``platform.system`` is patched to report ``"Windows"``; the command is
    then expected to exit with status 1 and print a message containing
    ``"Windows"`` on stdout.
    """
    from plugins.google_meet.cli import _cmd_install

    # Patch platform.system directly, the same target the other _cmd_install
    # tests in this file use. The previous target was a conditional expression
    # with a constant-False condition, which always chose this string anyway.
    with patch("platform.system", return_value="Windows"):
        rc = _cmd_install(realtime=False, assume_yes=True)

    assert rc == 1
    out = capsys.readouterr().out
    assert "Windows" in out
class _FakeWS:
    """In-memory WebSocket double driven by a pre-scripted list of frames.

    ``send()`` JSON-decodes each outgoing payload into ``sent``; ``recv()``
    hands back the scripted frames in order; ``close()`` sets ``closed``.
    """

    def __init__(self, recv_frames: list):
        self.closed = False
        self.sent: list[dict] = []
        # Copy the script so draining it never mutates the caller's list.
        self._recv_q: list = [*recv_frames]

    def send(self, payload):
        """Record *payload* (str, bytes or bytearray holding JSON) as a parsed object."""
        is_binary = isinstance(payload, (bytes, bytearray))
        text = payload.decode() if is_binary else payload
        self.sent.append(json.loads(text))

    def recv(self, timeout=None):  # noqa: ARG002
        """Return the next scripted frame; dict frames are JSON-encoded first.

        Raises:
            RuntimeError: when the script has no frames left.
        """
        if self._recv_q:
            frame = self._recv_q.pop(0)
            return json.dumps(frame) if isinstance(frame, dict) else frame
        raise RuntimeError("fake ws: no more frames")

    def close(self):
        """Mark the fake connection as closed."""
        self.closed = True
def test_speak_sends_create_and_response_and_writes_audio(monkeypatch, tmp_path):
    """speak() sends item.create + response.create and writes decoded audio.

    The fake socket replays two ``response.audio.delta`` frames; their decoded
    bytes must end up concatenated in the sink file and be counted in the
    returned ``bytes_written``.
    """
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    first_chunk = b"\x01\x02\x03\x04PCM!"
    second_chunk = b"more"

    def _delta(chunk):
        # Build an audio-delta frame carrying *chunk* as base64 text.
        return {
            "type": "response.audio.delta",
            "delta": base64.b64encode(chunk).decode(),
        }

    fake_ws = _FakeWS(
        recv_frames=[
            {"type": "response.created"},
            _delta(first_chunk),
            _delta(second_chunk),
            {"type": "response.done"},
        ]
    )
    _install_fake_websockets(monkeypatch, fake_ws)

    sink_path = tmp_path / "out.pcm"
    session = RealtimeSession(api_key="sk-test", audio_sink_path=sink_path)
    session.connect()
    outcome = session.speak("Hello everyone.")

    # Frames sent after session.update: conversation.item.create then response.create.
    assert [frame["type"] for frame in fake_ws.sent] == [
        "session.update",
        "conversation.item.create",
        "response.create",
    ]

    created_item = fake_ws.sent[1]["item"]
    first_part = created_item["content"][0]
    assert created_item["role"] == "user"
    assert first_part["type"] == "input_text"
    assert first_part["text"] == "Hello everyone."

    assert fake_ws.sent[2]["response"]["modalities"] == ["audio"]

    # The sink holds both decoded chunks, in arrival order.
    assert sink_path.read_bytes() == first_chunk + second_chunk
    assert outcome["ok"] is True
    assert outcome["bytes_written"] == len(first_chunk) + len(second_chunk)
    assert outcome["duration_ms"] >= 0.0
class _StubSession:
    """Session double that records spoken text instead of producing audio."""

    def __init__(self):
        # Texts passed to speak(), in call order.
        self.spoken: list[str] = []

    def speak(self, text, timeout=30.0):  # noqa: ARG002
        """Record *text* and return a canned success result.

        ``bytes_written`` is reported as ``len(text)``; *timeout* is ignored.
        """
        self.spoken.append(text)
        result = dict(ok=True, bytes_written=len(text), duration_ms=1.0)
        return result
def test_speaker_drops_line_without_processed_path_when_none(tmp_path):
    """With processed_path=None the queued line is spoken and the queue drained."""
    from plugins.google_meet.realtime.openai_client import RealtimeSpeaker

    queue_file = tmp_path / "q.jsonl"
    only_entry = {"id": "only", "text": "once"}
    queue_file.write_text(json.dumps(only_entry) + "\n")

    session = _StubSession()
    speaker = RealtimeSpeaker(session, queue_path=queue_file, processed_path=None)

    # Stop as soon as the queue file contains nothing but whitespace.
    speaker.run_until_stopped(
        lambda: queue_file.read_text().strip() == "",
        poll_interval=0.01,
    )

    assert session.spoken == ["once"]
    assert queue_file.read_text().strip() == ""