mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
853 lines
33 KiB
Python
853 lines
33 KiB
Python
|
|
"""Headless Google Meet bot — Playwright + live-caption scraping.
|
||
|
|
|
||
|
|
Runs as a standalone subprocess spawned by ``process_manager.py``. Reads config
|
||
|
|
from env vars, writes status + transcript to files under
|
||
|
|
``$HERMES_HOME/workspace/meetings/<meeting-id>/``. The main hermes process
|
||
|
|
reads those files via the ``meet_*`` tools — no IPC beyond filesystem.
|
||
|
|
|
||
|
|
The scraping strategy mirrors OpenUtter (sumansid/openutter): we don't parse
|
||
|
|
WebRTC audio, we enable Google Meet's built-in live captions and observe the
|
||
|
|
captions container in the DOM via a MutationObserver. This is lossy and
|
||
|
|
English-biased but it is:
|
||
|
|
|
||
|
|
* deterministic (no API keys, no STT billing),
|
||
|
|
* works behind Meet's normal login / admission,
|
||
|
|
* survives Meet UI rewrites fairly well because the caption container has a
|
||
|
|
stable ARIA role.
|
||
|
|
|
||
|
|
Run standalone for debugging::
|
||
|
|
|
||
|
|
HERMES_MEET_URL=https://meet.google.com/abc-defg-hij \\
|
||
|
|
HERMES_MEET_OUT_DIR=/tmp/meet-debug \\
|
||
|
|
HERMES_MEET_HEADED=1 \\
|
||
|
|
python -m plugins.google_meet.meet_bot
|
||
|
|
|
||
|
|
No meet.google.com URL → exits non-zero. Any URL that doesn't start with
|
||
|
|
``https://meet.google.com/`` is rejected (explicit-by-design).
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import signal
|
||
|
|
import sys
|
||
|
|
import threading
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
# Match ``https://meet.google.com/abc-defg-hij`` or ``.../lookup/...`` — the
|
||
|
|
# short three-segment code or a lookup URL. Anything else is rejected.
|
||
|
|
MEET_URL_RE = re.compile(
|
||
|
|
r"^https://meet\.google\.com/("
|
||
|
|
r"[a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,}"
|
||
|
|
r"|lookup/[^/?#]+"
|
||
|
|
r"|new"
|
||
|
|
r")(?:[/?#].*)?$"
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
# Filenames the bot reads/writes in ``HERMES_MEET_OUT_DIR``.
|
||
|
|
SAY_QUEUE_FILENAME = "say_queue.jsonl"
|
||
|
|
SAY_PCM_FILENAME = "speaker.pcm"
|
||
|
|
|
||
|
|
|
||
|
|
def _is_safe_meet_url(url: str) -> bool:
|
||
|
|
"""Return True if *url* is a Google Meet URL we're willing to navigate to."""
|
||
|
|
if not isinstance(url, str):
|
||
|
|
return False
|
||
|
|
return bool(MEET_URL_RE.match(url.strip()))
|
||
|
|
|
||
|
|
|
||
|
|
def _meeting_id_from_url(url: str) -> str:
|
||
|
|
"""Extract the 3-segment meeting code from a Meet URL.
|
||
|
|
|
||
|
|
For ``https://meet.google.com/abc-defg-hij`` → ``abc-defg-hij``.
|
||
|
|
For ``.../lookup/<id>`` or ``/new`` we fall back to a timestamped id — the
|
||
|
|
bot won't know the real code until after redirect, and callers pass this
|
||
|
|
through to filename anyway.
|
||
|
|
"""
|
||
|
|
m = re.search(
|
||
|
|
r"meet\.google\.com/([a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,})",
|
||
|
|
url or "",
|
||
|
|
)
|
||
|
|
if m:
|
||
|
|
return m.group(1)
|
||
|
|
return f"meet-{int(time.time())}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Status + transcript file writers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class _BotState:
|
||
|
|
"""Single-process mutable state, flushed to ``status.json`` on each change."""
|
||
|
|
|
||
|
|
def __init__(self, out_dir: Path, meeting_id: str, url: str):
|
||
|
|
self.out_dir = out_dir
|
||
|
|
self.meeting_id = meeting_id
|
||
|
|
self.url = url
|
||
|
|
self.in_call = False
|
||
|
|
self.captioning = False
|
||
|
|
self.captions_enabled_attempted = False
|
||
|
|
self.lobby_waiting = False
|
||
|
|
self.join_attempted_at: Optional[float] = None
|
||
|
|
self.joined_at: Optional[float] = None
|
||
|
|
self.last_caption_at: Optional[float] = None
|
||
|
|
self.transcript_lines = 0
|
||
|
|
self.error: Optional[str] = None
|
||
|
|
self.exited = False
|
||
|
|
# v2 realtime fields.
|
||
|
|
self.realtime = False
|
||
|
|
self.realtime_ready = False
|
||
|
|
self.realtime_device: Optional[str] = None
|
||
|
|
self.audio_bytes_out: int = 0
|
||
|
|
self.last_audio_out_at: Optional[float] = None
|
||
|
|
self.last_barge_in_at: Optional[float] = None
|
||
|
|
self.leave_reason: Optional[str] = None
|
||
|
|
# Scraped captions, in order, deduped. Each entry is a dict of
|
||
|
|
# {"ts": <epoch>, "speaker": str, "text": str}.
|
||
|
|
self._seen: set = set()
|
||
|
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||
|
|
self.transcript_path = out_dir / "transcript.txt"
|
||
|
|
self.status_path = out_dir / "status.json"
|
||
|
|
self._flush()
|
||
|
|
|
||
|
|
# -------- transcript ------------------------------------------------
|
||
|
|
|
||
|
|
def record_caption(self, speaker: str, text: str) -> None:
|
||
|
|
"""Append a caption line if we haven't seen this exact (speaker, text)."""
|
||
|
|
speaker = (speaker or "").strip() or "Unknown"
|
||
|
|
text = (text or "").strip()
|
||
|
|
if not text:
|
||
|
|
return
|
||
|
|
key = f"{speaker}|{text}"
|
||
|
|
if key in self._seen:
|
||
|
|
return
|
||
|
|
self._seen.add(key)
|
||
|
|
self.transcript_lines += 1
|
||
|
|
self.last_caption_at = time.time()
|
||
|
|
ts = time.strftime("%H:%M:%S", time.localtime(self.last_caption_at))
|
||
|
|
line = f"[{ts}] {speaker}: {text}\n"
|
||
|
|
# Atomic-ish append — good enough for a single-writer.
|
||
|
|
with self.transcript_path.open("a", encoding="utf-8") as f:
|
||
|
|
f.write(line)
|
||
|
|
self._flush()
|
||
|
|
|
||
|
|
# -------- status file ----------------------------------------------
|
||
|
|
|
||
|
|
def _flush(self) -> None:
|
||
|
|
data = {
|
||
|
|
"meetingId": self.meeting_id,
|
||
|
|
"url": self.url,
|
||
|
|
"inCall": self.in_call,
|
||
|
|
"captioning": self.captioning,
|
||
|
|
"captionsEnabledAttempted": self.captions_enabled_attempted,
|
||
|
|
"lobbyWaiting": self.lobby_waiting,
|
||
|
|
"joinAttemptedAt": self.join_attempted_at,
|
||
|
|
"joinedAt": self.joined_at,
|
||
|
|
"lastCaptionAt": self.last_caption_at,
|
||
|
|
"transcriptLines": self.transcript_lines,
|
||
|
|
"transcriptPath": str(self.transcript_path),
|
||
|
|
"error": self.error,
|
||
|
|
"exited": self.exited,
|
||
|
|
"pid": os.getpid(),
|
||
|
|
# v2 realtime telemetry.
|
||
|
|
"realtime": self.realtime,
|
||
|
|
"realtimeReady": self.realtime_ready,
|
||
|
|
"realtimeDevice": self.realtime_device,
|
||
|
|
"audioBytesOut": self.audio_bytes_out,
|
||
|
|
"lastAudioOutAt": self.last_audio_out_at,
|
||
|
|
"lastBargeInAt": self.last_barge_in_at,
|
||
|
|
"leaveReason": self.leave_reason,
|
||
|
|
}
|
||
|
|
tmp = self.status_path.with_suffix(".json.tmp")
|
||
|
|
tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||
|
|
tmp.replace(self.status_path)
|
||
|
|
|
||
|
|
def set(self, **kwargs) -> None:
|
||
|
|
for k, v in kwargs.items():
|
||
|
|
setattr(self, k, v)
|
||
|
|
self._flush()
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Playwright bot entry point
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# JavaScript injected into the Meet tab to observe captions. Captures
|
||
|
|
# {speaker, text} tuples via a MutationObserver on the caption container,
|
||
|
|
# and exposes ``window.__hermesMeetDrain()`` to pull new entries. This
|
||
|
|
# mirrors the OpenUtter caption scraping approach.
|
||
|
|
_CAPTION_OBSERVER_JS = r"""
|
||
|
|
(() => {
|
||
|
|
if (window.__hermesMeetInstalled) return;
|
||
|
|
window.__hermesMeetInstalled = true;
|
||
|
|
window.__hermesMeetQueue = [];
|
||
|
|
|
||
|
|
const captionSelector = '[role="region"][aria-label*="aption" i], ' +
|
||
|
|
'div[jsname="YSxPC"], ' + // legacy
|
||
|
|
'div[jsname="tgaKEf"]'; // current (Apr 2026)
|
||
|
|
|
||
|
|
function pushEntry(speaker, text) {
|
||
|
|
if (!text || !text.trim()) return;
|
||
|
|
window.__hermesMeetQueue.push({
|
||
|
|
ts: Date.now(),
|
||
|
|
speaker: (speaker || '').trim(),
|
||
|
|
text: text.trim(),
|
||
|
|
});
|
||
|
|
}
|
||
|
|
|
||
|
|
function scan(root) {
|
||
|
|
// Meet captions render as a list of rows; each row contains a speaker
|
||
|
|
// label and a text block. Selectors vary across Meet rewrites; we try
|
||
|
|
// a few shapes and fall back to raw text.
|
||
|
|
const rows = root.querySelectorAll('div[jsname="dsyhDe"], div.CNusmb, div.TBMuR');
|
||
|
|
if (rows.length) {
|
||
|
|
rows.forEach((row) => {
|
||
|
|
const spkEl = row.querySelector('div.KcIKyf, div.zs7s8d, span[jsname="YSxPC"]');
|
||
|
|
const txtEl = row.querySelector('div.bh44bd, span[jsname="tgaKEf"], div.iTTPOb');
|
||
|
|
const speaker = spkEl ? spkEl.innerText : '';
|
||
|
|
const text = txtEl ? txtEl.innerText : row.innerText;
|
||
|
|
pushEntry(speaker, text);
|
||
|
|
});
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
// Fallback: treat the whole region's innerText as one anonymous line.
|
||
|
|
const text = (root.innerText || '').split('\n').filter(Boolean).pop();
|
||
|
|
pushEntry('', text);
|
||
|
|
}
|
||
|
|
|
||
|
|
function attach() {
|
||
|
|
const el = document.querySelector(captionSelector);
|
||
|
|
if (!el) return false;
|
||
|
|
const obs = new MutationObserver(() => scan(el));
|
||
|
|
obs.observe(el, { childList: true, subtree: true, characterData: true });
|
||
|
|
scan(el);
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Try now and retry on interval — the caption region only appears after
|
||
|
|
// captions are enabled and someone speaks.
|
||
|
|
if (!attach()) {
|
||
|
|
const iv = setInterval(() => { if (attach()) clearInterval(iv); }, 1500);
|
||
|
|
}
|
||
|
|
|
||
|
|
window.__hermesMeetDrain = () => {
|
||
|
|
const out = window.__hermesMeetQueue.slice();
|
||
|
|
window.__hermesMeetQueue = [];
|
||
|
|
return out;
|
||
|
|
};
|
||
|
|
})();
|
||
|
|
"""
|
||
|
|
|
||
|
|
|
||
|
|
def _enable_captions_js() -> str:
|
||
|
|
"""Return a small JS snippet that tries to click the 'Turn on captions' button.
|
||
|
|
|
||
|
|
Best-effort — Meet's caption toggle is keyboard-accessible via ``c``. We
|
||
|
|
dispatch that keystroke as a cheap fallback. Real click targeting is too
|
||
|
|
brittle to rely on.
|
||
|
|
"""
|
||
|
|
return r"""
|
||
|
|
(() => {
|
||
|
|
const ev = new KeyboardEvent('keydown', {
|
||
|
|
key: 'c', code: 'KeyC', keyCode: 67, which: 67, bubbles: true,
|
||
|
|
});
|
||
|
|
document.body.dispatchEvent(ev);
|
||
|
|
return true;
|
||
|
|
})();
|
||
|
|
"""
|
||
|
|
|
||
|
|
|
||
|
|
def _start_realtime_speaker(
|
||
|
|
*,
|
||
|
|
rt: dict,
|
||
|
|
out_dir: Path,
|
||
|
|
bridge_info: dict,
|
||
|
|
api_key: str,
|
||
|
|
model: str,
|
||
|
|
voice: str,
|
||
|
|
instructions: str,
|
||
|
|
stop_flag: dict,
|
||
|
|
state: "_BotState",
|
||
|
|
) -> None:
|
||
|
|
"""Wire up the OpenAI Realtime session + speaker thread + PCM pump.
|
||
|
|
|
||
|
|
The speaker thread reads text lines from ``say_queue.jsonl``, sends each
|
||
|
|
to OpenAI Realtime, and writes PCM audio into ``speaker.pcm``. A
|
||
|
|
separate *pump* thread forwards that PCM into the OS audio sink so
|
||
|
|
Chrome's fake mic picks it up. On Linux we pipe to ``paplay`` against
|
||
|
|
the null-sink; on macOS the caller is expected to have the BlackHole
|
||
|
|
device selected as default input.
|
||
|
|
"""
|
||
|
|
try:
|
||
|
|
from plugins.google_meet.realtime.openai_client import (
|
||
|
|
RealtimeSession,
|
||
|
|
RealtimeSpeaker,
|
||
|
|
)
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"realtime import failed: {e}")
|
||
|
|
return
|
||
|
|
|
||
|
|
pcm_path = out_dir / SAY_PCM_FILENAME
|
||
|
|
queue_path = out_dir / SAY_QUEUE_FILENAME
|
||
|
|
processed_path = out_dir / "say_processed.jsonl"
|
||
|
|
# Reset the sink file so we start clean each session.
|
||
|
|
pcm_path.write_bytes(b"")
|
||
|
|
# Make sure the queue exists so the speaker poller doesn't error on
|
||
|
|
# first iteration.
|
||
|
|
queue_path.touch()
|
||
|
|
|
||
|
|
try:
|
||
|
|
session = RealtimeSession(
|
||
|
|
api_key=api_key,
|
||
|
|
model=model,
|
||
|
|
voice=voice,
|
||
|
|
instructions=instructions,
|
||
|
|
audio_sink_path=pcm_path,
|
||
|
|
sample_rate=24000,
|
||
|
|
)
|
||
|
|
session.connect()
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"realtime connect failed: {e}")
|
||
|
|
return
|
||
|
|
|
||
|
|
rt["session"] = session
|
||
|
|
|
||
|
|
def _stop_fn():
|
||
|
|
return stop_flag.get("stop", False)
|
||
|
|
|
||
|
|
rt["speaker_stop"] = lambda: stop_flag.__setitem__("stop", stop_flag.get("stop", False))
|
||
|
|
|
||
|
|
speaker = RealtimeSpeaker(
|
||
|
|
session=session,
|
||
|
|
queue_path=queue_path,
|
||
|
|
processed_path=processed_path,
|
||
|
|
)
|
||
|
|
|
||
|
|
def _speaker_loop():
|
||
|
|
try:
|
||
|
|
speaker.run_until_stopped(_stop_fn)
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"realtime speaker crashed: {e}")
|
||
|
|
|
||
|
|
t_speaker = threading.Thread(target=_speaker_loop, name="meet-speaker", daemon=True)
|
||
|
|
t_speaker.start()
|
||
|
|
rt["speaker_thread"] = t_speaker
|
||
|
|
|
||
|
|
# PCM pump: feeds speaker.pcm (24kHz s16le mono) into the OS audio
|
||
|
|
# device that Chrome's fake mic reads from. Different tools per
|
||
|
|
# platform, but the contract is the same — block-read the growing
|
||
|
|
# PCM file and stream it to the device in near-real-time.
|
||
|
|
platform_tag = (bridge_info or {}).get("platform")
|
||
|
|
if platform_tag == "linux":
|
||
|
|
import subprocess as _sp
|
||
|
|
|
||
|
|
sink = (bridge_info or {}).get("write_target") or "hermes_meet_sink"
|
||
|
|
try:
|
||
|
|
proc = _sp.Popen(
|
||
|
|
[
|
||
|
|
"paplay",
|
||
|
|
"--raw",
|
||
|
|
"--rate=24000",
|
||
|
|
"--format=s16le",
|
||
|
|
"--channels=1",
|
||
|
|
f"--device={sink}",
|
||
|
|
str(pcm_path),
|
||
|
|
],
|
||
|
|
stdin=_sp.DEVNULL,
|
||
|
|
stdout=_sp.DEVNULL,
|
||
|
|
stderr=_sp.DEVNULL,
|
||
|
|
)
|
||
|
|
rt["pcm_pump"] = proc
|
||
|
|
except FileNotFoundError:
|
||
|
|
state.set(error="paplay not found — install pulseaudio-utils for realtime on Linux")
|
||
|
|
elif platform_tag == "darwin":
|
||
|
|
# macOS: use ffmpeg to tail-read speaker.pcm and write it to the
|
||
|
|
# BlackHole output device. The user must have BlackHole selected
|
||
|
|
# as the default input in System Settings → Sound for Chrome to
|
||
|
|
# pick it up. We prefer ffmpeg because it's scriptable and can
|
||
|
|
# target AVFoundation devices by name; fall back to afplay-ing
|
||
|
|
# the file in a tight loop if ffmpeg is absent.
|
||
|
|
import shutil as _shutil
|
||
|
|
import subprocess as _sp
|
||
|
|
|
||
|
|
device_name = (bridge_info or {}).get("write_target") or "BlackHole 2ch"
|
||
|
|
if _shutil.which("ffmpeg"):
|
||
|
|
try:
|
||
|
|
# -re: read input at native frame rate.
|
||
|
|
# -f avfoundation -i: speaker path as raw PCM.
|
||
|
|
# -f s16le -ar 24000 -ac 1 -i <pcm>: interpret the file.
|
||
|
|
# -f audiotoolbox -audio_device_index: write to BlackHole.
|
||
|
|
# Simpler: output as raw via coreaudio using "-f audiotoolbox".
|
||
|
|
# ffmpeg's audiotoolbox output picks the current default
|
||
|
|
# output device, which isn't what we want. Instead we use
|
||
|
|
# -f avfoundation with the named device as OUTPUT via
|
||
|
|
# -vn and the device name.
|
||
|
|
proc = _sp.Popen(
|
||
|
|
[
|
||
|
|
"ffmpeg",
|
||
|
|
"-nostdin", "-hide_banner", "-loglevel", "error",
|
||
|
|
"-re",
|
||
|
|
"-f", "s16le", "-ar", "24000", "-ac", "1",
|
||
|
|
"-i", str(pcm_path),
|
||
|
|
"-f", "audiotoolbox",
|
||
|
|
"-audio_device_index", _mac_audio_device_index(device_name),
|
||
|
|
"-",
|
||
|
|
],
|
||
|
|
stdin=_sp.DEVNULL,
|
||
|
|
stdout=_sp.DEVNULL,
|
||
|
|
stderr=_sp.DEVNULL,
|
||
|
|
)
|
||
|
|
rt["pcm_pump"] = proc
|
||
|
|
except FileNotFoundError:
|
||
|
|
state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"macOS pcm pump failed to start: {e}")
|
||
|
|
else:
|
||
|
|
state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
|
||
|
|
|
||
|
|
|
||
|
|
def _mac_audio_device_index(device_name: str) -> str:
|
||
|
|
"""Return the ffmpeg ``-audio_device_index`` for *device_name*, as a string.
|
||
|
|
|
||
|
|
Probes ``ffmpeg -f avfoundation -list_devices true -i ''`` (which prints
|
||
|
|
the device table on stderr) and matches *device_name* case-insensitively.
|
||
|
|
Defaults to ``"0"`` if the device can't be found — caller will get a
|
||
|
|
misrouted stream but not a crash, and the error will be obvious.
|
||
|
|
"""
|
||
|
|
import subprocess as _sp
|
||
|
|
|
||
|
|
try:
|
||
|
|
out = _sp.run(
|
||
|
|
["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
|
||
|
|
capture_output=True,
|
||
|
|
text=True,
|
||
|
|
timeout=10,
|
||
|
|
)
|
||
|
|
except Exception:
|
||
|
|
return "0"
|
||
|
|
# ffmpeg prints the table on stderr. Lines look like:
|
||
|
|
# [AVFoundation indev @ 0x...] [0] BlackHole 2ch
|
||
|
|
import re as _re
|
||
|
|
|
||
|
|
needle = device_name.strip().lower()
|
||
|
|
for line in (out.stderr or "").splitlines():
|
||
|
|
m = _re.search(r"\[(\d+)\]\s+(.+)$", line)
|
||
|
|
if not m:
|
||
|
|
continue
|
||
|
|
if m.group(2).strip().lower() == needle:
|
||
|
|
return m.group(1)
|
||
|
|
return "0"
|
||
|
|
|
||
|
|
|
||
|
|
def run_bot() -> int: # noqa: C901 — orchestration, explicit branches
|
||
|
|
url = os.environ.get("HERMES_MEET_URL", "").strip()
|
||
|
|
out_dir_env = os.environ.get("HERMES_MEET_OUT_DIR", "").strip()
|
||
|
|
headed = os.environ.get("HERMES_MEET_HEADED", "").lower() in ("1", "true", "yes")
|
||
|
|
auth_state = os.environ.get("HERMES_MEET_AUTH_STATE", "").strip()
|
||
|
|
guest_name = os.environ.get("HERMES_MEET_GUEST_NAME", "Hermes Agent")
|
||
|
|
duration_s = _parse_duration(os.environ.get("HERMES_MEET_DURATION", ""))
|
||
|
|
# v2: optional realtime mode. Enabled when HERMES_MEET_MODE=realtime.
|
||
|
|
mode = os.environ.get("HERMES_MEET_MODE", "transcribe").strip().lower()
|
||
|
|
realtime_model = os.environ.get("HERMES_MEET_REALTIME_MODEL", "gpt-realtime")
|
||
|
|
realtime_voice = os.environ.get("HERMES_MEET_REALTIME_VOICE", "alloy")
|
||
|
|
realtime_instructions = os.environ.get("HERMES_MEET_REALTIME_INSTRUCTIONS", "")
|
||
|
|
realtime_api_key = os.environ.get("HERMES_MEET_REALTIME_KEY") or os.environ.get("OPENAI_API_KEY", "")
|
||
|
|
|
||
|
|
if not url or not _is_safe_meet_url(url):
|
||
|
|
sys.stderr.write(
|
||
|
|
"google_meet bot: refusing to launch — HERMES_MEET_URL must be a "
|
||
|
|
"meet.google.com URL. got: %r\n" % url
|
||
|
|
)
|
||
|
|
return 2
|
||
|
|
if not out_dir_env:
|
||
|
|
sys.stderr.write("google_meet bot: HERMES_MEET_OUT_DIR is required\n")
|
||
|
|
return 2
|
||
|
|
|
||
|
|
out_dir = Path(out_dir_env)
|
||
|
|
meeting_id = _meeting_id_from_url(url)
|
||
|
|
state = _BotState(out_dir=out_dir, meeting_id=meeting_id, url=url)
|
||
|
|
|
||
|
|
# SIGTERM → exit cleanly so the parent ``meet_leave`` gets a finalized
|
||
|
|
# transcript. We set a flag instead of raising so the Playwright context
|
||
|
|
# teardown runs in the finally block below.
|
||
|
|
stop_flag = {"stop": False}
|
||
|
|
|
||
|
|
def _on_signal(_sig, _frame):
|
||
|
|
stop_flag["stop"] = True
|
||
|
|
|
||
|
|
signal.signal(signal.SIGTERM, _on_signal)
|
||
|
|
signal.signal(signal.SIGINT, _on_signal)
|
||
|
|
|
||
|
|
# v2 realtime: provision virtual audio device + start speaker thread.
|
||
|
|
# We track these in a dict so the finally block can tear them down
|
||
|
|
# regardless of how we exit. If anything in the realtime setup fails we
|
||
|
|
# fall back to transcribe mode with a status flag.
|
||
|
|
rt = {
|
||
|
|
"enabled": mode == "realtime",
|
||
|
|
"bridge": None, # AudioBridge | None
|
||
|
|
"bridge_info": None, # dict | None
|
||
|
|
"session": None, # RealtimeSession | None
|
||
|
|
"speaker_thread": None, # threading.Thread | None
|
||
|
|
"speaker_stop": None, # callable | None
|
||
|
|
}
|
||
|
|
if rt["enabled"]:
|
||
|
|
if not realtime_api_key:
|
||
|
|
state.set(error="realtime mode requested but no API key in HERMES_MEET_REALTIME_KEY/OPENAI_API_KEY — falling back to transcribe")
|
||
|
|
rt["enabled"] = False
|
||
|
|
else:
|
||
|
|
try:
|
||
|
|
from plugins.google_meet.audio_bridge import AudioBridge
|
||
|
|
bridge = AudioBridge()
|
||
|
|
rt["bridge_info"] = bridge.setup()
|
||
|
|
rt["bridge"] = bridge
|
||
|
|
state.set(realtime=True, realtime_device=rt["bridge_info"].get("device_name"))
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"audio bridge setup failed: {e} — falling back to transcribe")
|
||
|
|
rt["enabled"] = False
|
||
|
|
|
||
|
|
try:
|
||
|
|
from playwright.sync_api import sync_playwright
|
||
|
|
except ImportError as e:
|
||
|
|
state.set(error=f"playwright not installed: {e}", exited=True)
|
||
|
|
sys.stderr.write(
|
||
|
|
"google_meet bot: playwright is not installed. Run "
|
||
|
|
"`pip install playwright && python -m playwright install chromium`\n"
|
||
|
|
)
|
||
|
|
if rt["bridge"]:
|
||
|
|
rt["bridge"].teardown()
|
||
|
|
return 3
|
||
|
|
|
||
|
|
# Chrome env: if realtime is live on Linux, point PULSE_SOURCE at the
|
||
|
|
# virtual source so Chrome's fake mic reads the audio we generate.
|
||
|
|
chrome_env = os.environ.copy()
|
||
|
|
chrome_args = [
|
||
|
|
"--use-fake-ui-for-media-stream",
|
||
|
|
"--disable-blink-features=AutomationControlled",
|
||
|
|
]
|
||
|
|
if not rt["enabled"]:
|
||
|
|
# v1-style fake device (silence) — we don't care about mic content
|
||
|
|
# when we're not speaking.
|
||
|
|
chrome_args.insert(1, "--use-fake-device-for-media-stream")
|
||
|
|
elif rt["bridge_info"] and rt["bridge_info"].get("platform") == "linux":
|
||
|
|
chrome_env["PULSE_SOURCE"] = rt["bridge_info"].get("device_name", "")
|
||
|
|
|
||
|
|
try:
|
||
|
|
with sync_playwright() as pw:
|
||
|
|
# Playwright's launch() doesn't take env; we set PULSE_SOURCE
|
||
|
|
# via the process env before launch so the child Chrome inherits it.
|
||
|
|
for k, v in chrome_env.items():
|
||
|
|
os.environ[k] = v
|
||
|
|
browser = pw.chromium.launch(
|
||
|
|
headless=not headed,
|
||
|
|
args=chrome_args,
|
||
|
|
)
|
||
|
|
context_args = {
|
||
|
|
"viewport": {"width": 1280, "height": 800},
|
||
|
|
"user_agent": (
|
||
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||
|
|
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||
|
|
),
|
||
|
|
"permissions": ["microphone", "camera"],
|
||
|
|
}
|
||
|
|
if auth_state and Path(auth_state).is_file():
|
||
|
|
context_args["storage_state"] = auth_state
|
||
|
|
context = browser.new_context(**context_args)
|
||
|
|
page = context.new_page()
|
||
|
|
|
||
|
|
try:
|
||
|
|
page.goto(url, wait_until="domcontentloaded", timeout=30_000)
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"navigate failed: {e}", exited=True)
|
||
|
|
return 4
|
||
|
|
|
||
|
|
# Guest-mode: Meet shows a name field before "Ask to join". When
|
||
|
|
# we're authed, we instead see "Join now".
|
||
|
|
_try_guest_name(page, guest_name)
|
||
|
|
_click_join(page, state)
|
||
|
|
|
||
|
|
# Install caption observer and attempt to enable captions.
|
||
|
|
try:
|
||
|
|
page.evaluate(_enable_captions_js())
|
||
|
|
state.set(captions_enabled_attempted=True)
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
try:
|
||
|
|
page.evaluate(_CAPTION_OBSERVER_JS)
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"caption observer install failed: {e}")
|
||
|
|
|
||
|
|
# Note: in_call=False until admission is confirmed (we detect
|
||
|
|
# either the Leave button or the caption region, signalling we
|
||
|
|
# made it past the lobby).
|
||
|
|
state.set(captioning=True, join_attempted_at=time.time())
|
||
|
|
|
||
|
|
# v2 realtime: start the speaker thread reading from the
|
||
|
|
# plugin-side say queue. The thread reads JSONL lines written by
|
||
|
|
# meet_say, calls OpenAI Realtime, and streams the audio PCM to
|
||
|
|
# the virtual sink that Chrome's fake-mic is pointed at.
|
||
|
|
if rt["enabled"]:
|
||
|
|
_start_realtime_speaker(
|
||
|
|
rt=rt,
|
||
|
|
out_dir=out_dir,
|
||
|
|
bridge_info=rt["bridge_info"],
|
||
|
|
api_key=realtime_api_key,
|
||
|
|
model=realtime_model,
|
||
|
|
voice=realtime_voice,
|
||
|
|
instructions=realtime_instructions,
|
||
|
|
stop_flag=stop_flag,
|
||
|
|
state=state,
|
||
|
|
)
|
||
|
|
if rt["session"] is not None:
|
||
|
|
state.set(realtime_ready=True)
|
||
|
|
|
||
|
|
# Admission + drain loop. Runs until SIGTERM, duration expiry,
|
||
|
|
# or the page detects "You were removed / you left the
|
||
|
|
# meeting". Responsible for:
|
||
|
|
# * detecting admission (Leave button visible → in_call=True)
|
||
|
|
# * timing out stuck-in-lobby (default 5 minutes)
|
||
|
|
# * draining scraped captions into the transcript
|
||
|
|
# * triggering realtime barge-in when a human speaks while
|
||
|
|
# the bot is generating audio
|
||
|
|
# * periodically flushing realtime counters into status.json
|
||
|
|
deadline = (time.time() + duration_s) if duration_s else None
|
||
|
|
lobby_deadline = time.time() + float(
|
||
|
|
os.environ.get("HERMES_MEET_LOBBY_TIMEOUT", "300")
|
||
|
|
)
|
||
|
|
last_admission_check = 0.0
|
||
|
|
while not stop_flag["stop"]:
|
||
|
|
now = time.time()
|
||
|
|
if deadline and now > deadline:
|
||
|
|
state.set(leave_reason="duration_expired")
|
||
|
|
break
|
||
|
|
|
||
|
|
# Admission detection every ~3s until admitted.
|
||
|
|
if not state.in_call and (now - last_admission_check) > 3.0:
|
||
|
|
last_admission_check = now
|
||
|
|
admitted = _detect_admission(page)
|
||
|
|
if admitted:
|
||
|
|
state.set(
|
||
|
|
in_call=True,
|
||
|
|
lobby_waiting=False,
|
||
|
|
joined_at=now,
|
||
|
|
)
|
||
|
|
elif now > lobby_deadline:
|
||
|
|
state.set(
|
||
|
|
error=(
|
||
|
|
"lobby timeout — host never admitted the bot "
|
||
|
|
f"within {int(lobby_deadline - state.join_attempted_at) if state.join_attempted_at else 0}s"
|
||
|
|
),
|
||
|
|
leave_reason="lobby_timeout",
|
||
|
|
)
|
||
|
|
break
|
||
|
|
elif _detect_denied(page):
|
||
|
|
state.set(
|
||
|
|
error="host denied admission",
|
||
|
|
leave_reason="denied",
|
||
|
|
)
|
||
|
|
break
|
||
|
|
|
||
|
|
try:
|
||
|
|
queued = page.evaluate("window.__hermesMeetDrain && window.__hermesMeetDrain()")
|
||
|
|
if isinstance(queued, list):
|
||
|
|
for entry in queued:
|
||
|
|
if not isinstance(entry, dict):
|
||
|
|
continue
|
||
|
|
speaker = str(entry.get("speaker", ""))
|
||
|
|
text = str(entry.get("text", ""))
|
||
|
|
state.record_caption(speaker=speaker, text=text)
|
||
|
|
# Barge-in: if the bot is currently generating
|
||
|
|
# audio AND a real human just spoke, cancel the
|
||
|
|
# in-flight response so we don't talk over them.
|
||
|
|
if rt["enabled"] and rt["session"] is not None:
|
||
|
|
if _looks_like_human_speaker(speaker, guest_name):
|
||
|
|
try:
|
||
|
|
cancelled = rt["session"].cancel_response()
|
||
|
|
if cancelled:
|
||
|
|
state.set(last_barge_in_at=now)
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
except Exception:
|
||
|
|
# Meet reloaded or we got booted — try to detect and
|
||
|
|
# exit gracefully rather than spinning.
|
||
|
|
if page.is_closed():
|
||
|
|
state.set(leave_reason="page_closed")
|
||
|
|
break
|
||
|
|
|
||
|
|
# Fold the realtime session's byte/timestamp counters into
|
||
|
|
# the status file so meet_status can surface them.
|
||
|
|
if rt["session"] is not None:
|
||
|
|
state.set(
|
||
|
|
audio_bytes_out=getattr(rt["session"], "audio_bytes_out", 0),
|
||
|
|
last_audio_out_at=getattr(rt["session"], "last_audio_out_at", None),
|
||
|
|
)
|
||
|
|
|
||
|
|
time.sleep(1.0)
|
||
|
|
|
||
|
|
# Try to leave cleanly — click "Leave call" button if present.
|
||
|
|
try:
|
||
|
|
page.evaluate(
|
||
|
|
"() => { const b = document.querySelector('button[aria-label*=\"eave call\"]');"
|
||
|
|
" if (b) b.click(); }"
|
||
|
|
)
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
|
||
|
|
context.close()
|
||
|
|
browser.close()
|
||
|
|
# v2: teardown realtime speaker + audio bridge.
|
||
|
|
if rt["speaker_stop"]:
|
||
|
|
try:
|
||
|
|
rt["speaker_stop"]()
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
if rt["speaker_thread"] is not None:
|
||
|
|
try:
|
||
|
|
rt["speaker_thread"].join(timeout=5.0)
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
if rt["session"]:
|
||
|
|
try:
|
||
|
|
rt["session"].close()
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
if rt["bridge"]:
|
||
|
|
try:
|
||
|
|
rt["bridge"].teardown()
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
state.set(in_call=False, captioning=False, exited=True)
|
||
|
|
return 0
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
state.set(error=f"unhandled: {e}", exited=True)
|
||
|
|
return 1
|
||
|
|
|
||
|
|
|
||
|
|
def _try_guest_name(page, guest_name: str) -> None:
|
||
|
|
"""If Meet is showing a guest-name input, type *guest_name* into it."""
|
||
|
|
try:
|
||
|
|
# Meet's guest name input has placeholder "Your name".
|
||
|
|
locator = page.locator('input[aria-label*="name" i]').first
|
||
|
|
if locator.count() and locator.is_visible():
|
||
|
|
locator.fill(guest_name, timeout=2_000)
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
|
||
|
|
|
||
|
|
def _detect_admission(page) -> bool:
|
||
|
|
"""True if we're clearly past the lobby and in the call itself.
|
||
|
|
|
||
|
|
Uses a JS-side probe because Meet's DOM structure varies by client
|
||
|
|
version. We check several high-signal indicators and declare admission
|
||
|
|
on the first hit:
|
||
|
|
|
||
|
|
1. Leave-call button is present (``aria-label`` contains "eave call").
|
||
|
|
2. Caption region has appeared (we installed the observer and it attached).
|
||
|
|
3. The participant list container is visible.
|
||
|
|
|
||
|
|
Conservative by default — returns False on any error.
|
||
|
|
"""
|
||
|
|
probe = r"""
|
||
|
|
(() => {
|
||
|
|
const leave = document.querySelector('button[aria-label*="eave call" i]');
|
||
|
|
if (leave) return true;
|
||
|
|
if (window.__hermesMeetInstalled) {
|
||
|
|
const caps = document.querySelector(
|
||
|
|
'[role="region"][aria-label*="aption" i], ' +
|
||
|
|
'div[jsname="YSxPC"], div[jsname="tgaKEf"]'
|
||
|
|
);
|
||
|
|
if (caps) return true;
|
||
|
|
}
|
||
|
|
const parts = document.querySelector('[aria-label*="articipants" i]');
|
||
|
|
if (parts) return true;
|
||
|
|
return false;
|
||
|
|
})();
|
||
|
|
"""
|
||
|
|
try:
|
||
|
|
return bool(page.evaluate(probe))
|
||
|
|
except Exception:
|
||
|
|
return False
|
||
|
|
|
||
|
|
|
||
|
|
def _detect_denied(page) -> bool:
|
||
|
|
"""True when Meet is showing a 'you were denied' / 'no one admitted' page."""
|
||
|
|
probe = r"""
|
||
|
|
(() => {
|
||
|
|
const text = document.body ? document.body.innerText || '' : '';
|
||
|
|
// English only — matches what shows up when the host denies or
|
||
|
|
// removes a guest.
|
||
|
|
if (/You can't join this video call/i.test(text)) return true;
|
||
|
|
if (/You were removed from the meeting/i.test(text)) return true;
|
||
|
|
if (/No one responded to your request to join/i.test(text)) return true;
|
||
|
|
return false;
|
||
|
|
})();
|
||
|
|
"""
|
||
|
|
try:
|
||
|
|
return bool(page.evaluate(probe))
|
||
|
|
except Exception:
|
||
|
|
return False
|
||
|
|
|
||
|
|
|
||
|
|
def _looks_like_human_speaker(speaker: str, bot_guest_name: str) -> bool:
|
||
|
|
"""Whether a caption line's speaker is probably a human, not our bot echo.
|
||
|
|
|
||
|
|
Meet attributes captions to the speaker's display name. When Chrome is
|
||
|
|
reading our fake mic, Meet still attributes captions to *our* bot name
|
||
|
|
(because the bot is the one "speaking"). We don't want those to trigger
|
||
|
|
barge-in. Anything else — real participant names — does.
|
||
|
|
|
||
|
|
Conservative: unknown / blank speakers (common when caption scraping
|
||
|
|
falls back to raw text) do NOT trigger barge-in, because we can't tell
|
||
|
|
whether it was a human or us.
|
||
|
|
"""
|
||
|
|
if not speaker or not speaker.strip():
|
||
|
|
return False
|
||
|
|
spk = speaker.strip().lower()
|
||
|
|
if spk in ("unknown", "you", bot_guest_name.strip().lower()):
|
||
|
|
return False
|
||
|
|
return True
|
||
|
|
|
||
|
|
|
||
|
|
def _click_join(page, state: _BotState) -> None:
|
||
|
|
"""Click 'Join now' or 'Ask to join' if either button is visible.
|
||
|
|
|
||
|
|
Flags ``lobby_waiting`` when we hit the "waiting for host to admit you"
|
||
|
|
state so the agent can surface that in status.
|
||
|
|
"""
|
||
|
|
for label in ("Join now", "Ask to join"):
|
||
|
|
try:
|
||
|
|
btn = page.get_by_role("button", name=label, exact=False).first
|
||
|
|
if btn.count() and btn.is_visible():
|
||
|
|
btn.click(timeout=3_000)
|
||
|
|
if label == "Ask to join":
|
||
|
|
state.set(lobby_waiting=True)
|
||
|
|
break
|
||
|
|
except Exception:
|
||
|
|
continue
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_duration(raw: str) -> Optional[float]:
|
||
|
|
"""Parse ``30m`` / ``2h`` / ``90`` (seconds) → float seconds, or None."""
|
||
|
|
if not raw:
|
||
|
|
return None
|
||
|
|
raw = raw.strip().lower()
|
||
|
|
try:
|
||
|
|
if raw.endswith("h"):
|
||
|
|
return float(raw[:-1]) * 3600
|
||
|
|
if raw.endswith("m"):
|
||
|
|
return float(raw[:-1]) * 60
|
||
|
|
if raw.endswith("s"):
|
||
|
|
return float(raw[:-1])
|
||
|
|
return float(raw)
|
||
|
|
except ValueError:
|
||
|
|
return None
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__": # pragma: no cover — subprocess entry point
|
||
|
|
sys.exit(run_bot())
|