feat(desktop): add context usage breakdown popover

Let users click the status bar context indicator to see how tokens are split across system prompt, tools, rules, skills, MCP, and conversation.

Co-authored-by: Cursor <cursoragent@cursor.com>
test(auxiliary): cover NVIDIA NIM max_tokens in _build_call_kwargs
2026-06-29 23:05:20 +08:00 · 2026-06-29 09:18:10 -04:00 · 2026-06-29 18:04:39 +05:30 · 2026-06-29 18:04:39 +05:30 · 2026-06-29 21:33:00 +10:00 · 2026-06-29 04:25:51 -07:00
609 changed files with 43565 additions and 6259 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -66,8 +66,12 @@ runtime/

 # ---------- Not needed inside the Docker image ----------

-# Desktop app source (Tauri/Electron); never installed in the container
+# Desktop app source (Tauri/Electron); never installed in the container.
+# apps/shared is the dashboard↔desktop websocket helper and is linked from
+# web/package.json as a file: workspace dep — keep it in the build context.
 apps/
+!apps/shared/
+!apps/shared/**

 # Test suite — not shipped in production images
 tests/
--- a/.envrc
+++ b/.envrc
@@ -1,5 +1,5 @@
 watch_file pyproject.toml uv.lock
 watch_file package-lock.json package.json web/package.json ui-tui/package.json website/package.json apps/shared/package.json apps/desktop/package.json ui-tui/packages/hermes-ink/package.json
-watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix
+watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix nix/hermes-agent.nix nix/desktop.nix

 use flake
--- a/.github/actions/hermes-smoke-test/action.yml
+++ b/.github/actions/hermes-smoke-test/action.yml
@@ -1,50 +0,0 @@
-name: Hermes smoke test
-description: >
-  Run the image's built-in entrypoint against `--help` and `dashboard --help`
-  to catch basic runtime regressions before publishing.  Requires the image
-  to already be loaded into the local Docker daemon under `image`.
-
-  Works identically on amd64 and arm64 runners.
-
-inputs:
-  image:
-    description: Fully-qualified image tag (e.g. nousresearch/hermes-agent:test)
-    required: true
-
-runs:
-  using: composite
-  steps:
-    - name: Ensure /tmp/hermes-test is hermes-writable
-      shell: bash
-      run: |
-        # The image runs as the hermes user (UID 10000).  GitHub Actions
-        # creates /tmp/hermes-test root-owned by default, which hermes
-        # can't write to — chown it to match the in-container UID before
-        # bind-mounting.  Real users doing `docker run -v ~/.hermes:...`
-        # with their own UID hit the same issue and have their own
-        # remediations (HERMES_UID env var, or chown locally).
-        mkdir -p /tmp/hermes-test
-        sudo chown -R 10000:10000 /tmp/hermes-test
-
-    - name: hermes --help
-      shell: bash
-      run: |
-        # Use the image's real ENTRYPOINT (/init + main-wrapper.sh) so
-        # this exercises the actual production startup path. PR #30136
-        # review caught that an --entrypoint override here had been
-        # silently neutered by the s6-overlay migration — stage2-hook
-        # ignores its CMD args, so the smoke test was a no-op.
-        docker run --rm \
-          -v /tmp/hermes-test:/opt/data \
-          "${{ inputs.image }}" --help
-
-    - name: hermes dashboard --help
-      shell: bash
-      run: |
-        # Regression guard for #9153: dashboard was present in source but
-        # missing from the published image.  If this fails, something in
-        # the Dockerfile is excluding the dashboard subcommand from the
-        # installed package.
-        docker run --rm \
-          -v /tmp/hermes-test:/opt/data \
-          "${{ inputs.image }}" dashboard --help
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,6 +20,7 @@ permissions:
  pull-requests: write # needed by lint (PR comment) + supply-chain (PR comment)
  actions: read # needed by osv-scanner (SARIF upload)
  security-events: write # needed by osv-scanner (SARIF upload)
+  packages: write # needed by docker build

 concurrency:
  group: ci-${{ github.ref }}
@@ -32,6 +33,7 @@ jobs:
  # (all lanes true) so post-merge validation is never weakened.
  # ─────────────────────────────────────────────────────────────────────
  detect:
+    name: Detect affected areas
    runs-on: ubuntu-latest
    outputs:
      python: ${{ steps.classify.outputs.python }}
@@ -53,11 +55,15 @@ jobs:
  # Skipped workflows (if condition is false) don't spin up runners.
  # ─────────────────────────────────────────────────────────────────────
  tests:
+    name: Python tests
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/tests.yml
+    with:
+      slice_count: 8

  lint:
+    name: Python lints
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/lint.yml
@@ -65,35 +71,49 @@ jobs:
      event_name: ${{ needs.detect.outputs.event_name }}

  typecheck:
+    name: TypeScript
    needs: detect
    if: needs.detect.outputs.frontend == 'true'
    uses: ./.github/workflows/typecheck.yml

  docs-site:
+    name: Docs Site
    needs: detect
    if: needs.detect.outputs.site == 'true'
    uses: ./.github/workflows/docs-site-checks.yml

  history-check:
+    name: Deny unrelated histories
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request'
    uses: ./.github/workflows/history-check.yml

  contributor-check:
+    name: Check contributors
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/contributor-check.yml

  uv-lockfile:
+    name: Check uv.lock
    needs: detect
    uses: ./.github/workflows/uv-lockfile-check.yml

  docker-lint:
+    name: Lint Docker scripts
    needs: detect
    if: needs.detect.outputs.docker_meta == 'true'
    uses: ./.github/workflows/docker-lint.yml

+  docker:
+    name: Build&Test Docker image
+    needs: detect
+    if: needs.detect.outputs.python == 'true' || needs.detect.outputs.frontend == 'true' || needs.detect.outputs.docker_meta == 'true'
+    uses: ./.github/workflows/docker.yml
+    secrets: inherit
+
  supply-chain:
+    name: Supply-chain scan
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request' && (needs.detect.outputs.scan == 'true' || needs.detect.outputs.deps == 'true' || needs.detect.outputs.mcp_catalog == 'true')
    uses: ./.github/workflows/supply-chain-audit.yml
@@ -104,7 +124,7 @@ jobs:
      mcp_catalog: ${{ needs.detect.outputs.mcp_catalog == 'true' }}

  osv-scanner:
-    needs: detect
+    name: OSV scan
    uses: ./.github/workflows/osv-scanner.yml

  # ─────────────────────────────────────────────────────────────────────
@@ -127,6 +147,8 @@ jobs:
      - docker-lint
      - supply-chain
      - osv-scanner
+      # The docker job is intentionally not a required check for now because it is slow.
+      # - docker
    if: always()
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/docker-lint.yml
+++ b/.github/workflows/docker-lint.yml
@@ -2,7 +2,7 @@ name: Docker / shell lint

 # Lints the container build inputs: Dockerfile (via hadolint) and any shell
 # scripts under docker/ (via shellcheck). These catch the class of regression
-# the behavioral docker-publish smoke test can't — unquoted variable
+# the behavioral docker smoke test can't — unquoted variable
 # expansions, silently-failing RUN commands, etc.
 #
 # Rules and ignores are documented in .hadolint.yaml at the repo root.
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,24 +1,9 @@
-name: Docker Build and Publish
+name: Docker Build, Test, and Publish

 on:
-  push:
-    branches: [main]
-    paths:
-      - '**/*.py'
-      - 'pyproject.toml'
-      - 'uv.lock'
-      - 'Dockerfile'
-      - 'docker/**'
-      - '.github/workflows/docker-publish.yml'
-      - '.github/actions/hermes-smoke-test/**'
-
-  # No paths filter — the job must always run so the required check
-  # reports a status (path-gated workflows leave checks "pending" forever
-  # when no matching files change, which blocks merge).
-  pull_request:
-
  release:
    types: [published]
+  workflow_call:

 permissions:
  contents: read
@@ -39,11 +24,7 @@ env:
  IMAGE_NAME: nousresearch/hermes-agent

 jobs:
-  # ---------------------------------------------------------------------------
-  # Build amd64 natively.  This job also runs the smoke tests (basic --help
-  # and the dashboard subcommand regression guard from #9153), because amd64
-  # is the only arch we can `load` into the local daemon on an amd64 runner.
-  # ---------------------------------------------------------------------------
+  # Build, test, and optionally push the amd64 image.
  build-amd64:
    # Only run on the upstream repository, not on forks
    if: github.repository == 'NousResearch/hermes-agent'
@@ -53,24 +34,19 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      # The image build + smoke test + integration tests run ONLY on
-      # push-to-main and release — never on PRs. They are the heaviest jobs
-      # in CI (~15-45 min) and a broken build surfaces on the main push (and
-      # is gated pre-merge by docker-lint + uv-lockfile-check). Every step
-      # below is skipped on PRs, so the job still reports green and the
-      # required check never hangs.
+      # The image build + integration tests run on every event
+      # (PRs, push-to-main, release). Publish steps below are gated to
+      # push-to-main / release only.
      - name: Set up Docker Buildx
-        if: github.event_name != 'pull_request'
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3

-      # Build once, load into the local daemon for smoke testing.  Cached
+      # Build once, load into the local daemon for testing.  Cached
      # to gha with a per-arch scope; the push step below reuses every
      # layer from this build.
-      - name: Build image (amd64, smoke test)
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+      - name: Build image (amd64)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -82,25 +58,12 @@ jobs:
          cache-from: type=gha,scope=docker-amd64
          cache-to: type=gha,mode=max,scope=docker-amd64

-      - name: Smoke test image
-        if: github.event_name != 'pull_request'
-        uses: ./.github/actions/hermes-smoke-test
-        with:
-          image: ${{ env.IMAGE_NAME }}:test
-
-      # ---------------------------------------------------------------------
      # Run the docker-integration test suite against the freshly-built
-      # image already loaded into the local daemon (`:test`).  These tests
-      # are excluded from the sharded `tests.yml :: test` matrix on purpose
-      # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each
-      # shard would otherwise reach the session-scoped ``built_image``
-      # fixture in ``tests/docker/conftest.py`` and start a 3-7min
-      # ``docker build`` — guaranteed to
-      # die in fixture setup.
+      # image already loaded into the local daemon (`:test`).
      #
-      # Piggybacking here avoids a second image build: the smoke test
-      # already proved the image loads + runs, so the daemon has it under
-      # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at
+      # Piggybacking here avoids a second image build: the build step
+      # already loaded the image into the daemon under
+      # `${IMAGE_NAME}:test`, so we just point ``HERMES_TEST_IMAGE`` at
      # that.  The fixture's ``HERMES_TEST_IMAGE`` branch (see
      # tests/docker/conftest.py:62-63) short-circuits the rebuild.
      #
@@ -110,26 +73,20 @@ jobs:
      # cheapest path to coverage on every PR that touches docker code.
      # ---------------------------------------------------------------------
      - name: Install uv (for docker tests)
-        if: github.event_name != 'pull_request'
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Set up Python 3.11 (for docker tests)
-        if: github.event_name != 'pull_request'
        run: uv python install 3.11

      - name: Install Python dependencies (for docker tests)
-        if: github.event_name != 'pull_request'
        run: |
-          uv venv .venv --python 3.11
-          source .venv/bin/activate
          # ``dev`` extra pulls in pytest, pytest-asyncio —
          # everything tests/docker/ needs.  We deliberately avoid ``all``
          # here because the docker tests only drive the container via
          # subprocess and don't import hermes_agent's optional deps.
-          uv pip install -e ".[dev]"
+          uv sync --locked --python 3.11 --extra dev

      - name: Run docker integration tests
-        if: github.event_name != 'pull_request'
        env:
          # Skip rebuild; use the image already loaded by the build step.
          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
@@ -139,12 +96,11 @@ jobs:
          OPENAI_API_KEY: ""
          NOUS_API_KEY: ""
        run: |
-          source .venv/bin/activate
-          python -m pytest tests/docker/ -v --tb=short
+          scripts/run_tests.sh tests/docker/ --file-timeout 600

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -155,7 +111,7 @@ jobs:
      - name: Push amd64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -179,7 +135,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: digest-amd64
          path: /tmp/digests/*
@@ -187,10 +143,7 @@ jobs:
          retention-days: 1

  # ---------------------------------------------------------------------------
-  # Build arm64 natively on GitHub's free arm64 runner.  This replaces the
-  # previous QEMU-emulated arm64 build, which was ~5-10x slower and shared
-  # a cache scope with amd64.  Matches the amd64 job's shape: build+load,
-  # smoke test, then on push/release push by digest.
+  # Build, test, and optionally push the arm64 image.
  # ---------------------------------------------------------------------------
  build-arm64:
    if: github.repository == 'NousResearch/hermes-agent'
@@ -200,29 +153,26 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      # arm64 build runs only on push-to-main and release (see build-amd64).
      - name: Set up Docker Buildx
-        if: github.event_name != 'pull_request'
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3

      # Log in to ghcr.io so the registry-backed build cache below can be
      # read (cache-from) on every event and written (cache-to) on
      # push/release.  Uses the workflow's GITHUB_TOKEN, which is valid for
      # the whole job — unlike the gha cache backend's short-lived Azure SAS
      # token, which expired mid-build on slow cold-cache arm64 runs and
-      # crashed the build before the smoke test (the reason the gha cache
+      # crashed the build before the tests ran (the reason the gha cache
      # was removed from arm64 PRs in the first place).
      - name: Log in to ghcr.io (build cache)
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # Build once, load into the local daemon for smoke testing, then push
+      # Build once, load into the local daemon for testing, then push
      # by digest below. Reads AND writes the registry-backed cache so the
      # push reuses layers from this build and the next build starts warm.
      #
@@ -230,9 +180,8 @@ jobs:
      # cache that previously broke here: its credential is the job-lifetime
      # GITHUB_TOKEN, not a short-lived SAS token, so the cold-build-outlives-
      # token failure mode cannot recur.
-      - name: Build image (arm64, smoke test, cached publish)
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+      - name: Build image (arm64, cached publish)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -244,15 +193,29 @@ jobs:
          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
          cache-to: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64,mode=max

-      - name: Smoke test image
-        if: github.event_name != 'pull_request'
-        uses: ./.github/actions/hermes-smoke-test
-        with:
-          image: ${{ env.IMAGE_NAME }}:test
+      - name: Install uv for docker tests
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+
+      - name: Set up Python 3.11 for docker tests
+        run: uv python install 3.11
+
+      - name: Install Python dependencies for docker tests
+        run: |
+          uv sync --locked --python 3.11 --extra dev
+
+      - name: Run docker tests
+        env:
+          # Skip rebuild; use the image already loaded by the build step.
+          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
+          OPENROUTER_API_KEY: ""
+          OPENAI_API_KEY: ""
+          NOUS_API_KEY: ""
+        run: |
+          scripts/run_tests.sh tests/docker/ --file-timeout 600

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -260,7 +223,7 @@ jobs:
      - name: Push arm64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -282,7 +245,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: digest-arm64
          path: /tmp/digests/*
@@ -304,17 +267,17 @@ jobs:
    timeout-minutes: 10
    steps:
      - name: Download digests
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          path: /tmp/digests
          pattern: digest-*
          merge-multiple: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3

      - name: Log in to Docker Hub
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -37,7 +37,7 @@ jobs:
          fetch-depth: 0 # need full history for merge-base + worktree

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Install ruff + ty
        uses: ./.github/actions/retry
@@ -110,7 +110,7 @@ jobs:
          cat .lint-reports/summary.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload reports as artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: lint-reports
          path: .lint-reports/
@@ -164,7 +164,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Install ruff
        uses: ./.github/actions/retry
--- a/.github/workflows/skills-index.yml
+++ b/.github/workflows/skills-index.yml
@@ -3,17 +3,17 @@ name: Build Skills Index
 on:
  schedule:
    # Run twice daily: 6 AM and 6 PM UTC
-    - cron: '0 6,18 * * *'
-  workflow_dispatch:  # Manual trigger
+    - cron: "0 6,18 * * *"
+  workflow_dispatch: # Manual trigger
  push:
    branches: [main]
    paths:
-      - 'scripts/build_skills_index.py'
-      - '.github/workflows/skills-index.yml'
+      - "scripts/build_skills_index.py"
+      - ".github/workflows/skills-index.yml"

 permissions:
  contents: read
-  actions: write   # to trigger deploy-site.yml on schedule
+  actions: write # to trigger deploy-site.yml on schedule

 jobs:
  build-index:
@@ -21,11 +21,11 @@ jobs:
    if: github.repository == 'NousResearch/hermes-agent'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
-          python-version: '3.11'
+          python-version: "3.11"

      - name: Install dependencies
        run: pip install httpx==0.28.1 pyyaml==6.0.2
@@ -36,7 +36,7 @@ jobs:
        run: python scripts/build_skills_index.py

      - name: Upload index artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: skills-index
          path: website/static/api/skills-index.json
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,6 +2,11 @@ name: Tests

 on:
  workflow_call:
+    inputs:
+      slice_count:
+        description: Number of parallel test slices
+        type: number
+        default: 8

 permissions:
  contents: read
@@ -12,13 +17,11 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  test:
+  generate:
+    name: "Generate slices"
    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        slice: [1, 2, 3, 4, 5, 6]
+    outputs:
+      matrix: ${{ steps.matrix.outputs.matrix }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -27,13 +30,26 @@ jobs:
        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: test_durations.json
-          # main always writes a new suffix, but jobs pick the latest one with the same prefix
-          # quote from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses
-          # If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys.
-          # If there are no exact matches, the action searches for partial matches of the restore keys.
-          # When the action finds a partial match, the most recent cache is restored to the path directory.
          key: test-durations

+      - name: Generate test slices
+        id: matrix
+        run: |
+          MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }})
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  test:
+    name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}
+    needs: generate
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.generate.outputs.matrix) }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
      - name: Install ripgrep (prebuilt binary)
        run: |
          set -euo pipefail
@@ -49,7 +65,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
@@ -78,33 +94,19 @@ jobs:
        # re-download, keeping the persisted cache small and fast to restore.
        run: uv cache prune --ci

-      - name: Run tests (slice ${{ matrix.slice }}/6)
-        # Per-file isolation via scripts/run_tests_parallel.py: discovers
-        # every test_*.py file under tests/ (excluding integration/ + e2e/),
-        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
+      - name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }})
+        # Per-file isolation via scripts/run_tests.sh: each test file runs
+        # in its own freshly-spawned `python -m pytest <file>` subprocess
        # with bounded parallelism. No xdist, no shared workers, no
        # module-level state leakage between files.
        #
-        # Why per-file (not per-test): per-test spawn cost (~250ms × 17k
-        # tests = 70min CPU minimum) blew the wall-clock budget. Per-file
-        # spawn (~250ms × ~850 files = ~3.5min) fits while still giving
-        # every file a fresh interpreter — the only isolation boundary
-        # that matters in practice (cross-file leakage was the original
-        # flake source; intra-file is the test author's responsibility).
-        #
-        # Why drop xdist entirely: xdist's persistent workers accumulate
-        # state across files, which is exactly the leakage we wanted to
-        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
-        # the job with cleaner semantics.
-        #
-        # Matrix slicing (--slice I/N): files are distributed across 6
-        # jobs by cached duration (LPT algorithm) so each job gets
-        # roughly equal wall time. Without a cache, files default to 2s
-        # estimate and get split roughly evenly by count — still correct,
-        # just not perfectly balanced.
+        # File list is pre-computed by the generate job (--generate-slices)
+        # which runs LPT distribution once and passes the file list to each
+        # matrix job via --files. Previously each job re-discovered files and
+        # re-ran LPT independently — redundant N times.
        run: |
          source .venv/bin/activate
-          python scripts/run_tests_parallel.py --slice ${{ matrix.slice }}/6
+          scripts/run_tests.sh --files '${{ matrix.slice.files }}'
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@@ -114,7 +116,7 @@ jobs:
      - name: Upload per-slice durations
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: test-durations-slice-${{ matrix.slice }}
+          name: test-durations-slice-${{ matrix.slice.index }}
          path: test_durations.json
          retention-days: 1

@@ -173,7 +175,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
--- a/.github/workflows/typecheck.yml
+++ b/.github/workflows/typecheck.yml
@@ -6,6 +6,7 @@ on:

 jobs:
  typecheck:
+    name: Check TypeScript
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -22,8 +23,7 @@ jobs:
      # native builds. Skipping install scripts drops node-pty's node-gyp
      # header fetch — the transient flake that killed this job pre-`tsc` — and
      # is faster. retry covers the remaining registry blips.
-      - 
-        uses: ./.github/actions/retry
+      - uses: ./.github/actions/retry
        with:
          command: npm ci --ignore-scripts
      - run: npm run --prefix ${{ matrix.package }} typecheck
@@ -35,6 +35,7 @@ jobs:
  # users build apps/desktop from source on install/update. Run the real
  # `vite build` here so that class of break fails in CI instead.
  desktop-build:
+    name: Build desktop app
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -44,8 +45,7 @@ jobs:
          cache: npm
      # Keep install scripts here: the production build may need node-pty's
      # native binary. retry handles the transient install-time fetch flakes.
-      - 
-        uses: ./.github/actions/retry
+      - uses: ./.github/actions/retry
        with:
          command: npm ci
      - run: npm run --prefix apps/desktop build
--- a/.github/workflows/upload_to_pypi.yml
+++ b/.github/workflows/upload_to_pypi.yml
@@ -5,11 +5,11 @@ name: Publish to PyPI
 on:
  push:
    tags:
-      - 'v20*'  # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
+      - "v20*" # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
  workflow_dispatch:
    inputs:
      confirm_tag:
-        description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
+        description: "Tag to publish (e.g. v2026.5.15). Must already exist."
        required: true
        type: string

@@ -27,7 +27,7 @@ jobs:
    name: Build distribution 📦
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
          # On workflow_dispatch, check out the confirmed tag.
@@ -43,17 +43,17 @@ jobs:
          fi

      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
-          python-version: '3.13'
+          python-version: "3.13"

      - name: Install uv
-        uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e  # v6
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
        with:
-          node-version: '22'
+          node-version: "22"

      - name: Build web dashboard
        run: cd web && npm ci && npm run build
@@ -81,7 +81,7 @@ jobs:
        run: uv build --sdist --wheel

      - name: Upload distribution artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: python-package-distributions
          path: dist/
@@ -94,17 +94,17 @@ jobs:
      name: pypi
      url: https://pypi.org/p/hermes-agent
    permissions:
-      id-token: write  # OIDC trusted publishing
+      id-token: write # OIDC trusted publishing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b  # v1.14.0
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
        with:
          skip-existing: true

@@ -116,12 +116,12 @@ jobs:
    needs: publish
    runs-on: ubuntu-latest
    permissions:
-      contents: write   # attach assets to the existing release
-      id-token: write   # sigstore signing
+      contents: write # attach assets to the existing release
+      id-token: write # sigstore signing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -145,7 +145,7 @@ jobs:

      - name: Sign with Sigstore
        if: env.skip_sign != 'true'
-        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc  # v3.3.0
+        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc # v3.3.0
        with:
          inputs: >-
            ./dist/*.tar.gz
--- a/.github/workflows/uv-lockfile-check.yml
+++ b/.github/workflows/uv-lockfile-check.yml
@@ -4,7 +4,7 @@ name: uv.lock check
 # that modify pyproject.toml without regenerating uv.lock (or vice versa)
 # must not merge, because the Docker build's `uv sync --frozen` step will
 # fail on a stale lockfile and we'd rather catch it here than in the
-# docker-publish workflow on main.
+# docker workflow on main.
 #
 # ─────────────────────────────────────────────────────────────────────────
 # IMPORTANT: this check runs against the MERGED state, not just your branch
@@ -63,7 +63,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      # `uv lock --check` re-resolves the project from pyproject.toml and
      # compares the result to uv.lock, exiting non-zero if they disagree.
@@ -100,7 +100,7 @@ jobs:

          This check is blocking because the Docker image build uses
          `uv sync --frozen --extra all`, which rejects stale lockfiles
-          — catching it here avoids a ~15 min failed docker-publish run
+          — catching it here avoids a ~15 min failed docker run
          on `main` post-merge.
          EOF
            echo "::error title=uv.lock out of sync::Run \`uv lock\` locally and commit the result. If on a PR, sync with main first."
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,9 @@ RELEASE_v*.md
 # Desktop demo-run scratch output (hermes writes demo/*.txt during recorded
 # walkthroughs). Throwaway artifacts, never part of the app.
 apps/desktop/demo/
+
+# PR infographics are rendered locally and embedded in PR descriptions via the
+# image-provider (fal.media) URL — they are NEVER committed to the repo. The
+# PR body is the archive. See the hermes-agent-dev skill's
+# pr-infographic-workflow reference (storage rule + lapse #8 / #COMMIT-1).
+infographic/
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -123,6 +123,17 @@ conservative at the waist.
  without E2E proof, and plugins that touch core files.** Plugins live in their
  own directory and work within the ABCs/hooks we provide; if a plugin needs
  more, widen the generic plugin surface, don't special-case it in core.
+- **Third-party products / other people's projects integrated into the core
+  tree.** Observability backends, vendor SaaS integrations, analytics dashboards,
+  and similar "someone else's product" plugins do NOT land under `plugins/` in
+  this repo. They place an ongoing maintenance burden on us to keep them working
+  against a fast-moving core, for a backend we don't own. Ship them as a
+  **standalone plugin repo** users install into `~/.hermes/plugins/` (or via a
+  pip entry point), and promote them in the Nous Research Discord
+  (`#plugins-skills-and-skins`). This is a coupling-and-maintenance decision, not
+  a quality bar — the plugin can be excellent and still be closed. PRs that add
+  such a directory to the tree are closed with a pointer to publish it as its own
+  repo.

 ### Before you call it a bug — verify the premise (and when NOT to close)

@@ -480,7 +491,7 @@ The dashboard embeds the real `hermes --tui` — **not** a rewrite.  See `hermes

 ### Electron Desktop Chat App (`apps/desktop/`)

-A **separate** chat surface from both the classic CLI and the dashboard's embedded TUI. It is an Electron + React + nanostore renderer (`@assistant-ui/react`) that talks to a `tui_gateway` backend over JSON-RPC (`requestGateway(method, params)`). It does NOT embed `hermes --tui` — it has its own composer, transcript, and slash-command pipeline. Route desktop bugs to the `hermes-desktop-app-work` skill, not `hermes-dashboard-work`.
+A **separate** chat surface from both the classic CLI and the dashboard's embedded TUI. It is an Electron + React + nanostore renderer (`@assistant-ui/react`) that talks to a `tui_gateway` backend over JSON-RPC (`requestGateway(method, params)`). The WebSocket/JSON-RPC transport lives in the framework-agnostic `apps/shared` package (`@hermes/shared` — `JsonRpcGatewayClient` + WS URL helpers), which the web dashboard (`web/`) also consumes; **desktop has no build/runtime dependency on the dashboard frontend** — it spawns a headless `hermes serve` backend server (the same gateway `dashboard` serves, minus the browser UI). `dashboard` and `serve` share `cmd_dashboard`/`start_server` but are independent surfaces — neither launches the other. The one exception is a backward-compat *fallback*: `serve` is newer, so the desktop spawn (`electron/backend-command.cjs` + `backendSupportsServe()` in `main.cjs`) detects whether the resolved runtime registers `serve` and, only when it does not (an older managed install / PATH `hermes` the app hasn't updated yet), rewrites the argv to the legacy `dashboard --no-open`. Without that, a new app against an un-upgraded runtime would crash on an unknown subcommand and brick every mid-upgrade user. It does NOT embed `hermes --tui` — it has its own composer, transcript, and slash-command pipeline. Route desktop bugs to the `hermes-desktop-app-work` skill, not `hermes-dashboard-work`.

 **Slash commands in the desktop app are curated client-side, then dispatched to the backend.** The pipeline:

@@ -783,6 +794,24 @@ landing in this tree. PRs that add a new directory under
 provider as its own repo. Existing in-tree providers stay; bug fixes
 to them are welcome.

+**No new third-party-product plugins in-tree (policy, June 2026):** the
+same rule applies beyond memory providers. Plugins that integrate
+someone else's product or project — observability/metrics backends,
+vendor SaaS connectors, analytics dashboards, paid-service tie-ins —
+must ship as **standalone plugin repos** that users install into
+`~/.hermes/plugins/` (or via pip entry points). They register through
+the existing plugin discovery path and use the ABCs/hooks/ctx surface
+we expose; nothing special is needed in core. The reason is
+maintenance load: every product we absorb into the tree becomes our
+burden to keep working against a fast-moving core, for a backend we
+don't own. Promote standalone plugins in the Nous Research Discord
+(`#plugins-skills-and-skins`). PRs that add such a directory under
+`plugins/` are closed with a pointer to publish it as its own repo —
+this is a coupling decision, not a quality judgment. (The
+`observability/`, `kanban/`, `disk-cleanup/`, etc. directories already
+in the tree are existing precedent, not an invitation to add more
+third-party-product plugins alongside them.)
+
 ### Model-provider plugins (`plugins/model-providers/<name>/`)

 Every inference backend (openrouter, anthropic, gmi, deepseek, nvidia, …)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -85,6 +85,23 @@ This isn't a quality bar — it's a coupling-and-maintenance decision. Memory pr

 ---

+## Third-Party Product Integrations: Ship as a Standalone Plugin
+
+The same rule extends to **any plugin that integrates someone else's product or project** — observability/metrics backends, vendor SaaS connectors, analytics dashboards, paid-service tie-ins, and similar third-party integrations. **These do not land in this repo.**
+
+The reason is maintenance load, not quality. Every external product absorbed into the core tree becomes ours to keep working against a fast-moving codebase, for a backend we don't own and can't control. Hermes ships a lot and the core moves quickly; coupling third-party products into it creates an open-ended burden on the maintainers.
+
+Publish these as a **standalone plugin repo** instead:
+
+- Implement the relevant ABC and use the existing plugin discovery path (`~/.hermes/plugins/`, project `.hermes/plugins/`, or a pip entry point) — see [Build a Hermes Plugin](https://hermes-agent.nousresearch.com/docs/guides/build-a-hermes-plugin)
+- Register lifecycle hooks (`pre_tool_call`, `post_tool_call`, `pre_llm_call`, `post_llm_call`, `on_session_start`, `on_session_end`), tools (`ctx.register_tool`), and CLI subcommands (`ctx.register_cli_command`) through the surface we already expose — no core changes needed
+- If your plugin needs a capability the framework doesn't expose, that's a feature request to **widen the generic plugin surface** (a new hook or `ctx` method) — never special-case your plugin in core
+- Promote it in the [Nous Research Discord](https://discord.gg/NousResearch) `#plugins-skills-and-skins` channel so users can find and install it
+
+A well-built third-party-product plugin can clear automated review and still be closed for this reason — it's a placement decision, not a verdict on the code. PRs that add such a directory under `plugins/` will be closed with a pointer to publish it as its own repo.
+
+---
+
 ## Development Setup

 ### Prerequisites
--- a/Dockerfile
+++ b/Dockerfile
@@ -119,6 +119,9 @@ COPY package.json package-lock.json ./
 COPY web/package.json web/
 COPY ui-tui/package.json ui-tui/
 COPY ui-tui/packages/hermes-ink/ ui-tui/packages/hermes-ink/
+# apps/shared/ is copied IN FULL because web/package.json references it as a
+# `file:` workspace dependency (same pattern as hermes-ink above).
+COPY apps/shared/ apps/shared/

 # `npm_config_install_links=false` forces npm to install `file:` deps as
 # symlinks instead of copies.  This is the default since npm 10+, which is
@@ -184,12 +187,19 @@ RUN uv sync --frozen --no-install-project --extra all --extra messaging --extra
 # invalidate the (relatively slow) web + ui-tui build layer.
 COPY web/ web/
 COPY ui-tui/ ui-tui/
+COPY apps/shared/ apps/shared/
 RUN cd web && npm run build && \
    cd ../ui-tui && npm run build

 # ---------- Source code ----------
 # .dockerignore excludes node_modules, so the installs above survive.
-COPY . .
+# --link decouples this layer from parents for cache purposes; --chmod bakes
+# the final read-only permissions at copy time so we skip the separate
+# `chmod -R` pass that previously walked ~30k files across the venv +
+# node_modules + source (21s amd64 / 222s arm64 — #49113).  `a+rX,go-w`
+# gives the non-root hermes user read + traverse but no write; root retains
+# write so the build steps below don't need chmod u+w dances.
+COPY --link --chmod=a+rX,go-w . .

 # ---------- Permissions ----------
 # Link hermes-agent itself (editable). Deps are already installed in the
@@ -197,19 +207,15 @@ COPY . .
 # resolution or downloads.
 RUN uv pip install --no-cache-dir --no-deps -e "."

-# Keep /opt/hermes immutable for the runtime hermes user. Hosted/container
-# instances must not be able to self-edit the installed source or venv; user
-# data, skills, plugins, config, logs, and dashboard uploads live under
-# /opt/data instead. Root can still repair the image during build/boot, but
-# supervised Hermes processes drop to the non-root hermes user.
+# Wire the exec shim and install-method stamp.  Files under /opt/hermes are
+# already root-owned (COPY, uv sync, npm install all run as root) and
+# read-only for the hermes user (go-w from the --chmod above).
+
 USER root
 RUN mkdir -p /opt/hermes/bin && \
    cp /opt/hermes/docker/hermes-exec-shim.sh /opt/hermes/bin/hermes && \
    chmod 0755 /opt/hermes/bin/hermes && \
-    printf 'docker\n' > /opt/hermes/.install_method && \
-    chown -R root:root /opt/hermes && \
-    chmod -R a+rX /opt/hermes && \
-    chmod -R a-w /opt/hermes
+    printf 'docker\n' > /opt/hermes/.install_method
 # The ``.install_method`` stamp is baked next to the running code (the install
 # tree), NOT into $HERMES_HOME. $HERMES_HOME (/opt/data) is a shared data
 # volume that is commonly bind-mounted from the host and even shared with a
@@ -236,13 +242,11 @@ RUN mkdir -p /opt/hermes/bin && \
 #
 # The arg is optional — local `docker build` without --build-arg simply
 # omits the file, and the runtime falls back to live-git lookup.  CI
-# (.github/workflows/docker-publish.yml) passes ${{ github.sha }} so
+# (.github/workflows/docker.yml) passes ${{ github.sha }} so
 # every published image has it.
 ARG HERMES_GIT_SHA=
 RUN if [ -n "${HERMES_GIT_SHA}" ]; then \
-        chmod u+w /opt/hermes && \
-        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha && \
-        chmod a-w /opt/hermes /opt/hermes/.hermes_build_sha; \
+        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha; \
    fi

 # ---------- s6-overlay service wiring ----------
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@

 **The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.

-Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [NovitaAI](https://novita.ai) (AI-native cloud for Model API, Agent Sandbox, and GPU Cloud), [NVIDIA NIM](https://build.nvidia.com) (Nemotron), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
+Use any model you want — [Nous Portal](https://portal.nousresearch.com), OpenRouter, OpenAI, your own endpoint, and [many others](https://hermes-agent.nousresearch.com/docs/integrations/providers). Switch with `hermes model` — no code changes, no lock-in.

 <table>
 <tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -722,10 +722,50 @@ def init_agent(
    elif agent.provider == "moa":
        from agent.moa_loop import MoAClient
        agent.api_mode = "chat_completions"
-        agent.client = MoAClient(agent.model or "default")
+
+        # Route reference-model outputs to the agent's tool_progress_callback so
+        # every surface that already consumes it (CLI spinner/scrollback, TUI,
+        # desktop, gateway) can show each reference's answer as a labelled block
+        # before the aggregator acts. The facade emits "moa.reference" and
+        # "moa.aggregating" events; we forward them through the same callback
+        # the tool lifecycle uses. Best-effort and cache-safe — these are
+        # display-only events, they never touch the message history.
+        def _moa_reference_relay(event: str, **kwargs: Any) -> None:
+            cb = getattr(agent, "tool_progress_callback", None)
+            if cb is None:
+                return
+            try:
+                if event == "moa.reference":
+                    label = str(kwargs.get("label") or "")
+                    text = str(kwargs.get("text") or "")
+                    idx = kwargs.get("index")
+                    count = kwargs.get("count")
+                    cb(
+                        "moa.reference",
+                        label,
+                        text,
+                        None,
+                        moa_index=idx,
+                        moa_count=count,
+                    )
+                elif event == "moa.aggregating":
+                    cb(
+                        "moa.aggregating",
+                        str(kwargs.get("aggregator") or ""),
+                        None,
+                        None,
+                        moa_ref_count=kwargs.get("ref_count"),
+                    )
+            except Exception:
+                pass
+
+        agent.client = MoAClient(
+            agent.model or "default",
+            reference_callback=_moa_reference_relay,
+        )
        agent._client_kwargs = {}
        agent.api_key = api_key or "moa-virtual-provider"
-        agent.base_url = base_url or "moa://local"
+        agent.base_url = "moa://local"
        if not agent.quiet_mode:
            print(f"🤖 AI Agent initialized with MoA preset: {agent.model}")
    elif agent.api_mode == "bedrock_converse":
@@ -1267,6 +1307,12 @@ def init_agent(
        _agent_section = {}
    agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")

+    # Intent-ack continuation config: "auto" (default — codex_responses only,
+    # the historical gate), true (all api_modes), false (never), or a list of
+    # model-name substrings.  Resolved against the active api_mode/model in the
+    # conversation loop's intent-ack block.
+    agent._intent_ack_continuation = _agent_section.get("intent_ack_continuation", "auto")
+
    # Universal task-completion guidance toggle.  Default True.  Surfaced
    # as a separate flag from tool_use_enforcement because the guidance
    # applies to ALL models, not just the model families enforcement
@@ -1630,8 +1676,10 @@ def init_agent(
            f"Model {agent.model} has a context window of {_ctx:,} tokens, "
            f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
            f"by Hermes Agent.  Choose a model with at least "
-            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
-            f"model.context_length in config.yaml to override."
+            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context.  If your server "
+            f"reports a window smaller than the model's true window, set "
+            f"model.context_length in config.yaml to the real value "
+            f"(this must be at least {MINIMUM_CONTEXT_LENGTH // 1000}K)."
        )

    # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@@ -42,6 +42,14 @@ from utils import base_url_host_matches, base_url_hostname, env_var_enabled, ato
 logger = logging.getLogger(__name__)


+# Max consecutive successful credential-pool token refreshes of the SAME entry
+# on a persistent auth failure before we give up and let the fallback chain
+# activate. A single-entry OAuth pool can re-mint a fresh token indefinitely
+# even when the upstream keeps rejecting it, so without this cap the retry loop
+# spins forever and never reaches ``_try_activate_fallback``. See #26080.
+_MAX_AUTH_REFRESH_ATTEMPTS = 2
+
+
 def _ra():
    """Lazy ``run_agent`` reference for test-patch routing."""
    import run_agent
@@ -775,6 +783,30 @@ def recover_with_credential_pool(
            return False, has_retried_429
        refreshed = pool.try_refresh_current()
        if refreshed is not None:
+            # ``try_refresh_current()`` re-mints a fresh OAuth token and reports
+            # success even when the upstream keeps rejecting it — a single-entry
+            # pool (common for OAuth/Max subscribers) has nothing to rotate to,
+            # so a bare "refreshed → retry" loop spins forever on the same dead
+            # token and the configured fallback never activates. Cap consecutive
+            # same-entry refreshes and fall through to fallback once exceeded.
+            # See #26080.
+            refreshed_id = getattr(refreshed, "id", None)
+            if refreshed_id is not None:
+                refresh_counts = getattr(agent, "_auth_pool_refresh_counts", None)
+                if refresh_counts is None:
+                    refresh_counts = {}
+                    agent._auth_pool_refresh_counts = refresh_counts
+                refresh_key = (agent.provider, refreshed_id)
+                refresh_counts[refresh_key] = refresh_counts.get(refresh_key, 0) + 1
+                if refresh_counts[refresh_key] > _MAX_AUTH_REFRESH_ATTEMPTS:
+                    _ra().logger.warning(
+                        "Credential auth failure persists after %s refreshes for "
+                        "pool entry %s — treating as unrecoverable and allowing "
+                        "fallback to activate.",
+                        refresh_counts[refresh_key] - 1,
+                        refreshed_id,
+                    )
+                    return False, has_retried_429
            _ra().logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}")
            agent._swap_credential(refreshed)
            return True, has_retried_429
@@ -1046,6 +1078,34 @@ def restore_primary_runtime(agent) -> bool:
            api_mode=rt.get("compressor_api_mode", ""),
        )

+        # ── Re-select from the credential pool if one is available ──
+        # The snapshot's api_key was captured at construction time.  Across
+        # turns the pool may have rotated (token revocation, billing/rate-limit
+        # exhaustion, cooldown), leaving the snapshot key stale.  Restoring it
+        # blindly re-fails on the first request and burns through the remaining
+        # pool entries before cross-provider fallback even gets a chance.  Ask
+        # the pool for its current best entry and swap the live credential in.
+        # When the pool is absent, empty, or the entry has no usable key, we
+        # keep the snapshot key (the existing behavior).  Fixes #25205.
+        pool = getattr(agent, "_credential_pool", None)
+        if pool is not None and pool.has_available():
+            entry = pool.select()
+            if entry is not None:
+                entry_key = (
+                    getattr(entry, "runtime_api_key", None)
+                    or getattr(entry, "access_token", "")
+                )
+                if entry_key:
+                    # ``_swap_credential`` rebuilds the OpenAI/Anthropic client,
+                    # reapplies base-url-scoped headers, and carries the
+                    # accumulated base_url / OAuth-detection fixes (#33163).
+                    agent._swap_credential(entry)
+                    logger.info(
+                        "Restore re-selected pool entry %s (%s)",
+                        getattr(entry, "id", "?"),
+                        getattr(entry, "label", "?"),
+                    )
+
        # ── Reset fallback chain for the new turn ──
        agent._fallback_activated = False
        agent._fallback_index = 0
@@ -1221,7 +1281,11 @@ def dump_api_request_debug(
            dump_payload["error"] = error_info

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-        dump_file = agent.logs_dir / f"request_dump_{agent.session_id}_{timestamp}.json"
+        # Sanitize the session ID into a traversal-free path segment — it can
+        # originate from untrusted input (X-Hermes-Session-Id header), and an
+        # unsanitized "../"-shaped ID would write the dump outside logs_dir.
+        safe_sid = _ra()._safe_session_filename_component(agent.session_id)
+        dump_file = agent.logs_dir / f"request_dump_{safe_sid}_{timestamp}.json"

        # Redact secrets before persisting/printing. This dump captures the
        # full request body (system prompt, tool defs, context-embedded
@@ -1420,6 +1484,15 @@ def create_openai_client(agent, client_kwargs: dict, *, reason: str, shared: boo
        keepalive_http = agent._build_keepalive_http_client(client_kwargs.get("base_url", ""))
        if keepalive_http is not None:
            client_kwargs["http_client"] = keepalive_http
+    # Delegate all rate-limit / 5xx retry to hermes's outer conversation loop,
+    # which honors Retry-After and applies adaptive/jittered backoff. The OpenAI
+    # SDK default (max_retries=2) uses its own 1-2s backoff that ignores
+    # Retry-After and double-retries inside our loop — the same deadlock the
+    # Anthropic clients hit (#26293). This is the single chokepoint every primary
+    # OpenAI/aggregator client passes through (init, switch_model, recovery,
+    # restore, request-scoped); auxiliary_client builds its own clients and keeps
+    # SDK retries because it is NOT wrapped by the conversation loop.
+    client_kwargs.setdefault("max_retries", 0)
    # Uses the module-level `OpenAI` name, resolved lazily on first
    # access via __getattr__ below. Tests patch via `run_agent.OpenAI`.
    client = _ra().OpenAI(**client_kwargs)
@@ -1499,6 +1572,10 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
    # _client_kwargs is a dict — snapshot a shallow copy so mutating the
    # live dict doesn't poison the rollback target.
    _snapshot["_client_kwargs"] = dict(getattr(agent, "_client_kwargs", {}) or {})
+    # Snapshot the credential pool reference so a failed client rebuild can
+    # restore the original pool (issue #52727: pool reload is part of this
+    # switch and must be reversible on rollback).
+    _snapshot["_credential_pool"] = getattr(agent, "_credential_pool", _MISSING)

    try:
        # Clear the per-config context_length override so the new model's
@@ -1523,8 +1600,36 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
        if api_key:
            agent.api_key = api_key

+        # ── Reload credential pool for the new provider (issue #52727) ──
+        # Without this, ``recover_with_credential_pool`` sees a
+        # ``pool.provider != agent.provider`` mismatch and short-circuits,
+        # leaving the new provider with no rotation/recovery on 401/429 and
+        # burning the original pool's entries. Only reload when the provider
+        # actually changed (or the pool was missing) — re-selecting the same
+        # provider must not churn the pool reference. A reload failure is
+        # logged + swallowed: the switch itself must still complete.
+        old_norm = (old_provider or "").strip().lower()
+        new_norm = (new_provider or "").strip().lower()
+        if old_norm != new_norm or getattr(agent, "_credential_pool", None) is None:
+            try:
+                from agent.credential_pool import load_pool
+                agent._credential_pool = load_pool(new_provider)
+            except Exception as _pool_exc:  # noqa: BLE001
+                logger.warning(
+                    "switch_model: credential pool reload failed for %s (%s); "
+                    "continuing without pool rotation this turn",
+                    new_provider, _pool_exc,
+                )
+
        # ── Build new client ──
-        if api_mode == "anthropic_messages":
+        if (new_provider or "").strip().lower() == "moa":
+            from agent.moa_loop import MoAClient
+
+            agent.api_key = api_key or "moa-virtual-provider"
+            agent.base_url = "moa://local"
+            agent._client_kwargs = {}
+            agent.client = MoAClient(agent.model or "default")
+        elif api_mode == "anthropic_messages":
            from agent.anthropic_adapter import (
                build_anthropic_client,
                resolve_anthropic_token,
@@ -2104,8 +2209,21 @@ def looks_like_codex_intermediate_ack(
    user_message: str,
    assistant_content: str,
    messages: List[Dict[str, Any]],
+    require_workspace: bool = True,
 ) -> bool:
-    """Detect a planning/ack message that should continue instead of ending the turn."""
+    """Detect a planning/ack message that should continue instead of ending the turn.
+
+    ``require_workspace`` (default True) keeps the original codex-coding scope:
+    the ack must reference a filesystem/repo workspace. The conversation loop
+    passes ``require_workspace=False`` when the user has explicitly opted into
+    intent-ack continuation for all api_modes (``agent.intent_ack_continuation``
+    is ``true`` or a model-list), so general autonomous workflows ("I'll run a
+    health check on the server", "I'll start the deployment") — which carry a
+    future-ack and an action verb but no filesystem reference — are caught too.
+    The future-ack + short-content + no-prior-tools + action-verb requirements
+    always apply, which is what keeps conversational "I'll help you brainstorm"
+    replies from tripping it.
+    """
    if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
        return False

@@ -2158,17 +2276,67 @@ def looks_like_codex_intermediate_ack(
        "path",
    )

+    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
+    if not assistant_mentions_action:
+        return False
+
+    # Opted-in (all-api_mode) path: a future-ack + action verb + no prior tool
+    # call is enough — the user asked us to keep going when the model only
+    # announces intent, regardless of whether a filesystem is involved.
+    if not require_workspace:
+        return True
+
    user_text = (user_message or "").strip().lower()
    user_targets_workspace = (
        any(marker in user_text for marker in workspace_markers)
        or "~/" in user_text
        or "/" in user_text
    )
-    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
    assistant_targets_workspace = any(
        marker in assistant_text for marker in workspace_markers
    )
-    return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action
+    return user_targets_workspace or assistant_targets_workspace
+
+
+def intent_ack_continuation_mode(agent) -> str:
+    """Classify the resolved intent-ack continuation mode for this turn.
+
+    Returns one of:
+      * ``"off"``        — never continue.
+      * ``"codex_only"`` — historical scope: continue only on the
+        ``codex_responses`` api_mode, and only for codebase/workspace acks
+        (``require_workspace=True``).
+      * ``"all"``        — user opted in for every api_mode; continue on any
+        future-ack + action verb (``require_workspace=False``).
+
+    Mirrors the four-mode shape of ``agent.tool_use_enforcement``: ``"auto"``
+    (default) → codex_only; ``True``/"true"/"always"/"yes"/"on" → all;
+    ``False``/"false"/"never"/"no"/"off" → off; ``list`` → all when a non-blank
+    entry is a substring of the active model name, else off.
+    """
+    mode = getattr(agent, "_intent_ack_continuation", "auto")
+    flag = mode.strip().lower() if isinstance(mode, str) else mode  # tolerate padded strings like " True "
+    if flag is True or (isinstance(flag, str) and flag in {"true", "always", "yes", "on"}):
+        return "all"
+    if flag is False or (isinstance(flag, str) and flag in {"false", "never", "no", "off"}):
+        return "off"
+    if isinstance(mode, list):
+        model_lower = (agent.model or "").lower()  # blank entries are skipped below: "" is in every string
+        return "all" if any(p.strip() and p.strip().lower() in model_lower for p in mode if isinstance(p, str)) else "off"
+    # "auto" or any unrecognised value — historical codex-only behavior.
+    return "codex_only" if agent.api_mode == "codex_responses" else "off"
+
+
+def intent_ack_continuation_enabled(agent) -> bool:
+    """Report whether intent-ack continuation is switched on for this turn.
+
+    Only the on/off gate is decided here; the caller still applies the
+    ``codex_ack_continuations < 2`` cap and ``looks_like_codex_intermediate_ack``.
+    Use ``intent_ack_continuation_mode`` directly to learn whether the workspace
+    requirement applies (``"codex_only"`` ⇒ True, ``"all"`` ⇒ False).
+    """
+    resolved_mode = intent_ack_continuation_mode(agent)
+    return resolved_mode != "off"



--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook(
    kwargs = {
        "timeout": timeout_obj,
        "http_client": http_client,
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        "max_retries": 0,
        # The SDK requires *something* for api_key/auth_token. Our
        # event hook overrides Authorization per request so this value
        # is never sent. The sentinel string makes accidental leaks
@@ -757,6 +760,12 @@ def build_anthropic_client(
    _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
    kwargs = {
        "timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
+        # Delegate all rate-limit / 5xx retry to hermes's outer conversation
+        # loop, which honors Retry-After. The SDK default (max_retries=2) uses
+        # its own 1-2s backoff that ignores Retry-After and double-retries
+        # inside our loop — burning request slots against a bucket that won't
+        # refill for minutes. (#26293)
+        "max_retries": 0,
    }
    if normalized_base_url:
        # Azure Anthropic endpoints require an ``api-version`` query parameter.
@@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str):
    return _anthropic_sdk.AnthropicBedrock(
        aws_region=region,
        timeout=Timeout(timeout=900.0, connect=10.0),
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        max_retries=0,
        default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
    )

@@ -914,44 +926,72 @@ def _read_claude_code_credentials_from_keychain() -> Optional[Dict[str, Any]]:
    return None


+def _read_claude_code_credentials_from_file() -> Optional[Dict[str, Any]]:
+    """Read Claude Code OAuth credentials from ~/.claude/.credentials.json.
+
+    Returns dict with {accessToken, refreshToken?, expiresAt?, source}, or None
+    if the file is missing, unreadable, malformed, or lacks an access token.
+    """
+    cred_path = Path.home() / ".claude" / ".credentials.json"
+    if not cred_path.exists():
+        return None
+    try:
+        data = json.loads(cred_path.read_text(encoding="utf-8"))
+    except (ValueError, OSError) as e:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)
+        return None
+    # Valid JSON need not be an object ([] / null / "x"); treat that as "no OAuth section".
+    oauth_data = data.get("claudeAiOauth") if isinstance(data, dict) else None
+    access_token = oauth_data.get("accessToken", "") if isinstance(oauth_data, dict) else ""
+    if not access_token:
+        return None
+    return {
+        "accessToken": access_token,
+        "refreshToken": oauth_data.get("refreshToken", ""),
+        "expiresAt": oauth_data.get("expiresAt", 0),
+        "source": "claude_code_credentials_file",
+    }
+
+
 def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    """Read refreshable Claude Code OAuth credentials.

-    Checks two sources in order:
+    Reads from two possible sources and reconciles them:
      1. macOS Keychain (Darwin only) — "Claude Code-credentials" entry
      2. ~/.claude/.credentials.json file

+    Selection rules when both are present:
+      - If exactly one is non-expired, prefer that one. (Handles the case
+        where Claude Code refreshes one source but not the other — observed
+        in the wild on Claude Code 2.1.x.)
+      - Otherwise, prefer the source with the later ``expiresAt`` so that
+        any subsequent refresh uses the most recent ``refreshToken``.
+
    This intentionally excludes ~/.claude.json primaryApiKey. Opencode's
    subscription flow is OAuth/setup-token based with refreshable credentials,
    and native direct Anthropic provider usage should follow that path rather
    than auto-detecting Claude's first-party managed key.

-    Returns dict with {accessToken, refreshToken?, expiresAt?} or None.
+    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
    """
-    # Try macOS Keychain first (covers Claude Code >=2.1.114)
    kc_creds = _read_claude_code_credentials_from_keychain()
-    if kc_creds:
-        return kc_creds
+    file_creds = _read_claude_code_credentials_from_file()

-    # Fall back to JSON file
-    cred_path = Path.home() / ".claude" / ".credentials.json"
-    if cred_path.exists():
-        try:
-            data = json.loads(cred_path.read_text(encoding="utf-8"))
-            oauth_data = data.get("claudeAiOauth")
-            if oauth_data and isinstance(oauth_data, dict):
-                access_token = oauth_data.get("accessToken", "")
-                if access_token:
-                    return {
-                        "accessToken": access_token,
-                        "refreshToken": oauth_data.get("refreshToken", ""),
-                        "expiresAt": oauth_data.get("expiresAt", 0),
-                        "source": "claude_code_credentials_file",
-                    }
-        except (json.JSONDecodeError, OSError, IOError) as e:
-            logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)
+    if kc_creds and file_creds:
+        kc_valid = is_claude_code_token_valid(kc_creds)
+        file_valid = is_claude_code_token_valid(file_creds)
+        if kc_valid and not file_valid:
+            return kc_creds
+        if file_valid and not kc_valid:
+            return file_creds
+        # Both valid or both expired: prefer the later expiresAt so the
+        # downstream refresh path uses the freshest refresh_token.
+        kc_exp = kc_creds.get("expiresAt", 0) or 0
+        file_exp = file_creds.get("expiresAt", 0) or 0
+        return kc_creds if kc_exp >= file_exp else file_creds

-    return None
+    return kc_creds or file_creds


 def is_claude_code_token_valid(creds: Dict[str, Any]) -> bool:
@@ -1034,8 +1074,40 @@ def refresh_anthropic_oauth_pure(refresh_token: str, *, use_json: bool = False)


 def _refresh_oauth_token(creds: Dict[str, Any]) -> Optional[str]:
-    """Attempt to refresh an expired Claude Code OAuth token."""
-    refresh_token = creds.get("refreshToken", "")
+    """Attempt to refresh an expired Claude Code OAuth token.
+
+    Claude Code's OAuth refresh tokens are single-use: a successful refresh
+    rotates the pair and invalidates the old refresh token. Claude Code itself
+    also refreshes on its own schedule (IDE/CLI activity), so by the time
+    Hermes notices an expired token, Claude Code may have already rotated it.
+    POSTing our now-stale refresh token in that window races Claude Code and
+    fails with ``invalid_grant``.
+
+    So before refreshing, re-read the live credential sources. If Claude Code
+    has already produced a valid token, adopt it and skip the POST entirely.
+    Only fall back to refreshing ourselves when no fresh credential is found.
+    """
+    # Claude Code may have already refreshed — adopt its token rather than
+    # racing it with our (possibly already-rotated) refresh token. Only adopt
+    # when the live re-read produced a DIFFERENT token with a real future
+    # expiry: re-adopting the same credential we were just handed would be a
+    # no-op, and a 0/absent ``expiresAt`` means "managed key / unknown expiry"
+    # (see is_claude_code_token_valid) which must NOT be treated as a fresh
+    # refresh here.
+    current = read_claude_code_credentials()
+    if current:
+        current_token = current.get("accessToken", "")
+        current_exp = current.get("expiresAt", 0) or 0
+        if (
+            current_token
+            and current_token != creds.get("accessToken", "")
+            and current_exp > 0
+            and is_claude_code_token_valid(current)
+        ):
+            logger.debug("Adopted Claude Code's already-refreshed OAuth token")
+            return current_token
+
+    refresh_token = (current or {}).get("refreshToken", "") or creds.get("refreshToken", "")
    if not refresh_token:
        logger.debug("No refresh token available — cannot refresh")
        return None
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -102,6 +102,7 @@ OpenAI = _OpenAIProxy()  # module-level name, resolves lazily on call/isinstance

 from agent.credential_pool import load_pool
 from agent.model_metadata import MINIMUM_CONTEXT_LENGTH, get_model_context_length
+from agent.process_bootstrap import build_keepalive_http_client
 from hermes_cli.config import get_hermes_home
 from hermes_constants import OPENROUTER_BASE_URL
 from utils import base_url_host_matches, base_url_hostname, env_float, model_forces_max_completion_tokens, normalize_proxy_env_vars
@@ -109,6 +110,23 @@ from utils import base_url_host_matches, base_url_hostname, env_float, model_for
 logger = logging.getLogger(__name__)


+def _openai_http_client_kwargs(base_url: Optional[str], *, async_mode: bool = False) -> Dict[str, Any]:
+    """Return ``{"http_client": client}`` for the keepalive httpx client, or ``{}`` when none is built.
+
+    The client uses an env-only proxy (not the macOS system proxy).
+    """
+    client = build_keepalive_http_client(str(base_url or ""), async_mode=async_mode)
+    return {} if client is None else {"http_client": client}
+
+
+def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
+    """Construct ``OpenAI`` with the keepalive http client unless the caller passes ``http_client``."""
+    # A caller-supplied http_client always wins, so don't build one only to discard it.
+    if "http_client" not in kwargs:
+        kwargs = {**_openai_http_client_kwargs(base_url), **kwargs}
+    return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
+
+
 # ── Interrupt protection for atomic auxiliary tasks ──────────────────────
 # Some auxiliary tasks must NOT be aborted mid-flight by a gateway interrupt
 # (e.g. an incoming user message while the agent is busy). Context
@@ -1614,7 +1632,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
            _merged_aux = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_aux:
                extra["default_headers"] = _merged_aux
-            _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
+            _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
            _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
            return _client, model

@@ -1654,7 +1672,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
        _merged_aux2 = _apply_user_default_headers(extra.get("default_headers"))
        if _merged_aux2:
            extra["default_headers"] = _merged_aux2
-        _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
+        _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
        _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
        return _client, model

@@ -1669,20 +1687,21 @@ def _try_openrouter(explicit_api_key: str = None, model: str = None) -> Tuple[Op
    pool_present, entry = _select_pool_entry("openrouter")
    if pool_present:
        or_key = explicit_api_key or _pool_runtime_api_key(entry)
-        if not or_key:
-            _mark_provider_unhealthy("openrouter", ttl=60)
-            return None, None
-        base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
-        logger.debug("Auxiliary client: OpenRouter via pool")
-        return OpenAI(api_key=or_key, base_url=base_url,
-                       default_headers=build_or_headers()), model or _OPENROUTER_MODEL
+        if or_key:
+            base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
+            logger.debug("Auxiliary client: OpenRouter via pool")
+            return _create_openai_client(api_key=or_key, base_url=base_url,
+                           default_headers=build_or_headers()), model or _OPENROUTER_MODEL
+        # Pool exists but is exhausted (no usable runtime key) — fall through to
+        # the OPENROUTER_API_KEY env-var path rather than failing outright.
+        logger.debug("Auxiliary client: OpenRouter pool exhausted, trying OPENROUTER_API_KEY")

    or_key = explicit_api_key or os.getenv("OPENROUTER_API_KEY")
    if not or_key:
        _mark_provider_unhealthy("openrouter", ttl=60)
        return None, None
    logger.debug("Auxiliary client: OpenRouter")
-    return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
+    return _create_openai_client(api_key=or_key, base_url=OPENROUTER_BASE_URL,
                   default_headers=build_or_headers()), model or _OPENROUTER_MODEL


@@ -1775,7 +1794,7 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
            return None, None
        base_url = str((nous or {}).get("inference_base_url") or _nous_base_url()).rstrip("/")
    return (
-        OpenAI(
+        _create_openai_client(
            api_key=api_key,
            base_url=base_url,
        ),
@@ -2052,7 +2071,7 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
    if _custom_headers:
        _extra["default_headers"] = _custom_headers
    if custom_mode == "codex_responses":
-        real_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
+        real_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
        return CodexAuxiliaryClient(real_client, model), model
    if custom_mode == "anthropic_messages":
        # Third-party Anthropic-compatible gateway (MiniMax, Zhipu GLM,
@@ -2066,14 +2085,14 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
                "Custom endpoint declares api_mode=anthropic_messages but the "
                "anthropic SDK is not installed — falling back to OpenAI-wire."
            )
-            return OpenAI(api_key=custom_key, base_url=_clean_base, **_extra), model
+            return _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra), model
        return (
            AnthropicAuxiliaryClient(real_client, model, custom_key, custom_base, is_oauth=False),
            model,
        )
    # URL-based anthropic detection for custom endpoints that didn't set
    # api_mode explicitly (e.g. kimi.com/coding reached via custom config).
-    _fallback_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
+    _fallback_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
    _fallback_client = _maybe_wrap_anthropic(
        _fallback_client, model, custom_key, custom_base, custom_mode,
    )
@@ -2102,7 +2121,7 @@ def _build_xai_oauth_aux_client(model: str) -> Tuple[Optional[Any], Optional[str
        return None, None
    api_key, base_url = resolved
    logger.debug("Auxiliary client: xAI OAuth (%s via Responses API)", model)
-    real_client = OpenAI(api_key=api_key, base_url=base_url)
+    real_client = _create_openai_client(api_key=api_key, base_url=base_url)
    return CodexAuxiliaryClient(real_client, model), model


@@ -2139,7 +2158,7 @@ def _build_codex_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
            return None, None
        base_url = _CODEX_AUX_BASE_URL
    logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", model)
-    real_client = OpenAI(
+    real_client = _create_openai_client(
        api_key=codex_token,
        base_url=base_url,
        default_headers=_codex_cloudflare_headers(codex_token),
@@ -2239,7 +2258,7 @@ def _try_azure_foundry(
    if _dq:
        extra["default_query"] = _dq

-    client = OpenAI(api_key=api_key, base_url=_clean_base, **extra)
+    client = _create_openai_client(api_key=api_key, base_url=_clean_base, **extra)

    if runtime_api_mode == "codex_responses":
        # GPT-5.x / o-series / codex models on Azure Foundry are
@@ -3624,6 +3643,37 @@ def _resolve_auto(
    # config.yaml (auxiliary.<task>.provider) still win over this.
    main_provider = str(runtime_provider or _read_main_provider() or "")
    main_model = str(runtime_model or _read_main_model() or "")
+
+    # MoA virtual provider: the "model" is a preset name (e.g. "opus-gpt") and
+    # there is no real "moa" HTTP endpoint, so resolving an aux client against
+    # provider="moa"/model=<preset> sends the preset name as the model id and
+    # the provider 400s ("opus-gpt is not a valid model ID"). Auxiliary tasks
+    # (title generation, compression, vision, …) don't need the reference
+    # fan-out — they should run on the aggregator, which is the preset's acting
+    # model. Resolve the MoA preset to its aggregator slot and continue Step 1
+    # with that real provider+model. Mirrors the MoA context-length resolution.
+    if main_provider == "moa":
+        try:
+            from hermes_cli.config import load_config
+            from hermes_cli.moa_config import resolve_moa_preset
+
+            _preset = resolve_moa_preset(load_config().get("moa") or {}, main_model)
+            _agg = _preset.get("aggregator") or {}
+            _agg_provider = str(_agg.get("provider") or "").strip()
+            _agg_model = str(_agg.get("model") or "").strip()
+            if _agg_provider and _agg_model and _agg_provider.lower() != "moa":
+                main_provider = _agg_provider
+                main_model = _agg_model
+                # The MoA virtual runtime carries a non-HTTP base_url
+                # ("moa://local") and a placeholder api_key; they belong to the
+                # facade, not the aggregator's real provider. Drop them so the
+                # aggregator resolves through its own provider credentials.
+                runtime_base_url = ""
+                runtime_api_key = ""
+                runtime_api_mode = ""
+        except Exception:
+            logger.debug("MoA aux resolution to aggregator failed", exc_info=True)
+
    if (main_provider and main_model
            and main_provider not in {"auto", ""}):
        resolved_provider = main_provider
@@ -3770,6 +3820,10 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
    _merged_async = _apply_user_default_headers(async_kwargs.get("default_headers"))
    if _merged_async:
        async_kwargs["default_headers"] = _merged_async
+    async_kwargs = {
+        **_openai_http_client_kwargs(sync_base_url, async_mode=True),
+        **async_kwargs,
+    }
    return AsyncOpenAI(**async_kwargs), model


@@ -3980,7 +4034,7 @@ def resolve_provider_client(
                               "but no Codex OAuth token found (run: hermes model)")
                return None, None
            final_model = _normalize_resolved_model(model, provider)
-            raw_client = OpenAI(
+            raw_client = _create_openai_client(
                api_key=codex_token,
                base_url=_CODEX_AUX_BASE_URL,
                default_headers=_codex_cloudflare_headers(codex_token),
@@ -4061,7 +4115,7 @@ def resolve_provider_client(
            _merged_custom = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_custom:
                extra["default_headers"] = _merged_custom
-            client = OpenAI(api_key=custom_key, base_url=_clean_base, **extra)
+            client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **extra)
            client = _wrap_if_needed(client, final_model, custom_base, custom_key)
            return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                    else (client, final_model))
@@ -4165,7 +4219,7 @@ def resolve_provider_client(
                        _fb_headers = _apply_user_default_headers(_fb_extra.get("default_headers"))
                        if _fb_headers:
                            _fb_extra["default_headers"] = _fb_headers
-                        client = OpenAI(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
+                        client = _create_openai_client(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
                        return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                                else (client, final_model))
                    sync_anthropic = AnthropicAuxiliaryClient(
@@ -4174,7 +4228,7 @@ def resolve_provider_client(
                    if async_mode:
                        return AsyncAnthropicAuxiliaryClient(sync_anthropic), final_model
                    return sync_anthropic, final_model
-                client = OpenAI(api_key=custom_key, base_url=_clean_base2, **_extra2)
+                client = _create_openai_client(api_key=custom_key, base_url=_clean_base2, **_extra2)
                # codex_responses or inherited auto-detect (via _wrap_if_needed).
                # _wrap_if_needed reads the closed-over `api_mode` (the task-level
                # override). Named-provider entry api_mode=codex_responses also
@@ -4316,7 +4370,7 @@ def resolve_provider_client(
        _merged_main = _apply_user_default_headers(headers)
        if _merged_main:
            headers = _merged_main
-        client = OpenAI(api_key=api_key, base_url=base_url,
+        client = _create_openai_client(api_key=api_key, base_url=base_url,
                        **({"default_headers": headers} if headers else {}))

        # Copilot GPT-5+ models (except gpt-5-mini) require the Responses
@@ -4852,7 +4906,7 @@ def _refresh_nous_auxiliary_client(
        return None, model

    fresh_key, fresh_base_url = runtime
-    sync_client = OpenAI(api_key=fresh_key, base_url=fresh_base_url)
+    sync_client = _create_openai_client(api_key=fresh_key, base_url=fresh_base_url)
    final_model = model

    current_loop = None
@@ -5435,10 +5489,24 @@ def _build_call_kwargs(
        # ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where
        # max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only
        # there.
+        #
+        # NVIDIA NIM (integrate.api.nvidia.com, or any endpoint under an NVIDIA provider alias) is a
+        # second exception: some models—notably minimaxai/minimax-m3—return HTTP
+        # 200 with an empty choices[] payload when max_tokens is omitted. The main
+        # NVIDIA chat path already sends an output cap via the provider profile;
+        # preserve it on the auxiliary path too.
        _effective_base = base_url or (
            _current_custom_base_url() if provider == "custom" else ""
        )
-        if _is_anthropic_compat_endpoint(provider, _effective_base):
+        _provider_norm = str(provider or "").strip().lower()
+        _is_nvidia_nim = (
+            _provider_norm in {"nvidia", "nvidia-nim", "nim", "build-nvidia", "nemotron"}
+            or base_url_host_matches(_effective_base, "integrate.api.nvidia.com")
+        )
+        if (
+            _is_anthropic_compat_endpoint(provider, _effective_base)
+            or _is_nvidia_nim
+        ):
            kwargs["max_tokens"] = max_tokens

    if tools:
@@ -5962,8 +6030,17 @@ def call_llm(
        # When the provider returns a 429 rate-limit (not billing), fall
        # back to an alternative provider instead of exhausting retries
        # against the same rate-limited endpoint.
+        #
+        # ── Auth error fallback (#21165) ─────────────────────────────
+        # When the resolved provider returns 401 and neither the Nous
+        # refresh path nor explicit provider credential refresh applies,
+        # fall back to an alternative provider instead of dropping the
+        # auxiliary task on the floor (silent compression failure /
+        # message loss). Auth is NOT a capacity error: it triggers fallback
+        # only in auto mode and never overrides an explicitly chosen provider.
        should_fallback = (
-            _is_payment_error(first_err)
+            _is_auth_error(first_err)
+            or _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
            or _is_model_incompatible_error(first_err)
@@ -5993,7 +6070,9 @@ def call_llm(
            or _is_invalid_aux_response_error(first_err)
        )
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_payment_error(first_err):
+            if _is_auth_error(first_err):
+                reason = "auth error"
+            elif _is_payment_error(first_err):
                reason = "payment error"
                # Resolve the actual provider label (resolved_provider may be
                # "auto"; the client's base_url tells us which backend got the
@@ -6442,8 +6521,13 @@ async def async_call_llm(
                        raise

        # ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ──
+        # Auth error fallback (#21165): a 401 that survived the refresh path
+        # falls back in auto mode just like the sync call_llm() path. Auth is
+        # NOT a capacity error, so on an explicit provider it still respects
+        # the user's choice (handled by the is_auto/is_capacity_error gate).
        should_fallback = (
-            _is_payment_error(first_err)
+            _is_auth_error(first_err)
+            or _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
            or _is_model_incompatible_error(first_err)
@@ -6465,7 +6549,9 @@ async def async_call_llm(
            or _is_invalid_aux_response_error(first_err)
        )
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_payment_error(first_err):
+            if _is_auth_error(first_err):
+                reason = "auth error"
+            elif _is_payment_error(first_err):
                reason = "payment error"
                _mark_provider_unhealthy(
                    _recoverable_pool_provider(resolved_provider, client) or resolved_provider
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -28,6 +28,7 @@ from typing import Any, Dict, Optional
 from hermes_cli.timeouts import get_provider_request_timeout, get_provider_stale_timeout
 from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
 from agent.error_classifier import FailoverReason
+from agent.gemini_native_adapter import is_native_gemini_base_url
 from agent.model_metadata import is_local_endpoint
 from agent.message_sanitization import (
    _sanitize_surrogates,
@@ -37,6 +38,18 @@ from tools.terminal_tool import is_persistent_env
 from utils import base_url_host_matches, base_url_hostname, env_float, env_int

 logger = logging.getLogger(__name__)
+_OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}
+
+# When the fallback chain is fully exhausted on a non-rate-limit failure
+# (e.g. every provider returns a non-retryable client error like HTTP 400),
+# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
+# and does not reset _fallback_index=0 to replay the entire chain again.
+# Without this, a client/gateway that re-submits immediately would re-marshal
+# the full (potentially 80k-token) context once per provider every turn and
+# can drive a constrained host into memory/swap exhaustion.  Rate-limit /
+# billing reasons keep their own 60s cooldown (set above); this is the
+# narrower non-rate-limit case.  See issue #24996.
+_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0


 def _ra():
@@ -115,6 +128,23 @@ def _is_openai_codex_backend(agent) -> bool:
    )


+def _validated_openrouter_provider_sort(raw_sort: Any) -> Optional[str]:
+    """Normalize an OpenRouter ``provider.sort`` preference.
+
+    Returns the trimmed, lower-cased value when it is an allowed sort key;
+    returns None for non-strings and blanks, and warns on any other string.
+    """
+    candidate = raw_sort.strip().lower() if isinstance(raw_sort, str) else ""
+    if candidate and candidate not in _OPENROUTER_PROVIDER_SORT_VALUES:
+        logger.warning(
+            "Ignoring invalid OpenRouter provider.sort value %r (allowed: %s)",
+            raw_sort,
+            ", ".join(sorted(_OPENROUTER_PROVIDER_SORT_VALUES)),
+        )
+        return None
+    return candidate or None
+
+
 def _env_float(name: str, default: float) -> float:
    try:
        return float(os.getenv(name, str(default)))
@@ -229,6 +259,11 @@ def interruptible_api_call(agent, api_kwargs: dict):
                        invalidate_runtime_client(region)
                    raise
                result["response"] = normalize_converse_response(raw_response)
+            elif agent.provider == "moa":
+                # MoA is a virtual chat-completions provider backed by the
+                # in-process MoAClient facade. Do not rebuild a request-local
+                # OpenAI client from the virtual runtime metadata.
+                result["response"] = agent.client.chat.completions.create(**api_kwargs)
            else:
                request_client = _set_request_client(
                    agent._create_request_openai_client(
@@ -698,8 +733,9 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
        _prefs["ignore"] = agent.providers_ignored
    if agent.providers_order:
        _prefs["order"] = agent.providers_order
-    if agent.provider_sort:
-        _prefs["sort"] = agent.provider_sort
+    _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
+    if _provider_sort:
+        _prefs["sort"] = _provider_sort
    if agent.provider_require_parameters:
        _prefs["require_parameters"] = True
    if agent.provider_data_collection:
@@ -1015,18 +1051,23 @@ def build_assistant_message(agent, assistant_message, finish_reason: str) -> dic
                    "arguments": tool_call.function.arguments
                },
            }
-            # Defence-in-depth: redact credentials from tool call arguments
-            # before they enter conversation history. Tool execution uses the
-            # raw API response object, not this dict, so redacting the
-            # persisted shape is safe and only affects storage. Catches the
-            # case where a model accidentally inlines a secret into a tool
-            # call (e.g. `terminal(command="curl -H 'Authorization: Bearer
-            # sk-...'")`). (#19798)
-            if isinstance(tc_dict["function"]["arguments"], str):
-                from agent.redact import redact_sensitive_text
-                tc_dict["function"]["arguments"] = redact_sensitive_text(
-                    tc_dict["function"]["arguments"]
-                )
+            # Tool-call arguments are intentionally NOT redacted here. This
+            # dict enters the in-memory conversation history that is replayed
+            # to the model on every subsequent turn AND persisted to state.db,
+            # which is itself replayed verbatim on session resume
+            # (get_messages_as_conversation). Masking a credential to `***`
+            # here poisons that replay: the model reads back its own
+            # `PGPASSWORD='***' psql ...` call and copies the placeholder into
+            # the next tool call, breaking every credential-dependent command
+            # on the second turn (#43083). The masking also provided no real
+            # protection — the same secret still leaks verbatim through tool
+            # OUTPUT (file contents, command output, diffs, the compaction
+            # block), none of which this pass ever touched. Keeping secrets
+            # out of the replayable store is a separate tokenization/vault
+            # concern, not something arg-redaction can deliver without
+            # breaking replay. Storage-time redaction remains governed by the
+            # `security.redact_secrets` toggle. (#19798 introduced this;
+            # #43083 removed it.)
            # Preserve extra_content (e.g. Gemini thought_signature) so it
            # is sent back on subsequent API calls.  Without this, Gemini 3
            # thinking models reject the request with a 400 error.
@@ -1093,8 +1134,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
        if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
            agent._rate_limited_until = time.monotonic() + 60
    if agent._fallback_index >= len(agent._fallback_chain):
+        # Chain exhausted.  If we actually walked a non-empty chain and the
+        # failure was NOT a rate-limit/billing event (those already armed
+        # their own 60s cooldown above), arm a short cooldown so the next
+        # turn's restore_primary_runtime stays gated instead of resetting
+        # _fallback_index=0 and re-marshaling the whole context across every
+        # provider again.  Guards the cross-turn replay storm in #24996.
+        if (
+            len(agent._fallback_chain) > 0
+            and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
+        ):
+            _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
+            agent._rate_limited_until = max(
+                _existing_cooldown,
+                time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
+            )
        return False
-
    fb = agent._fallback_chain[agent._fallback_index]
    agent._fallback_index += 1
    fb_provider = (fb.get("provider") or "").strip().lower()
@@ -1210,14 +1265,16 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
            agent._transport_cache.clear()
        agent._fallback_activated = True

-        # Clear the credential pool when the fallback provider doesn't match
-        # the pool's provider.  The pool was seeded for the primary provider;
-        # leaving it attached means downstream recovery (rate_limit / billing /
-        # auth) calls ``_swap_credential`` with a primary entry which overwrites
-        # the agent's ``base_url`` back to the primary's endpoint — every
-        # fallback request then 404s against the wrong host.  See #33163.
+        # Rebind the credential pool to the fallback provider when the provider
+        # changes.  Keeping the primary pool attached would make downstream
+        # recovery (rate_limit / billing / auth) mutate the wrong credential
+        # set and can overwrite the fallback's base_url back to the primary
+        # endpoint.  See #33163.
+        #
        # When the fallback shares the pool's provider (e.g. both openrouter
-        # entries with different routing) the pool is preserved.
+        # entries with different routing) the pool is preserved.  When the
+        # providers differ, load the fallback provider's own pool if one exists
+        # so provider-specific rotation continues to work after the switch.
        _existing_pool = getattr(agent, "_credential_pool", None)
        if _existing_pool is not None:
            _pool_provider = (getattr(_existing_pool, "provider", "") or "").strip().lower()
@@ -1228,6 +1285,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
                    fb_provider, fb_model, _pool_provider,
                )
                agent._credential_pool = None
+        if getattr(agent, "_credential_pool", None) is None:
+            try:
+                from agent.credential_pool import load_pool
+
+                fallback_pool = load_pool(fb_provider)
+                if fallback_pool and fallback_pool.has_credentials():
+                    agent._credential_pool = fallback_pool
+                    logger.info(
+                        "Fallback to %s/%s: attached fallback credential pool",
+                        fb_provider, fb_model,
+                    )
+            except Exception as exc:
+                logger.debug(
+                    "Fallback to %s/%s: could not attach credential pool: %s",
+                    fb_provider, fb_model, exc,
+                )

        # Honor per-provider / per-model request_timeout_seconds for the
        # fallback target (same knob the primary client uses).  None = use
@@ -1458,8 +1531,9 @@ def handle_max_iterations(agent, messages: list, api_call_count: int) -> str:
                provider_preferences["ignore"] = agent.providers_ignored
            if agent.providers_order:
                provider_preferences["order"] = agent.providers_order
-            if agent.provider_sort:
-                provider_preferences["sort"] = agent.provider_sort
+            _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
+            if _provider_sort:
+                provider_preferences["sort"] = _provider_sort
            if provider_preferences and (
                (agent.provider or "").strip().lower() == "openrouter"
                or agent._is_openrouter_url()
@@ -1838,7 +1912,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
        stream_kwargs = {
            **api_kwargs,
            "stream": True,
-            "stream_options": {"include_usage": True},
            "timeout": _httpx.Timeout(
                connect=_conn_cap,
                read=_stream_read_timeout,
@@ -1846,6 +1919,14 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pool=_conn_cap,
            ),
        }
+        # OpenAI's `stream_options={"include_usage": True}` drives usage
+        # accounting on OpenAI-compatible endpoints (incl. the Gemini OpenAI
+        # compat shim and aggregators like OpenRouter).  Google's *native*
+        # Gemini REST endpoint rejects the keyword outright
+        # (`Completions.create() got an unexpected keyword argument
+        # 'stream_options'`), so omit it only for that endpoint.
+        if not is_native_gemini_base_url(agent.base_url):
+            stream_kwargs["stream_options"] = {"include_usage": True}
        request_client = _set_request_client(
            agent._create_request_openai_client(
                reason="chat_completion_stream_request",
@@ -2246,7 +2327,15 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                                _fire_first_delta()
                                agent._fire_reasoning_delta(thinking_text)

-            # Return the native Anthropic Message for downstream processing
+            # Return the native Anthropic Message for downstream processing.
+            # If the stream was interrupted (the event loop broke out above on
+            # agent._interrupt_requested), do NOT call get_final_message() — on
+            # a partially-consumed stream the SDK may hang draining remaining
+            # events or return a Message with incomplete tool_use blocks (partial
+            # JSON in `input`). The outer poll loop raises InterruptedError, so
+            # this return value is discarded anyway.
+            if agent._interrupt_requested:
+                return None
            return stream.get_final_message()

    def _call():
@@ -2391,12 +2480,19 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            diag=request_client_holder.get("diag"),
                        )
                        _close_request_client_once("stream_mid_tool_retry_cleanup")
-                        try:
-                            agent._replace_primary_openai_client(
-                                reason="stream_mid_tool_retry_pool_cleanup"
-                            )
-                        except Exception:
-                            pass
+                        if agent.api_mode == "anthropic_messages":
+                            try:
+                                agent._anthropic_client.close()
+                                agent._rebuild_anthropic_client()
+                            except Exception:
+                                pass
+                        else:
+                            try:
+                                agent._replace_primary_openai_client(
+                                    reason="stream_mid_tool_retry_pool_cleanup"
+                                )
+                            except Exception:
+                                pass
                        continue

                    # SSE error events from proxies (e.g. OpenRouter sends
@@ -2444,12 +2540,19 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            _close_request_client_once("stream_retry_cleanup")
                            # Also rebuild the primary client to purge
                            # any dead connections from the pool.
-                            try:
-                                agent._replace_primary_openai_client(
-                                    reason="stream_retry_pool_cleanup"
-                                )
-                            except Exception:
-                                pass
+                            if agent.api_mode == "anthropic_messages":
+                                try:
+                                    agent._anthropic_client.close()
+                                    agent._rebuild_anthropic_client()
+                                except Exception:
+                                    pass
+                            else:
+                                try:
+                                    agent._replace_primary_openai_client(
+                                        reason="stream_retry_pool_cleanup"
+                                    )
+                                except Exception:
+                                    pass
                            continue
                        # Retries exhausted. Log the final failure with
                        # full diagnostic detail (chain, headers,
@@ -2620,10 +2723,17 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pass
            # Rebuild the primary client too — its connection pool
            # may hold dead sockets from the same provider outage.
-            try:
-                agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
-            except Exception:
-                pass
+            if agent.api_mode == "anthropic_messages":
+                try:
+                    agent._anthropic_client.close()
+                    agent._rebuild_anthropic_client()
+                except Exception:
+                    pass
+            else:
+                try:
+                    agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
+                except Exception:
+                    pass
            # Reset the timer so we don't kill repeatedly while
            # the inner thread processes the closure.
            last_chunk_time["t"] = time.time()
@@ -2699,7 +2809,30 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                role="assistant", content=_partial_text, tool_calls=None,
                reasoning_content=None,
            )
-            return SimpleNamespace(
+            # Detect provider output-layer content filtering (e.g. MiniMax
+            # "output new_sensitive (1027)", Azure/OpenAI content_filter,
+            # Anthropic safety refusal).  The raw error is about to be
+            # swallowed into a finish_reason=length stub, so classify it HERE
+            # while we still have it and stamp the stub.  Retrying such a
+            # content-deterministic filter on the same primary just re-hits
+            # the filter — the conversation loop reads this tag and activates
+            # the fallback chain instead of burning continuation retries.
+            # error_classifier is the single source of truth for "what counts
+            # as a content filter" (#32421).
+            _content_filter_terminated = False
+            try:
+                from agent.error_classifier import classify_api_error, FailoverReason
+                _cls = classify_api_error(
+                    result["error"],
+                    provider=str(getattr(agent, "provider", "") or ""),
+                    model=str(getattr(agent, "model", "") or ""),
+                )
+                _content_filter_terminated = (
+                    _cls.reason == FailoverReason.content_policy_blocked
+                )
+            except Exception:
+                _content_filter_terminated = False
+            _stub = SimpleNamespace(
                id=PARTIAL_STREAM_STUB_ID,
                model=getattr(agent, "model", "unknown"),
                choices=[SimpleNamespace(
@@ -2708,6 +2841,9 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                usage=None,
                _dropped_tool_names=_partial_names or None,
            )
+            if _content_filter_terminated:
+                _stub._content_filter_terminated = True
+            return _stub
        raise result["error"]
    return result["response"]

--- a/agent/coding_context.py
+++ b/agent/coding_context.py
@@ -60,6 +60,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional

+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
+
 logger = logging.getLogger("hermes.coding_context")

 CODING_TOOLSET = "coding"
@@ -647,12 +649,14 @@ def _enabled_mcp_servers(config: Optional[dict[str, Any]]) -> list[str]:


 def _git(cwd: Path, *args: str) -> str:
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        out = subprocess.run(
            ["git", "-C", str(cwd), *args],
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT,
+            **_popen_kwargs,
        )
    except (OSError, subprocess.SubprocessError):
        return ""
--- a/agent/context_breakdown.py
+++ b/agent/context_breakdown.py
@@ -0,0 +1,156 @@
+"""Live session context-window breakdown for UI surfaces.
+
+Estimates how the next provider request is composed: system prompt tiers,
+tool schemas, and conversation history. Uses the same rough char/4 heuristic
+as ``agent.model_metadata.estimate_request_tokens_rough`` so numbers align
+with compression thresholds — not exact tokenizer counts.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+_SKILLS_BLOCK_RE = re.compile(r"<available_skills>.*?</available_skills>", re.DOTALL)
+
+_SUBAGENT_TOOL_NAMES = frozenset({"delegate_task"})  # tools counted as subagent definitions
+
+_CATEGORY_COLORS = {  # category id -> CSS variable reference emitted as the row's "color"
+    "system_prompt": "var(--context-usage-system)",
+    "tool_definitions": "var(--context-usage-tools)",
+    "rules": "var(--context-usage-rules)",
+    "skills": "var(--context-usage-skills)",
+    "mcp": "var(--context-usage-mcp)",
+    "subagent_definitions": "var(--context-usage-subagents)",
+    "memory": "var(--context-usage-memory)",
+    "conversation": "var(--context-usage-conversation)",
+}
+
+
+def _chars_to_tokens(text: str) -> int:
+    """Rough token estimate: ``ceil(len(text) / 4)``, or 0 for empty/None text."""
+    char_count = len(text) if text else 0
+    return -(-char_count // 4)  # ceiling division without importing math
+
+
+def _json_tokens(value: Any) -> int:
+    """Rough token estimate of ``value`` serialized as JSON; 0 when it is falsy."""
+    serialized = json.dumps(value, ensure_ascii=False) if value else ""
+    return _chars_to_tokens(serialized)
+
+
+def _tool_name(tool: dict) -> str:
+    """Name under ``function`` when that is a dict, else top-level ``name``; empty-string fallback."""
+    spec = tool if isinstance(tool, dict) else {}  # malformed (non-dict) entries have no name
+    fn = spec.get("function")
+    return str((fn if isinstance(fn, dict) else spec).get("name") or "")
+
+
+def _split_tools(tools: Sequence[dict]) -> Tuple[List[dict], List[dict], List[dict]]:
+    """Partition tool schemas into ``(builtin, mcp, subagent)``, keeping order.
+
+    Names starting with ``mcp_`` are MCP tools, names listed in
+    ``_SUBAGENT_TOOL_NAMES`` are subagent tools, anything else is builtin.
+    """
+    buckets: Dict[str, List[dict]] = {"builtin": [], "mcp": [], "subagent": []}
+    for schema in tools:
+        tool_id = _tool_name(schema)
+        is_subagent = tool_id in _SUBAGENT_TOOL_NAMES
+        kind = "mcp" if tool_id.startswith("mcp_") else "subagent" if is_subagent else "builtin"
+        buckets[kind].append(schema)
+    return buckets["builtin"], buckets["mcp"], buckets["subagent"]
+
+
+def _memory_blocks(agent: Any) -> Tuple[str, str]:
+    """Return ``(memory, user)`` prompt blocks from the agent's memory store.
+
+    Best effort: a missing store, a disabled flag, or an exception leaves a block empty.
+    """
+    rendered = {"memory": "", "user": ""}
+    store = getattr(agent, "_memory_store", None)
+    try:
+        for target, flag in (("memory", "_memory_enabled"), ("user", "_user_profile_enabled")):
+            if store is not None and getattr(agent, flag, True):
+                rendered[target] = store.format_for_system_prompt(target) or ""
+    except Exception:
+        pass
+    return rendered["memory"], rendered["user"]
+
+
+def _strip_blocks(text: str, *blocks: str) -> str:
+    """Remove every occurrence of each non-empty block from ``text``, then trim."""
+    remaining = text
+    for fragment in filter(None, blocks):
+        remaining = remaining.replace(fragment, "")
+    return remaining.strip()
+
+
+def compute_session_context_breakdown(
+    agent: Any,
+    messages: Optional[List[dict]] = None,
+) -> Dict[str, Any]:
+    """Estimate how the next request's context splits across categories.
+
+    ``messages`` is the conversation to count (``None`` counts as empty).
+    Returns ``categories`` (non-zero color/id/label/tokens rows), ``context_max``,
+    ``context_percent`` (clamped 0-100), ``estimated_total``, ``model`` and
+    ``context_used`` (compressor's last measured prompt tokens if positive, else the estimate).
+    """
+    from agent.model_metadata import estimate_messages_tokens_rough
+    from agent.system_prompt import build_system_prompt_parts
+
+    prompt_parts = build_system_prompt_parts(agent)
+    stable, context, volatile = (
+        prompt_parts.get(tier, "") or "" for tier in ("stable", "context", "volatile")
+    )
+    found = _SKILLS_BLOCK_RE.search(stable)
+    skills_index = found.group(0) if found else ""
+
+    memory_block, user_block = _memory_blocks(agent)
+    memory_text = "\n\n".join(filter(None, (memory_block, user_block))).strip()
+
+    # Skills and memory get their own rows, so carve them out of the prompt text.
+    prompt_pieces = (
+        _strip_blocks(stable, skills_index),
+        _strip_blocks(volatile, memory_block, user_block),
+    )
+    system_prompt_text = "\n\n".join(filter(None, prompt_pieces)).strip()
+
+    all_tools = list(getattr(agent, "tools", None) or [])
+    builtin_tools, mcp_tools, subagent_tools = _split_tools(all_tools)
+    conversation_tokens = estimate_messages_tokens_rough(messages or [])
+
+    rows = [
+        ("system_prompt", "System prompt", _chars_to_tokens(system_prompt_text)),
+        ("tool_definitions", "Tool definitions", _json_tokens(builtin_tools)),
+        ("rules", "Rules", _chars_to_tokens(context)),
+        ("skills", "Skills", _chars_to_tokens(skills_index)),
+        ("mcp", "MCP", _json_tokens(mcp_tools)),
+        ("subagent_definitions", "Subagent definitions", _json_tokens(subagent_tools)),
+        ("memory", "Memory", _chars_to_tokens(memory_text)),
+        ("conversation", "Conversation", conversation_tokens),
+    ]
+    estimated_total = sum(row[2] for row in rows)
+
+    comp = getattr(agent, "context_compressor", None)
+    context_max = int(getattr(comp, "context_length", 0) or 0) if comp else 0
+    measured_used = int(getattr(comp, "last_prompt_tokens", 0) or 0) if comp else 0
+    context_used = measured_used if measured_used > 0 else estimated_total
+    context_percent = 0
+    if context_max:
+        context_percent = max(0, min(100, round(context_used / context_max * 100)))
+
+    fallback = "var(--ui-text-tertiary)"
+    return {
+        "categories": [
+            {"color": _CATEGORY_COLORS.get(cid, fallback), "id": cid, "label": name, "tokens": n}
+            for cid, name, n in rows
+            if n > 0
+        ],
+        "context_max": context_max,
+        "context_percent": context_percent,
+        "context_used": context_used,
+        "estimated_total": estimated_total,
+        "model": getattr(agent, "model", "") or "",
+    }
--- a/agent/context_references.py
+++ b/agent/context_references.py
@@ -12,6 +12,7 @@ from pathlib import Path
 from typing import Awaitable, Callable

 from agent.model_metadata import estimate_tokens_rough
+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags

 _QUOTED_REFERENCE_VALUE = r'(?:`[^`\n]+`|"[^"\n]+"|\'[^\'\n]+\')'
 REFERENCE_PATTERN = re.compile(
@@ -290,6 +291,7 @@ def _expand_git_reference(
    args: list[str],
    label: str,
 ) -> tuple[str | None, str | None]:
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["git", *args],
@@ -298,6 +300,7 @@ def _expand_git_reference(
            text=True,
            timeout=30,
            stdin=subprocess.DEVNULL,
+            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"{ref.raw}: git command timed out (30s)", None
@@ -483,6 +486,7 @@ def _iter_visible_entries(path: Path, cwd: Path, limit: int) -> list[Path]:


 def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["rg", "--files", str(path.relative_to(cwd))],
@@ -491,6 +495,7 @@ def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
            text=True,
            timeout=10,
            stdin=subprocess.DEVNULL,
+            **_popen_kwargs,
        )
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return None
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -288,6 +288,29 @@ def replay_compression_warning(agent: Any) -> None:
            pass


+def conversation_history_after_compression(agent: Any, messages: list) -> Optional[list]:
+    """Pick the ``conversation_history`` baseline to use after compression.
+
+    Returns:
+        ``None`` when compression rotated to a fresh child session (legacy
+        path): nothing of the compacted transcript has been flushed to that
+        child yet, so the next persistence call must write the whole list.
+
+        A shallow copy of ``messages`` when compaction happened in place
+        (``agent._last_compaction_in_place`` is truthy): on that path
+        ``archive_and_compact()`` already stored ``messages`` as the active
+        transcript under the same session id. Handing back ``None`` here
+        would make the identity-based flush treat those persisted dicts as
+        new and append them again, doubling the active context and
+        retriggering compression.
+
+    The copy is deliberately shallow: it records the identities of the current
+    compacted dicts as history while later same-turn appends still count as new.
+    """
+    in_place = getattr(agent, "_last_compaction_in_place", False)
+    return list(messages) if in_place else None
+
+
 def compress_context(
    agent: Any,
    messages: list,
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -28,6 +28,7 @@ import uuid
 from typing import Any, Dict, List, Optional

 from agent.codex_responses_adapter import _summarize_user_message_for_log
+from agent.conversation_compression import conversation_history_after_compression
 from agent.display import KawaiiSpinner
 from agent.error_classifier import FailoverReason, classify_api_error
 from agent.iteration_budget import IterationBudget
@@ -587,6 +588,13 @@ def run_conversation(
    compression_attempts = 0
    _turn_exit_reason = "unknown"  # Diagnostic: why the loop ended

+    # Per-turn tally of consecutive successful credential-pool token refreshes,
+    # keyed by (provider, pool-entry-id). A persistent upstream 401 lets
+    # ``try_refresh_current()`` "succeed" forever on a single-entry OAuth pool,
+    # so this tally caps same-entry refreshes and lets the fallback chain take
+    # over instead of spinning. Reset here so each turn starts fresh. See #26080.
+    agent._auth_pool_refresh_counts = {}
+
    # Optional opt-in runtime: if api_mode == codex_app_server, hand the
    # turn to the codex app-server subprocess (terminal/file ops/patching
    # all run inside Codex). Default Hermes path is bypassed entirely.
@@ -827,7 +835,6 @@ def run_conversation(
                    aggregator=moa_config.get("aggregator") or {},
                    temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
                    aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
-                    max_tokens=int(moa_config.get("max_tokens", 4096) or 4096),
                )
                if _moa_context:
                    for _msg in reversed(api_messages):
@@ -1692,6 +1699,56 @@ def run_conversation(

                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
+                        # ── Content-filter stream stall → fallback (#32421) ──
+                        # When the provider's output-layer safety filter (e.g.
+                        # MiniMax "output new_sensitive (1027)", Azure
+                        # content_filter) kills the stream mid-delivery, the
+                        # raw error was classified at the swallow point and the
+                        # stub tagged ``_content_filter_terminated``.  This
+                        # filter is content-deterministic — continuation
+                        # retries against the SAME primary just re-hit it and
+                        # burn paid attempts (the loop used to give up with
+                        # "Response remained truncated after 3 continuation
+                        # attempts" and never consult the fallback chain).
+                        # Escalate to the configured fallback BEFORE retrying.
+                        _cf_terminated = getattr(
+                            response, "_content_filter_terminated", False
+                        )
+                        if (
+                            _cf_terminated
+                            and agent._fallback_index < len(agent._fallback_chain)
+                        ):
+                            agent._vprint(
+                                f"{agent.log_prefix}🛡️  Content filter terminated "
+                                f"stream — activating fallback provider...",
+                                force=True,
+                            )
+                            agent._emit_status(
+                                "Content filter terminated stream; switching to fallback..."
+                            )
+                            if agent._try_activate_fallback():
+                                # Roll the partial content (if any was already
+                                # appended in a prior continuation pass) back to
+                                # the last clean turn so the fallback provider
+                                # gets a coherent continuation point.
+                                if truncated_response_parts:
+                                    messages = agent._get_messages_up_to_last_assistant(messages)
+                                agent._session_messages = messages
+                                length_continue_retries = 0
+                                truncated_response_parts = []
+                                retry_count = 0
+                                compression_attempts = 0
+                                _retry.primary_recovery_attempted = False
+                                _retry.restart_with_rebuilt_messages = True
+                                break
+                            # No fallback available — fall through to normal
+                            # continuation (best-effort, may loop).
+                            agent._vprint(
+                                f"{agent.log_prefix}⚠️  No fallback provider "
+                                f"configured — retrying with same provider "
+                                f"(may re-hit filter)...",
+                                force=True,
+                            )
                        if assistant_message is not None and not _trunc_has_tool_calls:
                            length_continue_retries += 1
                            interim_msg = agent._build_assistant_message(assistant_message, finish_reason)
@@ -2259,6 +2316,15 @@ def run_conversation(
                    # "unknown variant `image_url`, expected `text`".
                    "unknown variant `image_url`, expected `text`",
                    "unknown variant image_url, expected text",
+                    # OpenRouter routes a request to upstream endpoints and,
+                    # when none of the candidate endpoints for the model accept
+                    # image input, returns HTTP 404 "No endpoints found that
+                    # support image input". Without this phrase the agent never
+                    # strips the images, the retry loop re-sends the same
+                    # rejected request until exhaustion, and the gateway leaves
+                    # every subsequent message queued behind the stuck turn —
+                    # the P1 in issue #21160. The 404 passes the 4xx gate below.
+                    "no endpoints found that support image input",
                )
                _err_lower = _err_body.lower()
                _looks_like_image_rejection = any(
@@ -2830,10 +2896,9 @@ def run_conversation(
                            approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )
-                        # Compression created a new session — clear history
-                        # so _flush_messages_to_session_db writes compressed
-                        # messages to the new session, not skipping them.
-                        conversation_history = None
+                        conversation_history = conversation_history_after_compression(
+                            agent, messages
+                        )
                        if len(messages) < original_len or old_ctx > _reduced_ctx:
                            agent._buffer_status(
                                f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
@@ -2845,15 +2910,25 @@ def run_conversation(
                    # Fall through to normal error handling if compression
                    # is exhausted or didn't help.

-                # Eager fallback for rate-limit errors (429 or quota exhaustion).
-                # When a fallback model is configured, switch immediately instead
-                # of burning through retries with exponential backoff -- the
-                # primary provider won't recover within the retry window.
+                # Eager fallback for rate-limit errors (429 or quota exhaustion)
+                # and transport errors (connection failure / timeout / provider
+                # overloaded).  Rate limits and billing: switch immediately —
+                # the primary provider won't recover within the retry window.
+                # Transport errors: allow 1 retry first (transient hiccups
+                # recover), then fall back if the provider is truly unreachable.
                is_rate_limited = classified.reason in {
                    FailoverReason.rate_limit,
                    FailoverReason.billing,
                }
-                if is_rate_limited and agent._fallback_index < len(agent._fallback_chain):
+                _is_transport_failure = classified.reason in {
+                    FailoverReason.timeout,
+                    FailoverReason.overloaded,
+                }
+                _should_fallback = (
+                    is_rate_limited
+                    or (_is_transport_failure and retry_count >= 2)
+                )
+                if _should_fallback and agent._fallback_index < len(agent._fallback_chain):
                    # Don't eagerly fallback if credential pool rotation may
                    # still recover.  See _pool_may_recover_from_rate_limit
                    # for the single-credential-pool and CloudCode-quota
@@ -2868,6 +2943,10 @@ def run_conversation(
                            agent._buffer_status(
                                "⚠️ Billing or credits exhausted — switching to fallback provider..."
                            )
+                        elif _is_transport_failure:
+                            agent._buffer_status(
+                                "⚠️ Provider unreachable — switching to fallback provider..."
+                            )
                        else:
                            agent._buffer_status("⚠️ Rate limited — switching to fallback provider...")
                        if agent._try_activate_fallback(reason=classified.reason):
@@ -3042,10 +3121,9 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    # Compression created a new session — clear history
-                    # so _flush_messages_to_session_db writes compressed
-                    # messages to the new session, not skipping them.
-                    conversation_history = None
+                    conversation_history = conversation_history_after_compression(
+                        agent, messages
+                    )

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3209,10 +3287,9 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    # Compression created a new session — clear history
-                    # so _flush_messages_to_session_db writes compressed
-                    # messages to the new session, not skipping them.
-                    conversation_history = None
+                    conversation_history = conversation_history_after_compression(
+                        agent, messages
+                    )

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3474,6 +3551,13 @@ def run_conversation(
                    ):
                        _retry.primary_recovery_attempted = True
                        retry_count = 0
+                        # Primary transport recovery starts a fresh attempt
+                        # cycle. Re-open fallback state so a follow-on 429 can
+                        # still activate fallback_providers after stale
+                        # pre-recovery fallback/credential-pool bookkeeping.
+                        _retry.has_retried_429 = False
+                        agent._fallback_index = 0
+                        agent._fallback_activated = False
                        continue
                    # Try fallback before giving up entirely
                    if agent._has_pending_fallback():
@@ -3661,7 +3745,12 @@ def run_conversation(
                        _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                        if _ra_raw:
                            try:
-                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
+                                # Cap at 10 minutes. Anthropic Tier 1 input-token
+                                # buckets reset in ~171s, so a 120s cap caused us to
+                                # retry before the actual reset window and re-trip the
+                                # limit. 600s covers all realistic provider reset
+                                # windows while still rejecting pathological values. (#26293)
+                                _retry_after = min(float(_ra_raw), 600)
                            except (TypeError, ValueError):
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
@@ -3742,6 +3831,17 @@ def run_conversation(
            _retry.restart_with_compressed_messages = False
            continue

+        if _retry.restart_with_rebuilt_messages:
+            # A content-filter stream stall (#32421) was escalated to the
+            # fallback chain and the partial content rolled back.  Re-issue
+            # the API call against the now-active fallback provider.  Refund
+            # the budget/count for the stalled attempt so the fallback gets a
+            # fair turn.
+            api_call_count -= 1
+            agent.iteration_budget.refund()
+            _retry.restart_with_rebuilt_messages = False
+            continue
+
        if _retry.restart_with_length_continuation:
            # Progressively boost the output token budget on each retry.
            # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
@@ -4316,10 +4416,9 @@ def run_conversation(
                        approx_tokens=agent.context_compressor.last_prompt_tokens,
                        task_id=effective_task_id,
                    )
-                    # Compression created a new session — clear history so
-                    # _flush_messages_to_session_db writes compressed messages
-                    # to the new session (see preflight compression comment).
-                    conversation_history = None
+                    conversation_history = conversation_history_after_compression(
+                        agent, messages
+                    )
                
                # Save session log incrementally (so progress is visible even if interrupted)
                agent._session_messages = messages
@@ -4361,7 +4460,11 @@ def run_conversation(
                            "as final response"
                        )
                        final_response = _recovered
-                        agent._response_was_previewed = True
+                        # Streaming delivered a fragment, not a confirmed
+                        # final preview. Leave response_previewed false so
+                        # gateway fallback delivery can send the recovered
+                        # text plus the abnormal-turn explanation.
+                        agent._response_was_previewed = False
                        break

                    # If the previous turn already delivered real content alongside
@@ -4606,14 +4709,20 @@ def run_conversation(
                # status from earlier failed attempts in this turn.
                agent._clear_status_buffer()

+                from agent.agent_runtime_helpers import (
+                    intent_ack_continuation_mode,
+                )
+
+                _ack_mode = intent_ack_continuation_mode(agent)
                if (
-                    agent.api_mode == "codex_responses"
+                    _ack_mode != "off"
                    and agent.valid_tool_names
                    and codex_ack_continuations < 2
                    and agent._looks_like_codex_intermediate_ack(
                        user_message=user_message,
                        assistant_content=final_response,
                        messages=messages,
+                        require_workspace=(_ack_mode == "codex_only"),
                    )
                ):
                    codex_ack_continuations += 1
--- a/agent/copilot_acp_client.py
+++ b/agent/copilot_acp_client.py
@@ -21,8 +21,14 @@ from pathlib import Path
 from types import SimpleNamespace
 from typing import Any

+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
 from agent.file_safety import get_read_block_error, is_write_denied
 from agent.redact import redact_sensitive_text
+from tools.environments.local import hermes_subprocess_env

 ACP_MARKER_BASE_URL = "acp://copilot"
 _DEFAULT_TIMEOUT_SECONDS = 900.0
@@ -94,7 +100,10 @@ def _resolve_home_dir() -> str:


 def _build_subprocess_env() -> dict[str, str]:
-    env = os.environ.copy()
+    # Copilot ACP is a model-driving CLI executor: it legitimately needs LLM
+    # provider credentials. Route through the central helper so Tier-1 secrets
+    # (gateway bot tokens, GitHub auth, infra) are still stripped (#29157).
+    env = hermes_subprocess_env(inherit_credentials=True)
    home = _resolve_home_dir()
    env["HOME"] = home
    from hermes_constants import apply_subprocess_home_env
@@ -224,11 +233,73 @@ def _render_message_content(content: Any) -> str:
    return str(content).strip()


-def _extract_tool_calls_from_text(text: str) -> tuple[list[SimpleNamespace], str]:
+def _build_openai_tool_call(
+    *,
+    call_id: str,
+    name: str,
+    arguments: str,
+) -> ChatCompletionMessageToolCall:
+    """Build an OpenAI-compatible tool-call object for downstream handling.
+
+    Args:
+        call_id: Identifier for the call; stored as both ``id`` and
+            ``call_id`` on the returned object.
+        name: Name of the function being called.
+        arguments: Arguments string, handed to ``Function`` unchanged.
+
+    Returns:
+        A ``ChatCompletionMessageToolCall`` whose ``type`` is ``"function"``.
+    """
+    # NOTE(review): ``call_id`` and ``response_item_id`` are passed in
+    # addition to the usual id/type/function fields — presumably the SDK
+    # model accepts extra keyword arguments and downstream code reads them;
+    # confirm against the installed openai version.
+    return ChatCompletionMessageToolCall(
+        id=call_id,
+        call_id=call_id,
+        response_item_id=None,
+        type="function",
+        function=Function(name=name, arguments=arguments),
+    )
+
+
+def _completion_to_stream_chunks(completion: SimpleNamespace) -> list[SimpleNamespace]:
+    """Convert a one-shot ACP response into OpenAI-style stream chunks.
+
+    Only the first choice of ``completion`` is read. The result is always a
+    two-element list:
+
+    1. a data chunk whose single delta carries the whole assistant message
+       (content, tool calls, reasoning fields) plus the finish reason, with
+       ``usage`` set to None;
+    2. a usage chunk with an empty ``choices`` list that carries
+       ``completion.usage``.
+
+    Args:
+        completion: Object exposing ``choices``, ``model`` and ``usage``; the
+            first choice must expose ``message`` and ``finish_reason``.
+
+    Returns:
+        ``[data_chunk, usage_chunk]``.
+    """
+    choice = completion.choices[0]
+    message = choice.message
+    # Stays None when the message has no (or an empty) tool-call list.
+    tool_call_deltas = None
+    if message.tool_calls:
+        tool_call_deltas = []
+        for index, tool_call in enumerate(message.tool_calls):
+            # Each tool call becomes one delta carrying its position in the
+            # list; fields missing on the source object fall back to the
+            # getattr defaults.
+            tool_call_deltas.append(
+                SimpleNamespace(
+                    index=index,
+                    id=getattr(tool_call, "id", None),
+                    type=getattr(tool_call, "type", "function"),
+                    function=SimpleNamespace(
+                        name=getattr(tool_call.function, "name", None),
+                        arguments=getattr(tool_call.function, "arguments", None),
+                    ),
+                )
+            )
+
+    delta = SimpleNamespace(
+        role="assistant",
+        # Empty content is normalised to None.
+        content=message.content or None,
+        tool_calls=tool_call_deltas,
+        reasoning_content=message.reasoning_content,
+        reasoning=message.reasoning,
+    )
+    data_chunk = SimpleNamespace(
+        choices=[
+            SimpleNamespace(
+                index=0,
+                delta=delta,
+                finish_reason=choice.finish_reason,
+            )
+        ],
+        model=completion.model,
+        usage=None,
+    )
+    # Usage travels in its own trailing chunk that has no choices.
+    usage_chunk = SimpleNamespace(
+        choices=[],
+        model=completion.model,
+        usage=completion.usage,
+    )
+    return [data_chunk, usage_chunk]
+
+
+def _extract_tool_calls_from_text(text: str) -> tuple[list[ChatCompletionMessageToolCall], str]:
    if not isinstance(text, str) or not text.strip():
        return [], ""

-    extracted: list[SimpleNamespace] = []
+    extracted: list[ChatCompletionMessageToolCall] = []
    consumed_spans: list[tuple[int, int]] = []

    def _try_add_tool_call(raw_json: str) -> None:
@@ -252,12 +323,10 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[SimpleNamespace], str
            call_id = f"acp_call_{len(extracted)+1}"

        extracted.append(
-            SimpleNamespace(
-                id=call_id,
+            _build_openai_tool_call(
                call_id=call_id,
-                response_item_id=None,
-                type="function",
-                function=SimpleNamespace(name=fn_name.strip(), arguments=fn_args),
+                name=fn_name.strip(),
+                arguments=fn_args,
            )
        )

@@ -376,6 +445,7 @@ class CopilotACPClient:
        timeout: float | None = None,
        tools: list[dict[str, Any]] | None = None,
        tool_choice: Any = None,
+        stream: bool = False,
        **_: Any,
    ) -> Any:
        prompt_text = _format_messages_as_prompt(
@@ -422,11 +492,14 @@ class CopilotACPClient:
        )
        finish_reason = "tool_calls" if tool_calls else "stop"
        choice = SimpleNamespace(message=assistant_message, finish_reason=finish_reason)
-        return SimpleNamespace(
+        completion = SimpleNamespace(
            choices=[choice],
            usage=usage,
            model=model or "copilot-acp",
        )
+        if stream:
+            return _completion_to_stream_chunks(completion)
+        return completion

    def _run_prompt(self, prompt_text: str, *, timeout_seconds: float) -> tuple[str, str]:
        try:
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -537,10 +537,11 @@ class CredentialPool:
                self._entries[idx] = new
                return

-    def _persist(self) -> None:
+    def _persist(self, *, removed_ids: Optional[List[str]] = None) -> None:
+        """Serialize every entry and hand the list to ``write_credential_pool``.
+
+        Args:
+            removed_ids: Optional ids forwarded unchanged to
+                ``write_credential_pool``. Callers in this class pass the ids
+                of entries they have just dropped from ``self._entries``.
+                NOTE(review): what the writer does with them is not visible
+                here — confirm against ``write_credential_pool``.
+        """
        write_credential_pool(
            self.provider,
            [entry.to_dict() for entry in self._entries],
+            removed_ids=removed_ids,
        )

    def _is_terminal_auth_failure(
@@ -1124,13 +1125,17 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal xAI OAuth state: %s", clear_exc
                        )
+                    removed_ids = [
+                        item.id for item in self._entries
+                        if item.source == "loopback_pkce"
+                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "loopback_pkce"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist()
+                    self._persist(removed_ids=removed_ids)
                    return None
            # For openai-codex: same race as xAI/nous — another Hermes process
            # may have consumed the refresh token between our proactive sync
@@ -1190,13 +1195,17 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal Codex OAuth state: %s", clear_exc
                        )
+                    removed_ids = [
+                        item.id for item in self._entries
+                        if item.source == "device_code"
+                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "device_code"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist()
+                    self._persist(removed_ids=removed_ids)
                    return None
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
@@ -1253,13 +1262,17 @@ class CredentialPool:
                        auth_mod.NOUS_DEVICE_CODE_SOURCE,
                        f"manual:{auth_mod.NOUS_DEVICE_CODE_SOURCE}",
                    }
+                    removed_ids = [
+                        item.id for item in self._entries
+                        if item.source in singleton_sources
+                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source not in singleton_sources
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist()
+                    self._persist(removed_ids=removed_ids)
                    return None
            self._mark_exhausted(entry, None)
            return None
@@ -1421,7 +1434,7 @@ class CredentialPool:
            pruned_ids = set(entries_to_prune)
            self._entries = [e for e in self._entries if e.id not in pruned_ids]
        if cleared_any:
-            self._persist()
+            self._persist(removed_ids=entries_to_prune)
        return available

    def _select_unlocked(self) -> Optional[PooledCredential]:
@@ -1595,7 +1608,11 @@ class CredentialPool:
            replace(entry, priority=new_priority)
            for new_priority, entry in enumerate(self._entries)
        ]
-        self._persist()
+        write_credential_pool(
+            self.provider,
+            [entry.to_dict() for entry in self._entries],
+            removed_ids=[removed.id],
+        )
        if self._current_id == removed.id:
            self._current_id = None
        return removed
@@ -2257,6 +2274,11 @@ def _seed_custom_pool(pool_key: str, entries: List[PooledCredential]) -> Tuple[b
 def load_pool(provider: str) -> CredentialPool:
    provider = (provider or "").strip().lower()
    raw_entries = read_credential_pool(provider)
+    disk_ids = {
+        entry.get("id")
+        for entry in raw_entries
+        if isinstance(entry, dict) and entry.get("id")
+    }
    raw_needs_sanitization = any(
        isinstance(payload, dict)
        and sanitize_borrowed_credential_payload(payload, provider) != payload
@@ -2285,8 +2307,10 @@ def load_pool(provider: str) -> CredentialPool:
        changed |= _normalize_pool_priorities(provider, entries)

    if changed:
+        new_ids = {entry.id for entry in entries}
        write_credential_pool(
            provider,
            [entry.to_dict() for entry in sorted(entries, key=lambda item: item.priority)],
+            removed_ids=disk_ids - new_ids,
        )
    return CredentialPool(provider, entries)
--- a/agent/curator.py
+++ b/agent/curator.py
@@ -273,6 +273,21 @@ def should_run_now(now: Optional[datetime] = None) -> bool:
 # Automatic state transitions (pure function, no LLM)
 # ---------------------------------------------------------------------------

+def _cron_referenced_skills() -> Set[str]:
+    """Skill names referenced by any cron job (incl. paused/disabled).
+
+    Best-effort: a cron-module import error or corrupt jobs store must never
+    break the curator, so any failure yields an empty set (no protection,
+    but no crash).
+
+    Returns:
+        Whatever ``cron.jobs.referenced_skill_names()`` returns, or an empty
+        set when the import or the call raises.
+    """
+    try:
+        # Imported inside the try so that an import failure is handled by
+        # the except below instead of propagating to the caller.
+        from cron.jobs import referenced_skill_names as _refs
+        return _refs()
+    except Exception as e:
+        # Logged at debug level only, with the traceback attached.
+        logger.debug("Curator could not read cron skill references: %s", e, exc_info=True)
+        return set()
+
+
 def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int]:
    """Walk every curator-managed skill and move active/stale/archived based on
    the latest real activity timestamp. Pinned skills are never touched.
@@ -292,6 +307,8 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
    stale_cutoff = now - timedelta(days=get_stale_after_days())
    archive_cutoff = now - timedelta(days=get_archive_after_days())

+    cron_referenced = _cron_referenced_skills()
+
    counts = {"marked_stale": 0, "archived": 0, "reactivated": 0, "checked": 0, "seeded": 0}

    for row in _u.agent_created_report():
@@ -300,6 +317,15 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
        if row.get("pinned"):
            continue

+        # A skill referenced by any cron job (incl. paused/disabled) is in
+        # use by definition — resuming or the next fire must find it. The
+        # scheduler only bumps usage when a job actually fires, so jobs that
+        # fire less often than archive_after_days, paused jobs, and far-future
+        # one-shots would otherwise have their skills aged out from under
+        # them. Treat referenced skills like pinned: never auto-transition.
+        if name in cron_referenced:
+            continue
+
        # First sight of a curation-eligible skill with no persisted record
        # (e.g. a newly-eligible built-in): anchor its clock to now and defer.
        if not row.get("_persisted", True):
@@ -316,6 +342,18 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int

        current = row.get("state", _u.STATE_ACTIVE)

+        # Never-used skills (use_count == 0) get a grace floor: don't archive
+        # one until it is at least stale_after_days old. A use=0 skill is
+        # absence of evidence, not evidence of staleness — a skill created
+        # recently may simply not have had its trigger come up yet.
+        never_used = int(row.get("use_count", 0) or 0) == 0
+        if never_used and anchor > stale_cutoff:
+            # Younger than the stale window — leave it alone entirely.
+            if current == _u.STATE_STALE:
+                _u.set_state(name, _u.STATE_ACTIVE)
+                counts["reactivated"] += 1
+            continue
+
        if anchor <= archive_cutoff and current != _u.STATE_ARCHIVED:
            ok, _msg = _u.archive_skill(name)
            if ok:
@@ -390,10 +428,19 @@ CURATOR_REVIEW_PROMPT = (
    "back load-bearing UX (slash-command entry points referenced in docs and "
    "tips) and are filtered out of the candidate list below — never resurrect "
    "one as an archive or absorb target.\n"
+    "3c. DO NOT archive or prune any skill marked `cron=yes` in the candidate "
+    "list. A cron job depends on it and will fail to load it on its next "
+    "run. You MAY still consolidate it into an umbrella — but only because "
+    "the curator rewrites cron job skill references to follow consolidations; "
+    "never simply prune it.\n"
    "4. DO NOT use usage counters as a reason to skip consolidation. The "
    "counters are new and often mostly zero. Judge overlap on CONTENT, "
    "not on use_count. 'use=0' is not evidence a skill is valuable; it's "
-    "absence of evidence either way.\n"
+    "absence of evidence either way. Corollary: 'use=0' is ALSO not a "
+    "reason to PRUNE a skill. Never archive a never-used skill (use=0) "
+    "unless it is at least 30 days old (check last_activity / created date) "
+    "AND its content is genuinely obsolete or fully absorbed elsewhere — a "
+    "recently-created skill simply may not have had its trigger come up yet.\n"
    "5. DO NOT reject consolidation on the grounds that 'each skill has "
    "a distinct trigger'. Pairwise distinctness is the wrong bar. The "
    "right bar is: 'would a human maintainer write this as N separate "
@@ -1413,12 +1460,14 @@ def _render_candidate_list() -> str:
    rows = skill_usage.agent_created_report()
    if not rows:
        return "No agent-created skills to review."
+    cron_referenced = _cron_referenced_skills()
    lines = [f"Agent-created skills ({len(rows)}):\n"]
    for r in rows:
        lines.append(
            f"- {r['name']}  "
            f"state={r['state']}  "
            f"pinned={'yes' if r.get('pinned') else 'no'}  "
+            f"cron={'yes' if r['name'] in cron_referenced else 'no'}  "
            f"activity={r.get('activity_count', 0)}  "
            f"use={r.get('use_count', 0)}  "
            f"view={r.get('view_count', 0)}  "
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -133,6 +133,31 @@ _RATE_LIMIT_PATTERNS = [
    "servicequotaexceededexception",
 ]

+# Patterns that indicate provider-side overload, NOT a per-credential rate
+# limit or billing problem.  The credential is valid — the server is just
+# busy — so the correct recovery is "back off and retry the same key", never
+# "rotate the credential" (rotating exhausts the pool while the endpoint is
+# still busy; a single-key user has nothing to rotate to).  Some providers
+# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
+# status path matches the body against this list before falling through to
+# the rate_limit default.  Phrases are kept narrow and overload-flavoured so a
+# normal rate-limit message ("you have been rate-limited") doesn't hit this
+# bucket. (#14038, #15297)
+#
+# NOTE(review): the checks in _classify_by_status and _classify_by_message
+# test each entry with ``p in error_msg`` (substring match), so the bare
+# "overloaded" entry already matches every longer "...overloaded" phrase
+# listed after it; only "at capacity" and "over capacity" add coverage.
+# All entries are lowercase — presumably error_msg is lowercased by the
+# caller; confirm.
+_OVERLOADED_PATTERNS = [
+    "overloaded",
+    "temporarily overloaded",
+    "service is temporarily overloaded",
+    "service may be temporarily overloaded",
+    "server is overloaded",
+    "server overloaded",
+    "service overloaded",
+    "service is overloaded",
+    "upstream overloaded",
+    "currently overloaded",
+    "at capacity",
+    "over capacity",
+]
+
 # Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
 _USAGE_LIMIT_PATTERNS = [
    "usage limit",
@@ -330,6 +355,14 @@ _CONTENT_POLICY_BLOCKED_PATTERNS = [
    # echo back; the underscore form is provider-specific enough.
    "content_filter",
    "responsibleaipolicyviolation",
+    # MiniMax output-layer safety filter. The error string is surfaced
+    # verbatim by MiniMax SDK / OpenAI-compatible endpoints, usually in the
+    # form "output new_sensitive (1027)" when the model's *output* (often a
+    # large tool-call argument block) trips the upstream safety filter and
+    # the SSE stream is truncated mid-flight. ``new_sensitive`` is the
+    # filter name and is narrow enough that billing / format / auth error
+    # strings will not collide. See #32421.
+    "new_sensitive",
 ]

 # Auth patterns (non-status-code signals)
@@ -863,7 +896,19 @@ def _classify_by_status(
        )

    if status_code == 429:
-        # Already checked long_context_tier above; this is a normal rate limit
+        # Already checked long_context_tier above. Some providers (notably
+        # Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
+        # code as a true per-credential rate limit, but the credential is
+        # valid and the correct recovery is "back off and retry the same key",
+        # NOT "rotate the credential" (which exhausts the pool while the
+        # endpoint is still busy, and does nothing for a single-key user).
+        # Disambiguate on the error body so an overload 429 takes the
+        # transient-overload path instead of burning the pool. (#14038)
+        if any(p in error_msg for p in _OVERLOADED_PATTERNS):
+            return result_fn(
+                FailoverReason.overloaded,
+                retryable=True,
+            )
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
@@ -1214,6 +1259,17 @@ def _classify_by_message(
            should_fallback=True,
        )

+    # Overloaded / server-busy patterns — must come BEFORE the rate_limit and
+    # billing checks so that a message-only "overloaded" (no 503/529 status,
+    # e.g. some Anthropic-compatible proxies) classifies as a transient
+    # overload (backoff + retry) instead of falling through to `unknown` or
+    # incorrectly triggering credential rotation.
+    if any(p in error_msg for p in _OVERLOADED_PATTERNS):
+        return result_fn(
+            FailoverReason.overloaded,
+            retryable=True,
+        )
+
    # Billing patterns
    if any(p in error_msg for p in _BILLING_PATTERNS):
        return result_fn(
@@ -1303,19 +1359,25 @@ def _extract_status_code(error: Exception) -> Optional[int]:


 def _extract_error_body(error: Exception) -> dict:
-    """Extract the structured error body from an SDK exception."""
-    body = getattr(error, "body", None)
-    if isinstance(body, dict):
-        return body
-    # Some errors have .response.json()
-    response = getattr(error, "response", None)
-    if response is not None:
-        try:
-            json_body = response.json()
-            if isinstance(json_body, dict):
-                return json_body
-        except Exception:
-            pass
+    """Extract the structured error body from an SDK exception or its cause chain.
+
+    Starting at ``error``, each exception is checked for a dict ``body``
+    attribute and then for a ``response`` whose ``json()`` returns a dict;
+    the first dict found is returned. Otherwise the search moves on to
+    ``__cause__`` (or ``__context__`` when no cause is set) and repeats,
+    visiting at most five exceptions in total.
+
+    Args:
+        error: The exception to inspect.
+
+    Returns:
+        The first dict body found, or ``{}`` when none is found.
+    """
+    current = error
+    for _ in range(5):  # Match _extract_status_code() traversal depth.
+        body = getattr(current, "body", None)
+        if isinstance(body, dict):
+            return body
+        # Some errors have .response.json()
+        response = getattr(current, "response", None)
+        if response is not None:
+            try:
+                json_body = response.json()
+                if isinstance(json_body, dict):
+                    return json_body
+            except Exception:
+                # Best-effort: any failure from response.json() is ignored
+                # and the walk continues with the next exception.
+                pass
+        # Prefer the explicit cause over the implicit context.
+        cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
+        # Stop at the end of the chain or on a self-referential link.
+        if cause is None or cause is current:
+            break
+        current = cause
    return {}


--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -251,6 +251,78 @@ def _supports_vision_override(
    return None


+def _resolve_inference_base_url(
+    cfg: Optional[Dict[str, Any]],
+    provider: str,
+) -> str:
+    """Best-effort base URL for the active inference provider ("" when unknown)."""
+    try:
+        from agent.auxiliary_client import _RUNTIME_MAIN_BASE_URL
+
+        runtime = str(_RUNTIME_MAIN_BASE_URL or "").strip()
+        if runtime:
+            return runtime
+    except Exception:
+        pass
+    # No usable runtime value: fall back to the config dict, if there is one.
+    if not isinstance(cfg, dict):
+        return ""
+    # Next preference: an explicit, non-empty model.base_url in the config.
+    model_cfg_raw = cfg.get("model")
+    model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
+    base_url = str(model_cfg.get("base_url") or "").strip()
+    if base_url:
+        return base_url
+    # Lookup names: each provider name both with and without a "custom:" prefix.
+    config_provider = str(model_cfg.get("provider") or "").strip()
+    candidate_names: set[str] = set()
+    for p in filter(None, (provider, config_provider)):
+        candidate_names.add(p)
+        if p.lower().startswith("custom:"):
+            candidate_names.add(p.split(":", 1)[1])
+        else:
+            candidate_names.add(f"custom:{p}")
+    # "providers" is a mapping keyed by name; take a non-empty base_url from it.
+    providers_cfg = cfg.get("providers")
+    if isinstance(providers_cfg, dict):
+        for name in candidate_names:
+            entry = providers_cfg.get(name)
+            if isinstance(entry, dict):
+                bu = str(entry.get("base_url") or "").strip()
+                if bu:
+                    return bu
+    # "custom_providers" is a list of entries matched on "name", ignoring case.
+    custom_providers = cfg.get("custom_providers")
+    if isinstance(custom_providers, list):
+        lowered = {n.lower() for n in candidate_names}
+        for entry_raw in custom_providers:
+            if not isinstance(entry_raw, dict):
+                continue
+            entry_name = str(entry_raw.get("name") or "").strip()
+            if entry_name not in candidate_names and entry_name.lower() not in lowered:
+                continue
+            bu = str(entry_raw.get("base_url") or "").strip()
+            if bu:
+                return bu
+    # Nothing matched anywhere.
+    return ""
+
+
+def _should_probe_ollama_vision(provider: str, base_url: str) -> bool:
+    """Report whether the active provider looks like a local Ollama server."""
+    if (provider or "").strip().lower() == "ollama":
+        return True
+    if base_url:
+        try:
+            from agent.model_metadata import detect_local_server_type
+
+            server_kind = detect_local_server_type(base_url)
+            return server_kind == "ollama"
+        except Exception:
+            pass
+    return False
+
+
 def _coerce_mode(raw: Any) -> str:
    """Normalize a config value into one of the valid modes."""
    if not isinstance(raw, str):
@@ -302,15 +374,33 @@ def _lookup_supports_vision(
        return override
    if not provider or not model:
        return None
+    caps = None
    try:
        from agent.models_dev import get_model_capabilities
        caps = get_model_capabilities(provider, model)
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc)
-        return None
-    if caps is None:
-        return None
-    return bool(caps.supports_vision)
+    if caps is not None:
+        return bool(caps.supports_vision)
+
+    base_url = _resolve_inference_base_url(cfg, provider)
+    if not base_url and (provider or "").strip().lower() == "ollama":
+        base_url = "http://localhost:11434/v1"
+    if _should_probe_ollama_vision(provider, base_url):
+        try:
+            from agent.model_metadata import query_ollama_supports_vision
+
+            ollama_vision = query_ollama_supports_vision(model, base_url)
+            if ollama_vision is not None:
+                return ollama_vision
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.debug(
+                "image_routing: ollama vision probe failed for %s:%s — %s",
+                provider,
+                model,
+                exc,
+            )
+    return None


 def decide_image_input_mode(
@@ -388,14 +478,98 @@ def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]:
    # BMP: "BM"
    if raw.startswith(b"BM"):
        return "image/bmp"
-    # HEIC/HEIF: ftypheic / ftypheix / ftypmif1 / ftypmsf1 etc.
-    if len(raw) >= 12 and raw[4:8] == b"ftyp" and raw[8:12] in {
-        b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1", b"heim", b"heis",
-    }:
-        return "image/heic"
+    # ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12
+    if len(raw) >= 12 and raw[4:8] == b"ftyp":
+        brand = raw[8:12]
+        if brand in {b"avif", b"avis"}:
+            return "image/avif"
+        if brand in {
+            b"heic", b"heix", b"hevc", b"hevx",
+            b"mif1", b"msf1", b"heim", b"heis",
+        }:
+            return "image/heic"
+    # TIFF: II*\0 (little-endian) or MM\0* (big-endian)
+    if raw[:4] in {b"II*\x00", b"MM\x00*"}:
+        return "image/tiff"
+    # ICO: 00 00 01 00 (reserved=0, type=1=icon)
+    if raw[:4] == b"\x00\x00\x01\x00":
+        return "image/x-icon"
+    # SVG: text-based, look for an <svg tag near the start (skip BOM/whitespace)
+    head = raw[:512].lstrip(b"\xef\xbb\xbf \t\r\n\x0b\x0c").lower()  # lstrip() alone keeps the UTF-8 BOM
+    if head.startswith(b"<?xml") or head.startswith(b"<svg"):
+        if b"<svg" in head:
+            return "image/svg+xml"
    return None


+# Formats every major vision provider (Anthropic, OpenAI, Gemini, Bedrock)
+# accepts natively. Anything outside this set has to be transcoded to PNG
+# before we declare media_type, otherwise the provider returns HTTP 400
+# ("Could not process image" / "Unsupported image media type") and the
+# whole turn fails with no salvage path.
+#
+# Discord (and a few other chat platforms) freely accept attachments in
+# formats outside this set -- AVIF screenshots from Chromium, HEIC from
+# iPhones, TIFF from scanners, BMP from old Windows tools, ICO -- so users
+# do hit this in practice. SVG is vector and Pillow cannot rasterize it;
+# it is skipped (logged) rather than transcoded.
+_UNIVERSALLY_SUPPORTED_MIMES = frozenset({
+    "image/png", "image/jpeg", "image/gif", "image/webp",
+})
+
+
+def _transcode_to_png(raw: bytes) -> Optional[bytes]:
+    """Decode arbitrary image bytes with Pillow and re-encode as PNG.
+
+    Returns None if Pillow isn't installed or can't decode the input
+    (rare formats, corrupted bytes, missing optional decoder plugin for
+    HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping
+    the image so the rest of the turn still works.
+
+    HEIC/HEIF and AVIF need optional Pillow plugins; we try to register
+    them on demand and swallow ImportError so a missing plugin just
+    looks like 'Pillow can't decode this' rather than crashing.
+    """
+    try:
+        from PIL import Image
+    except ImportError:
+        logger.info(
+            "image_routing: Pillow not installed; cannot transcode "
+            "non-standard image format to PNG. Install with `pip install Pillow` "
+            "(and `pillow-heif` / `pillow-avif-plugin` for those formats)."
+        )
+        return None
+    # Optional plugin registration. Silent on failure: an unsupported
+    # format will just fall through to Image.open raising below.
+    try:
+        import pillow_heif  # type: ignore
+
+        pillow_heif.register_heif_opener()
+    except Exception:
+        pass
+    try:
+        import pillow_avif  # type: ignore  # noqa: F401  -- registers AVIF on import
+    except Exception:
+        pass
+    try:
+        from io import BytesIO
+
+        with Image.open(BytesIO(raw)) as im:
+            # Pick an output mode PNG can serialise. Anything other than
+            # the standard set gets normalised to RGBA so transparency is
+            # preserved where the source had it.
+            if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}:
+                im = im.convert("RGBA")
+            buf = BytesIO()
+            im.save(buf, format="PNG", optimize=False)
+            return buf.getvalue()
+    except Exception as exc:
+        logger.info(
+            "image_routing: Pillow could not transcode image to PNG -- %s", exc
+        )
+        return None
+
+
 def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str:
    """Return image MIME type for *path*.

@@ -431,8 +605,18 @@ def _file_to_data_url(path: Path) -> Optional[str]:
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

-    Returns None only if the file can't be read (missing, permission
-    denied, etc.); the caller reports those paths in ``skipped``.
+    Format compatibility IS handled here: if the sniffed MIME isn't one
+    of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF,
+    HEIC, BMP, TIFF, or ICO that some providers reject outright), we
+    transcode to PNG with Pillow before declaring media_type. This fixes
+    the user-visible "Could not process image" HTTP 400 from Anthropic on
+    Discord-attached AVIF/HEIC/BMP files.
+
+    Returns None if the file can't be read OR if the format isn't
+    universally supported AND Pillow can't transcode it (Pillow missing,
+    HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The
+    caller reports those paths in ``skipped`` and the rest of the turn
+    proceeds.
    """
    try:
        raw = path.read_bytes()
@@ -440,6 +624,22 @@ def _file_to_data_url(path: Path) -> Optional[str]:
        logger.warning("image_routing: failed to read %s — %s", path, exc)
        return None
    mime = _guess_mime(path, raw=raw)
+    if mime not in _UNIVERSALLY_SUPPORTED_MIMES:
+        transcoded = _transcode_to_png(raw)
+        if transcoded is None:
+            logger.warning(
+                "image_routing: %s is %s which is not accepted by all major "
+                "vision providers and could not be transcoded to PNG; "
+                "skipping this attachment.",
+                path, mime,
+            )
+            return None
+        logger.info(
+            "image_routing: transcoded %s (%s) -> image/png for provider compatibility",
+            path.name, mime,
+        )
+        raw = transcoded
+        mime = "image/png"
    b64 = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{b64}"

--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -8,6 +8,7 @@ iteration.

 from __future__ import annotations

+import hashlib
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any
@@ -25,20 +26,112 @@ logger = logging.getLogger(__name__)
 # opening dozens of sockets at once.
 _MAX_REFERENCE_WORKERS = 8

+# Per-tool-result character budget for the advisory reference view. Tool
+# results can be huge (a full diff, a 5000-line file dump); replaying them
+# verbatim per reference per tool-loop step would blow the reference model's
+# context window and cost. We keep the agent's *actions* (tool calls) in full —
+# they are cheap, high-signal, and tell the reference what the agent did — but
+# preview each tool *result* head+tail so the reference still sees what came
+# back without replaying megabytes. The acting aggregator always gets the full,
+# untrimmed transcript; this budget only shapes the advisory copy.
+_REFERENCE_TOOL_RESULT_BUDGET = 4000
+
+# System prompt prepended to every reference-model call. References are
+# advisory — they do NOT act, call tools, or own the task. Without this
+# framing a reference receives the bare trimmed conversation and assumes it is
+# the acting agent: it then refuses ("I can't access repositories / URLs from
+# here") or tries to call tools it doesn't have. The prompt reframes the model
+# as an analyst whose job is to reason about the presented state and hand its
+# best thinking to the aggregator/orchestrator that will actually act.
+_REFERENCE_SYSTEM_PROMPT = (
+    "You are a reference advisor in a Mixture of Agents (MoA) process. You are "
+    "NOT the acting agent and you do NOT execute anything: you cannot call "
+    "tools, run commands, browse, or access files, repositories, or URLs, and "
+    "you should not try to or apologize for being unable to. A separate "
+    "aggregator/orchestrator model holds those capabilities and will take the "
+    "actual actions.\n\n"
+    "The conversation below is the current state of a task handled by that "
+    "acting agent. Your job is to give your most intelligent analysis of that "
+    "state: understand the goal, reason about the problem, and advise on what "
+    "to do next. Surface the best approach, concrete next steps and tool-use "
+    "strategy, likely pitfalls and risks, and anything the acting agent may "
+    "have missed or gotten wrong. Assume any referenced files, URLs, or "
+    "systems exist and reason about them from the context given rather than "
+    "asking for access.\n\n"
+    "Respond with your advice directly — no preamble, no disclaimers about "
+    "tools or access. Your response is private guidance handed to the "
+    "aggregator, not an answer shown to the user."
+)
+
+

 def _slot_label(slot: dict[str, str]) -> str:
    return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}"


+def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
+    """Resolve a reference/aggregator slot to real runtime call kwargs.
+
+    A MoA slot is just a model selection — it must be called the same way any
+    model is called elsewhere, not through a bare ``call_llm(provider=...,
+    model=...)`` that leaves base_url/api_key/api_mode unresolved and lets the
+    auxiliary auto-detector guess. We route the slot's provider through
+    ``resolve_runtime_provider`` (the canonical provider→api_mode/base_url/
+    api_key resolver the CLI, gateway, and delegate_task all use), so the slot
+    gets its provider's real API surface — e.g. MiniMax → anthropic_messages,
+    GPT-5/o-series → max_completion_tokens, custom endpoints → their base_url.
+
+    Returns the kwargs to pass through to ``call_llm`` (provider/model plus the
+    resolved base_url/api_key when available). Falls back to the bare
+    provider/model on any resolution error so a misconfigured slot still
+    attempts the call rather than aborting the whole MoA turn.
+    """
+    provider = str(slot.get("provider") or "").strip()
+    model = str(slot.get("model") or "").strip()
+    out: dict[str, Any] = {"provider": provider, "model": model}
+    try:
+        from hermes_cli.runtime_provider import resolve_runtime_provider
+
+        rt = resolve_runtime_provider(requested=provider, target_model=model)
+        resolved_provider = str(rt.get("provider") or provider).strip().lower()
+        # call_llm treats an explicit base_url as a custom endpoint. That is
+        # correct for ordinary OpenAI-compatible targets, but wrong for OAuth /
+        # provider-backed targets whose provider branch adds auth refresh,
+        # request metadata, or request-shape adapters. Keep those providers
+        # identified by name.
+        if resolved_provider in {"nous", "openai-codex", "xai-oauth"}:
+            return out
+        # Pass the resolved endpoint through so call_llm builds the request for
+        # the provider's actual API surface instead of auto-detecting. base_url
+        # routes call_llm to the right adapter (incl. anthropic_messages mode);
+        # api_key is the resolved credential for that provider.
+        if rt.get("base_url"):
+            out["base_url"] = rt["base_url"]
+        if rt.get("api_key"):
+            out["api_key"] = rt["api_key"]
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug("MoA slot runtime resolution failed for %s: %s", _slot_label(slot), exc)
+    return out
+
+
 def _run_reference(
    slot: dict[str, str],
    ref_messages: list[dict[str, Any]],
    *,
-    temperature: float,
-    max_tokens: int,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
 ) -> tuple[str, str]:
    """Call one reference model and return ``(label, text)``.

+    The slot is resolved to its provider's real runtime (via ``_slot_runtime``)
+    and called through the same ``call_llm`` request-building path any model
+    uses, so per-model wire-format handling (anthropic_messages,
+    max_completion_tokens, fixed/forbidden temperature) applies identically to
+    a reference as it would if that model were the acting model. MoA imposes no
+    cap of its own (``max_tokens`` defaults to ``None`` → omitted → the model's
+    real maximum); ``temperature`` is only the user's configured preset value,
+    which call_llm may still override per model.
+
    Never raises: a failed reference becomes a labelled note so the aggregator
    can still act with partial context. Designed to run inside a thread pool —
    ``call_llm`` is synchronous/blocking, so threads (not asyncio) are the right
@@ -46,13 +139,17 @@ def _run_reference(
    """
    label = _slot_label(slot)
    try:
+        # Prepend the advisory-role system prompt so the reference understands
+        # it is analyzing state for an aggregator, not acting on the task. The
+        # trimmed view (_reference_messages) already strips the agent's own
+        # system prompt, so this is the only system message the reference sees.
+        messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
        response = call_llm(
            task="moa_reference",
-            provider=slot["provider"],
-            model=slot["model"],
-            messages=ref_messages,
+            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
+            **_slot_runtime(slot),
        )
        return label, _extract_text(response) or "(empty response)"
    except Exception as exc:
@@ -64,8 +161,8 @@ def _run_references_parallel(
    reference_models: list[dict[str, str]],
    ref_messages: list[dict[str, Any]],
    *,
-    temperature: float,
-    max_tokens: int,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
 ) -> list[tuple[str, str]]:
    """Fan out all reference models in parallel, returning outputs in order.

@@ -106,40 +203,140 @@ def _run_references_parallel(
    return [r for r in results if r is not None]


-def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    """Build an advisory-safe view of the conversation for reference models.
+def _truncate_tool_result(text: str, budget: int = _REFERENCE_TOOL_RESULT_BUDGET) -> str:
+    """Head+tail preview of a tool result for the advisory view.

-    Reference calls are advisory: they never call tools and never emit the
-    ``tool_calls`` the main model did. Replaying the full transcript verbatim
-    (a) re-bills the ~8K-token Hermes system prompt per reference per
-    iteration and (b) risks 400s from strict providers (Mistral, Fireworks)
-    that reject orphan ``tool`` messages or ``tool_calls`` the reference never
-    produced. We keep only the user/assistant *text* turns, dropping the
-    system prompt, any ``tool``-role messages, and any ``tool_calls`` payloads.
+    Keeps the first and last ``budget // 2`` characters with a ``[... N chars
+    omitted ...]`` marker between them, so a reference sees how the result
+    started and ended. A budget below 2 leaves only the marker.
    """
-    trimmed: list[dict[str, Any]] = []
+    if not text or len(text) <= budget:
+        return text
+    half = max(budget, 0) // 2  # clamp: a negative half would make the slices overlap
+    omitted = len(text) - 2 * half
+    return f"{text[:half]}\n[... {omitted} chars omitted ...]\n{text[len(text) - half:]}"
+
+
+def _render_tool_calls(tool_calls: Any) -> str:
+    """Render an assistant turn's tool_calls as readable text lines.
+
+    The advisory view cannot carry real ``tool_calls`` payloads (strict
+    providers reject tool_calls the reference never produced), so the agent's
+    actions are flattened to text the reference can read and reason about.
+    """
+    import json  # imported once per render instead of once per tool call
+    lines: list[str] = []
+    for tc in tool_calls or []:
+        call = tc if isinstance(tc, dict) else {}  # non-dict entries render as a bare "tool"
+        fn = call.get("function") if isinstance(call.get("function"), dict) else {}
+        name = fn.get("name") or call.get("name") or "tool"
+        args = fn.get("arguments")
+        if isinstance(args, str):
+            args_text = args
+        elif args is None:
+            args_text = ""
+        else:
+            try:
+                args_text = json.dumps(args, ensure_ascii=False)
+            except Exception:
+                args_text = str(args)
+        lines.append(f"[called tool: {name}({args_text})]" if args_text else f"[called tool: {name}]")
+    return "\n".join(lines)
+
+
+def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Build an advisory view of the conversation for reference models.
+
+    A reference gives an INFORMED judgement on the current state, so it must
+    see what the agent actually did — its tool calls AND the tool results that
+    came back — not just the agent's narration. We therefore preserve the whole
+    conversation flow, but flatten it into clean user/assistant *text* turns:
+
+      - system prompt: dropped (8K of Hermes boilerplate, not advisory signal).
+      - assistant turns: kept; any ``tool_calls`` are rendered inline as
+        ``[called tool: name(args)]`` text lines appended to the turn's text.
+      - ``tool``-role results: NOT dropped. Each is folded (head+tail preview,
+        see ``_truncate_tool_result``) into the *preceding* assistant turn as a
+        ``[tool result: ...]`` block, so the reference sees what came back.
+
+    This emits ZERO ``tool``-role messages and ZERO ``tool_calls`` arrays — only
+    plain user/assistant text — so strict providers (Mistral, Fireworks) that
+    reject orphan tool messages / unproduced tool_calls don't 400, while the
+    reference still has the full picture.
+
+    The view MUST end with a ``user`` turn. Anthropic (and OpenRouter→Anthropic)
+    interpret a trailing assistant turn as an assistant *prefill* to continue,
+    and no-prefill models (e.g. Claude Opus 4.8) reject it with
+    ``400 ... must end with a user message``. Rather than DELETE the agent's
+    latest context to satisfy that (which would blind the reference to the
+    current state), we APPEND a synthetic user turn asking the reference to
+    judge the state above. End-on-user is satisfied and no context is lost.
+
+    The acting aggregator always receives the full, untrimmed transcript; this
+    function only shapes the disposable advisory copy.
+    """
+    advisory_instruction = (
+        "[The conversation above is the current state of the task. Give your "
+        "most intelligent judgement: what is going on, what should happen next, "
+        "what risks or mistakes you see, and how the acting agent should "
+        "proceed.]"
+    )
+
+    rendered: list[dict[str, Any]] = []
+    last_user_content: str | None = None
    for msg in messages:
        role = msg.get("role")
-        if role not in ("user", "assistant"):
-            # Drop system prompt and tool-result messages.
-            continue
        content = msg.get("content")
-        if not isinstance(content, str):
-            # Skip non-text (multimodal/tool-call-only) assistant turns.
-            if not content:
-                continue
        text = content if isinstance(content, str) else ""
-        if role == "assistant" and not text.strip():
-            # Assistant turn that was purely tool calls — nothing advisory.
+
+        if role == "system":
            continue
-        trimmed.append({"role": role, "content": text})
-    if not trimmed:
-        # Degenerate case (e.g. first turn was stripped): fall back to a
-        # minimal user turn so the reference still has something to answer.
+        if role == "user":
+            if text.strip():
+                last_user_content = text
+            rendered.append({"role": "user", "content": text})
+        elif role == "assistant":
+            parts: list[str] = []
+            if text.strip():
+                parts.append(text.strip())
+            calls_text = _render_tool_calls(msg.get("tool_calls"))
+            if calls_text:
+                parts.append(calls_text)
+            # Empty assistant turns (no text, no calls) carry nothing advisory.
+            if parts:
+                rendered.append({"role": "assistant", "content": "\n".join(parts)})
+        elif role == "tool":
+            # Fold the tool result into the preceding assistant turn as text so
+            # the reference sees what came back, without emitting a tool-role
+            # message a reference never produced.
+            result_text = _truncate_tool_result(text)
+            block = f"[tool result: {result_text}]"
+            if rendered and rendered[-1].get("role") == "assistant":
+                rendered[-1]["content"] = rendered[-1]["content"] + "\n" + block
+            else:
+                # No assistant turn to attach to (e.g. a leading tool result);
+                # keep it as advisory context on its own assistant-role line.
+                rendered.append({"role": "assistant", "content": block})
+        # Any other role is ignored.
+
+    # End on a user turn: append a synthetic advisory request rather than
+    # deleting the agent's latest assistant context. This satisfies Anthropic's
+    # no-trailing-assistant-prefill rule while preserving full state.
+    if rendered and rendered[-1].get("role") == "assistant":
+        rendered.append({"role": "user", "content": advisory_instruction})
+    elif rendered and rendered[-1].get("role") == "user":
+        # Already ends on a user turn (fresh user prompt, no agent action yet).
+        # Leave it — the reference answers that prompt directly.
+        pass
+
+    if not rendered:
+        # Degenerate case: nothing rendered. Fall back to the latest user turn.
+        if last_user_content is not None:
+            return [{"role": "user", "content": last_user_content}]
        for msg in reversed(messages):
            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                return [{"role": "user", "content": msg["content"]}]
-    return trimmed
+    return rendered



@@ -169,12 +366,18 @@ def aggregate_moa_context(
    aggregator: dict[str, str],
    temperature: float = 0.6,
    aggregator_temperature: float = 0.4,
-    max_tokens: int = 4096,
+    max_tokens: int | None = None,
 ) -> str:
    """Run configured reference models and synthesize their advice.

    Failures are returned as model-specific notes instead of aborting the normal
    agent loop; the main model can still act with partial context.
+
+    ``max_tokens`` is ``None`` by default: MoA does not cap reference or
+    aggregator output, so each model uses its own maximum. ``call_llm`` omits
+    the parameter entirely when it is ``None`` (see its docstring), which also
+    sidesteps providers that reject ``max_tokens`` outright. A hardcoded cap
+    here previously truncated long aggregator syntheses.
    """
    reference_outputs: list[tuple[str, str]] = []
    ref_messages = _reference_messages(api_messages)
@@ -203,11 +406,10 @@ def aggregate_moa_context(
    try:
        response = call_llm(
            task="moa_aggregator",
-            provider=aggregator["provider"],
-            model=aggregator["model"],
            messages=[{"role": "user", "content": synth_prompt}],
            temperature=aggregator_temperature,
            max_tokens=max_tokens,
+            **_slot_runtime(aggregator),
        )
        synthesis = _extract_text(response)
    except Exception as exc:
@@ -230,8 +432,38 @@ def aggregate_moa_context(
 class MoAChatCompletions:
    """OpenAI-chat-compatible facade where the aggregator is the acting model."""

-    def __init__(self, preset_name: str):
+    def __init__(self, preset_name: str, reference_callback: Any = None):
        self.preset_name = preset_name or "default"
+        # Optional display hook. Called as reference outputs become available so
+        # frontends can show each reference model's answer as a labelled block
+        # before the aggregator acts. Signature:
+        #   reference_callback(event, **kwargs)
+        # where event is one of:
+        #   "moa.reference"   kwargs: index, count, label, text
+        #   "moa.aggregating" kwargs: aggregator (label), ref_count
+        # Never raises into the model call — display is best-effort.
+        self.reference_callback = reference_callback
+        # State-scoped reference cache. The agent loop calls create() once per
+        # tool-loop iteration; references should re-run whenever the task STATE
+        # advances — i.e. on every new user message AND every new tool result —
+        # so each reference judges the latest state. The advisory view
+        # (_reference_messages) now renders tool calls + results as text, so its
+        # signature changes on every new tool response; the cache key is that
+        # signature, so a new tool result is a cache MISS (references re-run)
+        # while a redundant create() call with identical state is a HIT (no
+        # re-run, no re-emit). This gives "fire on every user/tool response"
+        # for free, without re-firing on a pure no-op re-call.
+        self._ref_cache_key: tuple | None = None
+        self._ref_cache_outputs: list[tuple[str, str]] = []
+
+    def _emit(self, event: str, **kwargs: Any) -> None:
+        """Forward a display event to ``reference_callback``; errors are only logged."""
+        callback = self.reference_callback
+        if callback is not None:
+            try:
+                callback(event, **kwargs)
+            except Exception as exc:  # pragma: no cover - display must never break the turn
+                logger.debug("MoA reference_callback failed for %s: %s", event, exc)

    def create(self, **api_kwargs: Any) -> Any:
        from hermes_cli.config import load_config
@@ -241,7 +473,10 @@ class MoAChatCompletions:
        messages = list(api_kwargs.get("messages") or [])
        reference_models = preset.get("reference_models") or []
        aggregator = preset.get("aggregator") or {}
-        max_tokens = int(preset.get("max_tokens", api_kwargs.get("max_tokens") or 4096) or 4096)
+        # MoA does not cap reference or aggregator output: each model uses its
+        # own maximum. Passing max_tokens=None makes call_llm omit the parameter
+        # (it never caps by default), so a long aggregator synthesis is never
+        # truncated and providers that reject max_tokens don't 400.
        temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
        aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)

@@ -253,12 +488,52 @@ class MoAChatCompletions:

        reference_outputs: list[tuple[str, str]] = []
        ref_messages = _reference_messages(messages)
-        reference_outputs = _run_references_parallel(
-            reference_models,
-            ref_messages,
-            temperature=temperature,
-            max_tokens=max_tokens,
-        )
+
+        # State-scoped cache: only run + display references when the advisory
+        # view changed (a new user message or a new tool result). A repeated
+        # create() call with an identical advisory view reuses the cached
+        # outputs and skips both the re-run and the re-emit.
+        _sig = hashlib.sha256(
+            "\u0000".join(
+                f"{m.get('role')}:{m.get('content')}" for m in ref_messages
+            ).encode("utf-8", "replace")
+        ).hexdigest()
+        _cache_key = (self.preset_name, _sig, tuple(_slot_label(s) for s in reference_models))
+        _refs_from_cache = _cache_key == self._ref_cache_key and bool(self._ref_cache_outputs)
+
+        if _refs_from_cache:
+            reference_outputs = list(self._ref_cache_outputs)
+        else:
+            reference_outputs = _run_references_parallel(
+                reference_models,
+                ref_messages,
+                temperature=temperature,
+                max_tokens=None,
+            )
+            self._ref_cache_key = _cache_key
+            self._ref_cache_outputs = list(reference_outputs)
+
+            # Surface each reference model's answer to the display BEFORE the
+            # aggregator acts — once per advisory-view change (only on the call
+            # that actually ran them). The user sees one labelled block per
+            # reference (rendered like a thinking block) so the MoA process is
+            # visible rather than a silent pause. Best-effort: never blocks the
+            # turn.
+            _ref_count = len(reference_outputs)
+            for _idx, (_label, _text) in enumerate(reference_outputs, start=1):
+                self._emit(
+                    "moa.reference",
+                    index=_idx,
+                    count=_ref_count,
+                    label=_label,
+                    text=_text,
+                )
+            if _ref_count:
+                self._emit(
+                    "moa.aggregating",
+                    aggregator=_slot_label(aggregator),
+                    ref_count=_ref_count,
+                )

        agg_messages = [dict(m) for m in messages]
        if reference_outputs:
@@ -286,21 +561,26 @@ class MoAChatCompletions:
            raise RuntimeError("MoA aggregator cannot be another MoA preset")
        agg_kwargs = dict(api_kwargs)
        agg_kwargs["messages"] = agg_messages
-        agg_kwargs["model"] = aggregator.get("model")
-        agg_kwargs["temperature"] = aggregator_temperature
+        # The aggregator is the acting model. Resolve its slot to the provider's
+        # real runtime (base_url/api_key/api_mode) and call it through the same
+        # request-building path any model uses — so per-model wire-format
+        # handling (anthropic_messages, max_completion_tokens, fixed/forbidden
+        # temperature) applies identically to it. MoA imposes no output cap:
+        # max_tokens is passed through from the caller (normally None → omitted
+        # → the model's real maximum). The preset's old hardcoded 4096 default
+        # is gone — it truncated long syntheses.
        return call_llm(
            task="moa_aggregator",
-            provider=aggregator.get("provider"),
-            model=aggregator.get("model"),
            messages=agg_messages,
            temperature=aggregator_temperature,
            max_tokens=agg_kwargs.get("max_tokens"),
            tools=agg_kwargs.get("tools"),
            extra_body=agg_kwargs.get("extra_body"),
+            **_slot_runtime(aggregator),
        )


 class MoAClient:
+    # Thin wrapper exposing ``client.chat.completions`` backed by a
+    # MoAChatCompletions instance for the given preset. ``chat`` is an
+    # instance of an empty, dynamically created type that exists only to
+    # carry the ``completions`` attribute.
+    # NOTE(review): presumably mirrors the OpenAI SDK client shape so callers
+    # can use ``client.chat.completions.create(...)`` — confirm against callers.
-    def __init__(self, preset_name: str):
+    def __init__(self, preset_name: str, reference_callback: Any = None):
         self.chat = type("_MoAChat", (), {})()
-        self.chat.completions = MoAChatCompletions(preset_name)
+        self.chat.completions = MoAChatCompletions(preset_name, reference_callback=reference_callback)
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -478,6 +478,16 @@ def _infer_provider_from_url(base_url: str) -> Optional[str]:
    return None


+def _lmstudio_server_root(base_url: str) -> str:
+    """Return the LM Studio server root for native ``/api/v1`` endpoints.
+
+    The URL is passed through ``_normalize_base_url`` and stripped of trailing
+    slashes, then at most one trailing path suffix is removed. Callers append
+    ``/api/v1/models`` to the result.
+
+    Args:
+        base_url: Configured endpoint URL, with or without an API path suffix.
+
+    Returns:
+        The URL with one trailing ``/api/v1``, ``/api`` or ``/v1`` removed, or
+        the normalized URL unchanged when none of those suffixes is present.
+    """
+    root = _normalize_base_url(base_url).rstrip("/")
+    # Longest suffix first: "/api/v1" also ends with "/v1", so checking "/v1"
+    # first would leave a dangling "/api" behind.
+    for suffix in ("/api/v1", "/api", "/v1"):
+        if root.endswith(suffix):
+            root = root[: -len(suffix)].rstrip("/")
+            # Strip a single suffix only; do not keep peeling path segments.
+            break
+    return root
+
+
 def _is_known_provider_base_url(base_url: str) -> bool:
    return _infer_provider_from_url(base_url) is not None

@@ -549,6 +559,7 @@ def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
    server_url = normalized
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]
+    lmstudio_url = _lmstudio_server_root(base_url)

    headers = _auth_headers(api_key)

@@ -556,7 +567,7 @@ def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
        with httpx.Client(timeout=2.0, headers=headers) as client:
            # LM Studio exposes /api/v1/models — check first (most specific)
            try:
-                r = client.get(f"{server_url}/api/v1/models")
+                r = client.get(f"{lmstudio_url}/api/v1/models")
                if r.status_code == 200:
                    return "lm-studio"
            except Exception:
@@ -774,7 +785,7 @@ def fetch_endpoint_model_metadata(
    if is_local_endpoint(normalized):
        try:
            if detect_local_server_type(normalized, api_key=api_key) == "lm-studio":
-                server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized
+                server_url = _lmstudio_server_root(normalized)
                response = requests.get(
                    server_url.rstrip("/") + "/api/v1/models",
                    headers=headers,
@@ -1188,6 +1199,56 @@ def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Option
    return None


+def query_ollama_supports_vision(model: str, base_url: str, api_key: str = "") -> Optional[bool]:
+    """Return True/False when Ollama ``/api/show`` reports vision support.
+
+    Uses the ``capabilities`` field on Ollama 0.6.0+ and falls back to
+    ``model_info.*.vision.block_count`` on older servers. Returns None when
+    the server is unreachable, not Ollama, or the model is unknown.
+
+    Args:
+        model: Model name; passed through ``_strip_provider_prefix`` before
+            it is sent to the server.
+        base_url: Server base URL; a trailing ``/v1`` is dropped before the
+            ``/api/show`` request is made.
+        api_key: Optional key used to build the request auth headers.
+
+    Returns:
+        True when vision support is reported, False when a non-empty
+        ``capabilities`` list omits it, and None when it cannot be determined.
+    """
+    import httpx
+
+    bare_model = _strip_provider_prefix(model)
+    if not bare_model or not base_url:
+        return None
+
+    # Only probe servers that identify as Ollama; a detection error is treated
+    # the same as "unknown".
+    try:
+        if detect_local_server_type(base_url, api_key=api_key) != "ollama":
+            return None
+    except Exception:
+        return None
+
+    # Drop a trailing "/v1" so the request goes to <root>/api/show.
+    server_url = base_url.rstrip("/")
+    if server_url.endswith("/v1"):
+        server_url = server_url[:-3]
+
+    headers = _auth_headers(api_key)
+
+    # Network errors, non-200 responses and JSON decode failures all map to
+    # None (unknown) rather than raising.
+    try:
+        with httpx.Client(timeout=3.0, headers=headers) as client:
+            resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
+            if resp.status_code != 200:
+                return None
+            data = resp.json()
+    except Exception:
+        return None
+
+    # NOTE(review): the lookups below run outside the try block and assume the
+    # JSON body is an object; a list or null payload would raise
+    # AttributeError here — confirm whether that needs guarding.
+    caps = data.get("capabilities")
+    if isinstance(caps, list):
+        if any(str(cap).lower() == "vision" for cap in caps):
+            return True
+        # A non-empty list without "vision" is a definitive "no". An empty
+        # list is inconclusive and falls through to the model_info check.
+        if caps:
+            return False
+
+    # Fallback: any model_info key containing "vision.block_count" counts as
+    # vision support. Its absence is inconclusive, so the result stays None.
+    model_info = data.get("model_info")
+    if isinstance(model_info, dict):
+        for key in model_info:
+            if "vision.block_count" in str(key).lower():
+                return True
+
+    return None
+
+
 def _query_ollama_api_show(model: str, base_url: str, api_key: str = "") -> Optional[int]:
    """Query an Ollama server's native ``/api/show`` for context length.

@@ -1297,6 +1358,7 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") ->
    server_url = base_url.rstrip("/")
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]
+    lmstudio_url = _lmstudio_server_root(base_url)

    headers = _auth_headers(api_key)

@@ -1340,7 +1402,7 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") ->
            # Use _model_id_matches for fuzzy matching: LM Studio stores models as
            # "publisher/slug" but users configure only "slug" after "local:" prefix.
            if server_type == "lm-studio":
-                resp = client.get(f"{server_url}/api/v1/models")
+                resp = client.get(f"{lmstudio_url}/api/v1/models")
                if resp.status_code == 200:
                    data = resp.json()
                    for m in data.get("models", []):
@@ -1646,6 +1708,34 @@ def get_model_context_length(
    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
        return config_context_length

+    # 0a. MoA virtual provider — ``model`` is a preset name, not a real model,
+    # and ``base_url`` is the local virtual endpoint, so every probe below would
+    # miss and fall through to the 256K default. The aggregator is the acting
+    # model, so resolve the context window from the aggregator slot's real
+    # provider+model instead. References are advisory-only and never bound the
+    # acting context, so they're ignored here.
+    if (provider or "").strip().lower() == "moa":
+        try:
+            from hermes_cli.config import load_config
+            from hermes_cli.moa_config import resolve_moa_preset
+            from hermes_cli.runtime_provider import resolve_runtime_provider
+
+            preset = resolve_moa_preset(load_config().get("moa") or {}, model)
+            agg = preset.get("aggregator") or {}
+            agg_provider = str(agg.get("provider") or "").strip()
+            agg_model = str(agg.get("model") or "").strip()
+            if agg_model and agg_provider and agg_provider.lower() != "moa":
+                rt = resolve_runtime_provider(requested=agg_provider, target_model=agg_model)
+                return get_model_context_length(
+                    agg_model,
+                    base_url=rt.get("base_url", "") or "",
+                    api_key=rt.get("api_key", "") or "",
+                    provider=agg_provider,
+                )
+        except Exception:
+            logger.debug("MoA aggregator context-length resolution failed", exc_info=True)
+        # Fall through to the generic default if aggregator resolution failed.
+
    # 0b. custom_providers per-model override — check before any probe.
    # This closes the gap where /model switch and display paths used to fall
    # back to 128K despite the user having a per-model context_length set.
--- a/agent/process_bootstrap.py
+++ b/agent/process_bootstrap.py
@@ -26,7 +26,7 @@ from __future__ import annotations
 import os
 import sys
 import urllib.request
-from typing import Optional
+from typing import Any, Optional

 from utils import base_url_hostname, normalize_proxy_url

@@ -142,6 +142,46 @@ def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
    return proxy


+def build_keepalive_http_client(
+    base_url: str = "",
+    *,
+    async_mode: bool = False,
+) -> Optional[Any]:
+    """Build an httpx client for OpenAI SDK calls with env-only proxy policy.
+
+    Uses explicit ``HTTPS_PROXY`` / ``NO_PROXY`` env vars via
+    ``_get_proxy_for_base_url``. A custom transport disables httpx's default
+    ``trust_env`` path, so macOS system proxy settings from
+    ``urllib.request.getproxies()`` (which omit the ExceptionsList) are not
+    applied. Mirrors ``AIAgent._build_keepalive_http_client``.
+
+    Args:
+        base_url: Endpoint the client will talk to; used for the Copilot
+            special case and for proxy selection.
+        async_mode: When True, build ``httpx.AsyncClient`` with
+            ``httpx.AsyncHTTPTransport`` instead of the synchronous classes.
+
+    Returns:
+        The configured client, or None when construction fails for any reason
+        (including a failed ``httpx`` import).
+    """
+    try:
+        import httpx
+        import socket
+
+        # GitHub Copilot endpoints get a default client: no custom transport
+        # and no explicit proxy argument.
+        # NOTE(review): the reason for this carve-out is not visible here —
+        # confirm against AIAgent._build_keepalive_http_client.
+        if "api.githubcopilot.com" in str(base_url or "").lower():
+            client_cls = httpx.AsyncClient if async_mode else httpx.Client
+            return client_cls()
+
+        # Always enable TCP keepalive, then tune it with whichever option
+        # names this platform's socket module exposes.
+        sock_opts = [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)]
+        if hasattr(socket, "TCP_KEEPIDLE"):
+            # Idle time 30, probe interval 10, probe count 3.
+            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 30))
+            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 10))
+            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3))
+        elif hasattr(socket, "TCP_KEEPALIVE"):
+            # Only the idle time is set on this path.
+            # NOTE(review): presumably the macOS spelling of the idle option
+            # — confirm.
+            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPALIVE, 30))
+
+        proxy = _get_proxy_for_base_url(base_url)
+        transport_cls = httpx.AsyncHTTPTransport if async_mode else httpx.HTTPTransport
+        client_cls = httpx.AsyncClient if async_mode else httpx.Client
+        return client_cls(
+            transport=transport_cls(socket_options=sock_opts),
+            proxy=proxy,
+        )
+    except Exception:
+        # Best-effort: any failure above yields None instead of raising.
+        return None
+
+
 def _install_safe_stdio() -> None:
    """Wrap stdout/stderr so best-effort console output cannot crash the agent."""
    for stream_name in ("stdout", "stderr"):
@@ -164,4 +204,5 @@ __all__ = [
    "_install_safe_stdio",
    "_get_proxy_from_env",
    "_get_proxy_for_base_url",
+    "build_keepalive_http_client",
 ]
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -88,12 +88,15 @@ def _find_hermes_md(cwd: Path) -> Optional[Path]:
    stop_at = _find_git_root(cwd)
    current = cwd.resolve()

-    for directory in [current, *current.parents]:
+    # When there is no git root, only check cwd itself – walking parents
+    # could pick up a .hermes.md planted in /tmp, /home, etc.
+    search_dirs = [current, *current.parents] if stop_at else [current]
+
+    for directory in search_dirs:
        for name in _HERMES_MD_NAMES:
            candidate = directory / name
            if candidate.is_file():
                return candidate
-        # Stop walking at the git root (or filesystem root).
        if stop_at and directory == stop_at:
            break
    return None
@@ -617,7 +620,12 @@ DEVELOPER_ROLE_MODELS = ("gpt-5", "codex")
 PLATFORM_HINTS = {
    "whatsapp": (
        "You are on a text messaging communication platform, WhatsApp. "
-        "Please do not use markdown as it does not render. "
+        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
+        "`code`, ```code blocks```, [links](url)) is auto-converted to "
+        "WhatsApp's native syntax (*bold*, _italic_, ~strike~, monospace) — "
+        "feel free to write in markdown, and use bullet lists ('- item') "
+        "freely. Tables are NOT supported — prefer bullet lists or labeled "
+        "key:value pairs. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. The file "
        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
@@ -682,7 +690,11 @@ PLATFORM_HINTS = {
    ),
    "signal": (
        "You are on a text messaging communication platform, Signal. "
-        "Please do not use markdown as it does not render. "
+        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
+        "`code`, ```code blocks```) is auto-converted to Signal's native "
+        "rich formatting — feel free to write in markdown, and use bullet "
+        "lists ('- item') freely (they render as • bullets). Tables are NOT "
+        "supported — prefer bullet lists or labeled key:value pairs. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio as attachments, and other "
@@ -917,8 +929,7 @@ def _probe_remote_backend(env_type: str) -> str | None:
    try:
        # Import locally: tools/ imports are heavy and only relevant when a
        # non-local backend is actually configured.
-        from tools.terminal_tool import _get_env_config  # type: ignore
-        from tools.environments import get_environment  # type: ignore
+        from tools.terminal_tool import _create_environment, _get_env_config  # type: ignore
    except Exception as e:
        logger.debug("Backend probe unavailable (import failed): %s", e)
        _BACKEND_PROBE_CACHE[cache_key] = ""
@@ -926,7 +937,59 @@ def _probe_remote_backend(env_type: str) -> str | None:

    try:
        config = _get_env_config()
-        env = get_environment(config)
+        # Build the environment the same way tools/terminal_tool.py does for a
+        # live command: select the backend image, then assemble ssh/container
+        # config from the env-derived dict. (There is no `get_environment`
+        # factory — the real entry point is `_create_environment`.)
+        if env_type == "docker":
+            image = config.get("docker_image", "")
+        elif env_type == "singularity":
+            image = config.get("singularity_image", "")
+        elif env_type == "modal":
+            image = config.get("modal_image", "")
+        elif env_type == "daytona":
+            image = config.get("daytona_image", "")
+        else:
+            image = ""
+
+        ssh_config = None
+        if env_type == "ssh":
+            ssh_config = {
+                "host": config.get("ssh_host", ""),
+                "user": config.get("ssh_user", ""),
+                "port": config.get("ssh_port", 22),
+                "key": config.get("ssh_key", ""),
+                "persistent": config.get("ssh_persistent", False),
+            }
+
+        container_config = None
+        if env_type in {"docker", "singularity", "modal", "daytona"}:
+            container_config = {
+                "container_cpu": config.get("container_cpu", 1),
+                "container_memory": config.get("container_memory", 5120),
+                "container_disk": config.get("container_disk", 51200),
+                "container_persistent": config.get("container_persistent", True),
+                "modal_mode": config.get("modal_mode", "auto"),
+                "docker_volumes": config.get("docker_volumes", []),
+                "docker_mount_cwd_to_workspace": config.get("docker_mount_cwd_to_workspace", False),
+                "docker_forward_env": config.get("docker_forward_env", []),
+                "docker_env": config.get("docker_env", {}),
+                "docker_run_as_host_user": config.get("docker_run_as_host_user", False),
+                "docker_extra_args": config.get("docker_extra_args", []),
+                "docker_persist_across_processes": config.get("docker_persist_across_processes", True),
+                "docker_orphan_reaper": config.get("docker_orphan_reaper", True),
+            }
+
+        env = _create_environment(
+            env_type=env_type,
+            image=image,
+            cwd=config.get("cwd", ""),
+            timeout=config.get("timeout", 180),
+            ssh_config=ssh_config,
+            container_config=container_config,
+            task_id="prompt-backend-probe",
+            host_cwd=config.get("host_cwd"),
+        )
        # Single-line POSIX probe — works on any Unixy backend. Wrapped in
        # `2>/dev/null` so a missing binary doesn't pollute the output.
        probe_cmd = (
--- a/agent/redact.py
+++ b/agent/redact.py
@@ -10,6 +10,7 @@ the first 6 and last 4 characters for debuggability.
 import logging
 import os
 import re
+import shlex

 logger = logging.getLogger(__name__)

@@ -107,12 +108,60 @@ _PREFIX_PATTERNS = [
    r"ntn_[A-Za-z0-9]{10,}",            # Notion internal integration token
 ]

-# ENV assignment patterns: KEY=value where KEY contains a secret-like name
+# ENV assignment patterns: KEY=value where KEY contains a secret-like name.
+# Uppercase keys tolerate spaces around "=" (e.g. ``FOO_SECRET = bar``) because
+# an all-caps key is almost never prose/code.
 _SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
 _ENV_ASSIGN_RE = re.compile(
    rf"([A-Z0-9_]{{0,50}}{_SECRET_ENV_NAMES}[A-Z0-9_]{{0,50}})\s*=\s*(['\"]?)(\S+)\2",
 )

+# Lowercase / dotted / hyphenated config keys from config files
+# (application.properties, .env, YAML-ish dumps): ``spring.datasource.password=secret``,
+# ``app.api.key=xyz``, ``password=secret``. The uppercase _ENV_ASSIGN_RE above
+# never matched these, so config-file passwords leaked verbatim (issue #16413).
+#
+# These run only in a config-file context, NOT in prose, code, or URLs — three
+# carve-outs preserved from the original design (#4367 + the documented
+# web-URL passthrough below):
+#   1. The value is bounded by ``[^\s&]`` (stops at whitespace AND ``&``) so
+#      form-urlencoded bodies are handled pair-by-pair (by _redact_form_body),
+#      not greedily swallowed.
+#   2. _CFG_DOTTED_RE only matches when the key is NAMESPACED (contains a dot),
+#      which is unambiguously a config key — never a prose word.
+#   3. _CFG_ANCHORED_RE matches a bare secret-word key only at line start
+#      (optionally after ``export``), so conversational ``I have password=foo``
+#      mid-sentence is left alone.
+# The colon-form URL guard (skip when ``://`` present) lives at the call site.
+_SECRET_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential|auth)"
+_CFG_VALUE = r"(['\"]?)([^\s&]+?)\2(?=[\s&]|$)"
+# Namespaced (dotted) key: the secret word may sit anywhere in a dotted path.
+_CFG_DOTTED_RE = re.compile(
+    rf"((?:[A-Za-z0-9_\-]+\.)+[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*"
+    rf"|[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*\.[A-Za-z0-9_.\-]+)"
+    rf"={_CFG_VALUE}",
+    re.IGNORECASE,
+)
+# Line-anchored bare key: ``password=…`` / ``export api_key=…`` at start of line.
+_CFG_ANCHORED_RE = re.compile(
+    rf"(^[ \t]*(?:export[ \t]+)?[A-Za-z0-9_\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_\-]*)={_CFG_VALUE}",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Unquoted YAML / colon config (e.g. ``password: secret``,
+# ``spring.datasource.password: hunter2``). The secret keyword must be part of
+# the KEY (anchored to the start of the line/indent), and the value is a single
+# whitespace-free token — so prose like ``note: secret meeting`` (keyword in the
+# value) and ``error: token expired`` are left alone. Bare ``auth`` is excluded
+# from the key set so ``Authorization:`` / ``author:`` don't match (the former
+# is masked by _AUTH_HEADER_RE); ``auth_token``/``auth-token`` still match via
+# the ``token`` keyword. Quoted values defer to _JSON_FIELD_RE via the lookahead.
+_YAML_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential)"
+_YAML_ASSIGN_RE = re.compile(
+    rf"(^[ \t]*[A-Za-z0-9_.\-]*{_YAML_CFG_NAMES}[A-Za-z0-9_.\-]*)(:[ \t]*)(?!['\"])([^\s&]+)",
+    re.IGNORECASE | re.MULTILINE,
+)
+
 # JSON field patterns: "apiKey": "value", "token": "value", etc.
 _JSON_KEY_NAMES = r"(?:api_?[Kk]ey|token|secret|password|access_token|refresh_token|auth_token|bearer|secret_value|raw_secret|secret_input|key_material)"
 _JSON_FIELD_RE = re.compile(
@@ -125,8 +174,15 @@ _JSON_FIELD_RE = re.compile(
 # while the header name and scheme word are preserved for debuggability. The
 # previous rule only matched ``Bearer``, so ``Basic <base64 user:pass>`` and
 # ``token <pat>`` leaked verbatim into logs/transcripts.
+#
+# The credential class excludes quote characters (``"`` / ``'``): a token sitting
+# flush against a closing quote (``"Authorization: Bearer sk-..."``) must not pull
+# that quote into the match, or masking turns value corruption into *syntax*
+# corruption — the closing quote vanishes and the command/string no longer parses
+# (unterminated quote → shell EOF / Python SyntaxError). Real credentials never
+# contain ``"`` or ``'``, so excluding them is safe. See #43083.
 _AUTH_HEADER_RE = re.compile(
-    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?(\S+)",
+    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?([^\s\"']+)",
    re.IGNORECASE,
 )

@@ -154,9 +210,37 @@ _PRIVATE_KEY_RE = re.compile(
 )

 # Database connection strings: protocol://user:PASSWORD@host
-# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password
+# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password.
+# The userinfo and password groups forbid whitespace ([^:\s]+ / [^@\s]+) so the
+# match can never span a line break. A real DSN password never contains
+# whitespace; without this bound the greedy [^@]+ would scan past the end of a
+# code line to the next stray "@" (e.g. a Python decorator), swallowing
+# intervening lines and corrupting tool OUTPUT for any source containing a
+# postgresql:// f-string template. See issue #33801.
 _DB_CONNSTR_RE = re.compile(
-    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:]+:)([^@]+)(@)",
+    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:\s]+:)([^@\s]+)(@)",
+    re.IGNORECASE,
+)
+
+# Bare-token credential in a web/transport URL: ``scheme://TOKEN@host``.
+# This is the ``git remote set-url origin https://PASSWORD@github.com/...``
+# shape from issue #6396 — a single opaque credential in the userinfo position
+# with NO ``user:pass`` colon. It is unambiguously a secret: legitimate
+# round-trip URLs (OAuth callbacks, magic links, pre-signed shares — see the
+# "Web-URL redaction is intentionally OFF" note in redact_sensitive_text) carry
+# their tokens in the QUERY STRING, never in bare userinfo. The colon form
+# ``user:pass@`` is deliberately left to pass through (commit "pass web URLs
+# through unchanged", #34029) and is NOT matched here — the token class forbids
+# ``:``. DB schemes are handled by _DB_CONNSTR_RE above and excluded here.
+#
+# Guards against false positives:
+#   - 8+ char floor skips short usernames (git, admin, root, deploy, ubuntu).
+#   - The token class ``[^\s:@/]`` cannot cross ``/``, so an ``@`` sitting in a
+#     path or query (e.g. ``?q=user@example.com``) is never treated as userinfo.
+_URL_BARE_TOKEN_RE = re.compile(
+    r"((?:https?|wss?|git|ssh|ftp|ftps|sftp)://)"  # scheme
+    r"([^\s:@/]{8,})"                               # bare token (no colon/slash/@), 8+ chars
+    r"(@[^\s]+)",                                   # @host...
    re.IGNORECASE,
 )

@@ -340,7 +424,40 @@ def _redact_form_body(text: str) -> str:
    return _redact_query_string(text.strip())


-def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = False) -> str:
+def _mask_token_nonreusable(token: str) -> str:
+    """Redact a prefix-matched credential to a NON-REUSABLE sentinel.
+
+    Unlike :func:`_mask_token` (which keeps head/tail chars — fine for logs
+    that are never fed back into a config), this emits a marker that:
+
+    * cannot be mistaken for a usable-but-truncated key, so an agent that
+      reads it from a config file and writes it back does NOT corrupt the
+      stored credential into a dead 13-char string (issue #35519); and
+    * still does not leak the secret material (no head/tail chars).
+
+    The vendor prefix label is preserved for debuggability so the agent can
+    still tell *which* credential is present (e.g. a GitHub PAT vs an OpenAI
+    key) without seeing any of its bytes.
+
+    Args:
+        token: The matched credential text.
+
+    Returns:
+        ``«redacted:<prefix>…»`` when the token starts with an entry of
+        ``_PREFIX_SUBSTRINGS``; ``«redacted-secret»`` when it is empty or no
+        entry matches.
+    """
+    if not token:
+        return "«redacted-secret»"
+    # Preserve only the recognizable vendor prefix label (e.g. "ghp_", "sk-"),
+    # never any of the random secret body.
+    label = ""
+    # The first matching entry wins.
+    # NOTE(review): if one entry of _PREFIX_SUBSTRINGS is a prefix of another,
+    # the label depends on that collection's iteration order — confirm.
+    for sub in _PREFIX_SUBSTRINGS:
+        if token.startswith(sub):
+            label = sub
+            break
+    return f"«redacted:{label}…»" if label else "«redacted-secret»"
+
+
+def redact_sensitive_text(
+    text: str,
+    *,
+    force: bool = False,
+    code_file: bool = False,
+    file_read: bool = False,
+) -> str:
    """Apply all redaction patterns to a block of text.

    Safe to call on any string -- non-matching text passes through unchanged.
@@ -353,6 +470,17 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    constants, "apiKey": "test" fixtures). Prefix patterns, auth headers,
    private keys, DB connstrings, JWTs, and URL secrets are still redacted.

+    Set file_read=True for file *content* returned to the agent (read_file /
+    search_files / cat). Secrets are STILL redacted — they are never exposed —
+    but prefix-matched credentials are replaced with a non-reusable sentinel
+    (``«redacted:ghp_…»``) instead of a head/tail-preserving mask
+    (``ghp_S1...Pn2T``). The old mask looked like a real-but-truncated key, so
+    an agent reading it from config.yaml and writing it back silently corrupted
+    the stored credential into a dead 13-char value → 401 (issue #35519). The
+    sentinel is syntactically invalid as a token, so it can't be mistaken for a
+    usable key or written back as one. Implies code_file=True (config/data
+    files shouldn't trigger the source-code ENV/JSON false-positive paths).
+
    Performance: each regex pattern is gated behind a cheap substring
    pre-check (e.g. ``"=" in text`` for ENV assignments, ``"://" in text``
    for URLs, ``"eyJ" in text`` for JWTs). On a typical hermes log line
@@ -371,9 +499,15 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    if not (force or _REDACT_ENABLED):
        return text

+    # file_read content shouldn't hit the source-code ENV/JSON false-positive
+    # paths either (it's config/data, not log lines).
+    if file_read:
+        code_file = True
+
    # Known prefixes (sk-, ghp_, etc.) — gate on substring presence
    if _has_known_prefix_substring(text):
-        text = _PREFIX_RE.sub(lambda m: _mask_token(m.group(1)), text)
+        _prefix_sub = _mask_token_nonreusable if file_read else _mask_token
+        text = _PREFIX_RE.sub(lambda m: _prefix_sub(m.group(1)), text)

    # ENV assignments: OPENAI_API_KEY=***  (skip for code files — false positives)
    if not code_file:
@@ -382,6 +516,13 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
                name, quote, value = m.group(1), m.group(2), m.group(3)
                return f"{name}={quote}{_mask_token(value)}{quote}"
            text = _ENV_ASSIGN_RE.sub(_redact_env, text)
+            # Lowercase/dotted config keys (issue #16413). Skip URLs entirely —
+            # web-URL query params are intentionally passed through (see note
+            # near the bottom of this function); _DB_CONNSTR_RE still guards
+            # connection-string passwords.
+            if "://" not in text:
+                text = _CFG_DOTTED_RE.sub(_redact_env, text)
+                text = _CFG_ANCHORED_RE.sub(_redact_env, text)

        # JSON fields: "apiKey": "***"  (skip for code files — false positives)
        if ":" in text and '"' in text:
@@ -390,6 +531,15 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
                return f'{key}: "{_mask_token(value)}"'
            text = _JSON_FIELD_RE.sub(_redact_json, text)

+        # Unquoted YAML / colon config: password: ***  (after JSON so quoted
+        # values are handled there; the lookahead in _YAML_ASSIGN_RE skips
+        # quotes). Skip URLs — web-URL query params pass through by design.
+        if ":" in text and "://" not in text:
+            def _redact_yaml(m):
+                key, sep, value = m.group(1), m.group(2), m.group(3)
+                return f"{key}{sep}{_mask_token(value)}"
+            text = _YAML_ASSIGN_RE.sub(_redact_yaml, text)
+
    # Authorization headers — _AUTH_HEADER_RE matches any scheme after
    # "[Proxy-]Authorization:" case-insensitively, so "uthorization" is the
    # cheapest substring gate that covers every casing without a casefold().
@@ -419,9 +569,32 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    if "BEGIN" in text and "-----" in text:
        text = _PRIVATE_KEY_RE.sub("[REDACTED PRIVATE KEY]", text)

-    # Database connection string passwords
+    # Database connection string passwords. With code_file=True, a password
+    # group that is a pure ``{...}`` brace expression is an f-string template
+    # reference (e.g. f"postgresql://{user}:{pass}@{host}"), not a literal
+    # credential — preserve it. Literal passwords are still redacted. The regex
+    # forbids whitespace in the password group, so a single-line template's
+    # group(2) is exactly the brace expression. See issue #33801.
    if "://" in text:
-        text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
+        if code_file:
+            def _redact_db(m):
+                pw = m.group(2)
+                if pw.startswith("{") and pw.endswith("}"):
+                    return m.group(0)
+                return f"{m.group(1)}***{m.group(3)}"
+            text = _DB_CONNSTR_RE.sub(_redact_db, text)
+        else:
+            text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
+
+        # Bare-token userinfo in web/transport URLs: ``scheme://TOKEN@host``.
+        # The git-remote-with-embedded-password shape from #6396. Only the
+        # colon-less bare-token form is redacted — ``user:pass@`` and
+        # query-string tokens are left to pass through (see the web-URL note
+        # below). See _URL_BARE_TOKEN_RE for the false-positive guards.
+        text = _URL_BARE_TOKEN_RE.sub(
+            lambda m: f"{m.group(1)}{_mask_token(m.group(2))}{m.group(3)}",
+            text,
+        )

    # JWT tokens (eyJ... — base64-encoded JSON headers)
    if "eyJ" in text:
@@ -434,7 +607,12 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    # blanket-redacting param values by name breaks those skills mid-flow.
    # Known credential shapes (sk-, ghp_, JWTs, etc.) inside URLs are still
    # caught by _PREFIX_RE and _JWT_RE above. DB connection-string passwords
-    # are still caught by _DB_CONNSTR_RE.
+    # are still caught by _DB_CONNSTR_RE. The ONE userinfo case still redacted
+    # is the colon-less bare-token form ``scheme://TOKEN@host`` (#6396, handled
+    # by _URL_BARE_TOKEN_RE in the ``://`` block above): a bare credential in
+    # userinfo is never a round-trip workflow token (those live in the query
+    # string), so masking it can't break a skill. The ``user:pass@`` form is
+    # left to pass through per #34029.

    # Form-urlencoded bodies (only triggers on clean k=v&k=v inputs).
    if "&" in text and "=" in text:
@@ -452,6 +630,66 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    return text


+# Commands whose stdout is an environment-variable dump (KEY=value lines),
+# NOT source code. For these, terminal-output redaction must run the
+# ENV-assignment pass (code_file=False) so opaque tokens with no recognized
+# vendor prefix (e.g. ``MY_SERVICE_TOKEN=abc123randomstring``) are still
+# masked. For all other commands, code_file=True is used to avoid mangling
+# legitimate source/config dumps (``MAX_TOKENS=100``, ``"apiKey": "x"``
+# fixtures, ``postgresql://{user}`` f-string templates). See issue #43025.
+_ENV_DUMP_COMMANDS = frozenset({"env", "printenv", "set", "export", "declare"})
+
+
+def is_env_dump_command(command: str | None) -> bool:
+    """Return True if ``command`` dumps environment variables to stdout.
+
+    Detects ``env`` / ``printenv`` / ``set`` / ``export`` / ``declare`` as the
+    first token of any segment in a pipeline or sequence (``;`` / ``&&`` /
+    ``||`` / ``|``). A segment ``shlex`` cannot parse is split on whitespace
+    instead. Anything unrecognized returns False (callers then fall back to
+    the safer code_file=True path, which still masks prefix-shaped keys).
+    """
+    if not command or not isinstance(command, str):
+        return False
+    # Split on shell separators, then inspect the first token of each segment.
+    segments = re.split(r"[|;&]+", command)
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+        try:
+            tokens = shlex.split(seg)
+        except ValueError:
+            tokens = seg.split()
+        if tokens and tokens[0] in _ENV_DUMP_COMMANDS:
+            return True
+    return False
+
+
+def redact_terminal_output(
+    output: str, command: str | None = None, *, force: bool = False
+) -> str:
+    """Redact secrets from terminal/process stdout.
+
+    Single redaction policy for ALL terminal-output surfaces — foreground
+    ``terminal`` results AND background ``process(action=poll/log/wait)``
+    output — so they can't diverge. Picks ``code_file`` based on whether
+    ``command`` is an environment dump:
+
+    - env-dump command (``env``/``printenv``/``set``/``export``/``declare``)
+      → ``code_file=False`` so the ENV-assignment pass masks opaque tokens.
+    - anything else (or unknown command) → ``code_file=True`` to avoid
+      false positives on source/config dumps.
+
+    ``force=True`` bypasses the global ``security.redact_secrets`` preference
+    for safety boundaries that must never emit raw credentials.
+    """
+    if not output:
+        return output
+    code_file = not is_env_dump_command(command or "")
+    return redact_sensitive_text(output, force=force, code_file=code_file)
+
+
 # Substrings used to gate ``_PREFIX_RE`` execution. If none of these appear in
 # the input string, the prefix regex cannot match anything, so we skip it.
 # False positives are fine (they just run the regex, which then matches
--- a/agent/replay_cleanup.py
+++ b/agent/replay_cleanup.py
@@ -0,0 +1,140 @@
+"""Replay-history sanitization shared across resume code paths.
+
+When a session's last turn dies mid-tool-loop — the process is killed by a
+restart/shutdown command, a stale-timeout fires, or an interrupt lands before
+the tool result is written — the persisted transcript can end with a dangling
+``assistant(tool_calls)`` (no matching ``tool`` answer) or an interrupted
+``assistant→tool`` block.  On resume the model sees that broken tail and
+re-issues the unanswered call, producing an endless "thinking"/reboot loop
+(#49201, #29086).
+
+These pure helpers strip those tails before the history is replayed to the
+model.  They were originally local to ``gateway/run.py`` (which fixed the
+messaging-gateway path) and are extracted here so every resume surface — the
+messaging gateway AND the TUI/WebUI gateway — shares the same cleanup instead
+of the WebUI path silently skipping it.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, List
+
+logger = logging.getLogger(__name__)
+
+
+def is_interrupted_tool_result(content: Any) -> bool:
+    """Return True if a tool result indicates the tool was interrupted."""
+    if not isinstance(content, str):
+        return False
+    lowered = content.lower()
+    if "[command interrupted]" in lowered:
+        return True
+    if "exit_code" in lowered and ("130" in lowered or "-1" in lowered):
+        return "interrupt" in lowered
+    return False
+
+
+def strip_interrupted_tool_tails(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Strip interrupted assistant→tool sequences from replay history.
+
+    Older interrupted gateway turns can be followed by a queued real user
+    message, so the interrupted assistant/tool block is not necessarily the
+    final tail by the time we rebuild replay history.  Remove any contiguous
+    assistant(tool_calls) + tool-result block that contains an interrupted tool
+    result, while preserving successful tool-call sequences intact.
+    """
+    if not agent_history:
+        return agent_history
+
+    cleaned: List[Dict[str, Any]] = []
+    i = 0
+    n = len(agent_history)
+    while i < n:
+        msg = agent_history[i]
+        if msg.get("role") == "assistant" and "tool_calls" in msg:
+            j = i + 1
+            tool_results: List[Dict[str, Any]] = []
+            while j < n and agent_history[j].get("role") == "tool":
+                tool_results.append(agent_history[j])
+                j += 1
+            if tool_results and any(
+                is_interrupted_tool_result(m.get("content", ""))
+                for m in tool_results
+            ):
+                logger.debug(
+                    "Stripping interrupted assistant→tool replay block "
+                    "(indices %d–%d, tool_results=%d)",
+                    i, j - 1, len(tool_results),
+                )
+                i = j
+                continue
+        if msg.get("role") == "tool" and is_interrupted_tool_result(msg.get("content", "")):
+            logger.debug("Stripping orphan interrupted tool result from replay history")
+            i += 1
+            continue
+        cleaned.append(msg)
+        i += 1
+
+    return cleaned
+
+
+def strip_dangling_tool_call_tail(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
+
+    When a tool call itself kills the gateway process (``docker restart``,
+    ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
+    is terminated by SIGKILL *mid-call* — before the tool result is ever
+    written and before the orderly shutdown rewind
+    (``_drop_trailing_empty_response_scaffolding``) can run.  The last thing
+    persisted is the ``assistant`` message that issued the ``tool_calls``,
+    with zero matching ``tool`` rows.
+
+    On resume the model sees an unanswered tool call at the tail and naturally
+    re-issues it — which restarts the gateway again, producing the infinite
+    reboot loop in #49201.  ``strip_interrupted_tool_tails`` does not catch
+    this because there is no tool result to inspect for an interrupt marker.
+
+    This strips that dangling tail at the source so there is nothing for the
+    model to re-execute.  It only acts when the tail is an
+    ``assistant(tool_calls)`` whose calls have NO corresponding ``tool``
+    results — a completed assistant→tool pair (any tool answers present) is
+    left untouched so genuine mid-progress tool loops still resume.
+    """
+    if not agent_history:
+        return agent_history
+
+    last = agent_history[-1]
+    if not (
+        isinstance(last, dict)
+        and last.get("role") == "assistant"
+        and last.get("tool_calls")
+    ):
+        return agent_history
+
+    logger.debug(
+        "Stripping dangling unanswered assistant(tool_calls) tail "
+        "(%d call(s)) — process likely killed mid-tool-call by a "
+        "restart/shutdown command (#49201)",
+        len(last.get("tool_calls") or []),
+    )
+    return agent_history[:-1]
+
+
+def sanitize_replay_history(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Apply both replay-tail strippers in the canonical order.
+
+    Convenience entry point for resume code paths: removes interrupted
+    assistant→tool blocks anywhere in the history, then removes a dangling
+    unanswered ``assistant(tool_calls)`` tail.  A non-empty history always
+    comes back as a new list; an empty one is returned unchanged.
+    """
+    if not agent_history:
+        return agent_history
+    return strip_dangling_tool_call_tail(strip_interrupted_tool_tails(agent_history))
--- a/agent/shell_hooks.py
+++ b/agent/shell_hooks.py
@@ -122,6 +122,8 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple

+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
+
 try:
    import fcntl  # POSIX only; Windows falls back to best-effort without flock.
 except ImportError:  # pragma: no cover
@@ -441,6 +443,7 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
        return result

    t0 = time.monotonic()
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        proc = subprocess.run(
            argv,
@@ -449,6 +452,7 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
            timeout=spec.timeout,
            text=True,
            shell=False,
+            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        result["timed_out"] = True
--- a/agent/skill_preprocessing.py
+++ b/agent/skill_preprocessing.py
@@ -5,6 +5,8 @@ import re
 import subprocess
 from pathlib import Path

+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
+
 logger = logging.getLogger(__name__)

 # Matches ${HERMES_SKILL_DIR} / ${HERMES_SESSION_ID} tokens in SKILL.md.
@@ -66,6 +68,7 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
    Failures return a short ``[inline-shell error: ...]`` marker instead of
    raising, so one bad snippet can't wreck the whole skill message.
    """
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        completed = subprocess.run(
            ["bash", "-c", command],
@@ -75,6 +78,7 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
            timeout=max(1, int(timeout)),
            check=False,
            stdin=subprocess.DEVNULL,
+            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"[inline-shell timeout after {timeout}s: {command}]"
--- a/agent/turn_context.py
+++ b/agent/turn_context.py
@@ -28,6 +28,7 @@ import uuid
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional

+from agent.conversation_compression import conversation_history_after_compression
 from agent.iteration_budget import IterationBudget
 from agent.model_metadata import (
    estimate_messages_tokens_rough,
@@ -400,7 +401,9 @@ def build_turn_context(
                    _orig_len, len(messages), _orig_tokens, _preflight_tokens
                ):
                    break  # Cannot compress further: neither rows nor tokens moved
-                conversation_history = None
+                conversation_history = conversation_history_after_compression(
+                    agent, messages
+                )
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
                agent._last_content_with_tools = None
--- a/agent/turn_finalizer.py
+++ b/agent/turn_finalizer.py
@@ -289,7 +289,14 @@ def finalize_turn(
                    and len(_stripped) <= 24
                    and _stripped[-1:] not in {".", "!", "?", "。", "！", "？", "`", ")"}
                )
-                if _is_empty_terminal or _is_partial_fragment:
+                _is_partial_stream_recovery = (
+                    str(_turn_exit_reason) == "partial_stream_recovery"
+                )
+                if (
+                    _is_empty_terminal
+                    or _is_partial_fragment
+                    or _is_partial_stream_recovery
+                ):
                    _explanation = agent._format_turn_completion_explanation(
                        _turn_exit_reason
                    )
--- a/agent/turn_retry_state.py
+++ b/agent/turn_retry_state.py
@@ -67,6 +67,11 @@ class TurnRetryState:
    # ── Restart signals (read by the outer loop after the attempt) ───────
    restart_with_compressed_messages: bool = False
    restart_with_length_continuation: bool = False
+    # Set when a content-filter stream stall (e.g. MiniMax "new_sensitive")
+    # has been escalated to the fallback chain: the partial-stream content
+    # was rolled back off ``messages`` and the loop should re-issue the API
+    # call against the newly-activated provider (#32421).
+    restart_with_rebuilt_messages: bool = False

    def __iter__(self):
        # Convenience for debugging / tests: iterate (name, value) pairs.
--- a/agent/verification_stop.py
+++ b/agent/verification_stop.py
@@ -15,6 +15,63 @@ from typing import Any, Iterable

 _MAX_CHANGED_PATHS_IN_NUDGE = 8

+# Non-code file extensions whose edits carry no verifiable runtime behavior:
+# documentation, prose, and data/markup that no test/build exercises. When a
+# turn touches ONLY these, verify-on-stop has nothing to check, so the nudge is
+# suppressed (this is fix "C" for the doc/markdown/skill false-positive — a
+# SKILL.md or README edit must never demand a /tmp verification script). A turn
+# that edits any non-listed path (a real source/code/config file) still nudges.
+_NON_CODE_VERIFY_EXTENSIONS = frozenset(
+    {
+        ".md",
+        ".markdown",
+        ".mdx",
+        ".rst",
+        ".txt",
+        ".text",
+        ".adoc",
+        ".asciidoc",
+        ".org",
+        ".log",
+        ".csv",
+        ".tsv",
+    }
+)
+
+# Extension-less filenames (matched case-insensitively) that are pure prose
+# even without a recognized doc extension.
+_NON_CODE_VERIFY_FILENAMES = frozenset(
+    {
+        "license",
+        "licence",
+        "notice",
+        "authors",
+        "contributors",
+        "changelog",
+        "codeowners",
+    }
+)
+
+
+def _is_non_code_path(raw: str) -> bool:
+    """Return True when a changed path is documentation/prose with nothing to verify."""
+    try:
+        p = Path(str(raw))
+    except Exception:
+        return False
+    suffix = p.suffix.lower()
+    if suffix in _NON_CODE_VERIFY_EXTENSIONS:
+        return True
+    if not suffix and p.name.lower() in _NON_CODE_VERIFY_FILENAMES:
+        return True
+    return False
+
+
+def _filter_verifiable_paths(paths: Iterable[str]) -> list[str]:
+    """Drop documentation/prose paths; keep paths that could have verifiable behavior."""
+    return [p for p in paths if p and not _is_non_code_path(p)]
+
+
 # Session identities (platform or source) that are NOT human conversational
 # messaging surfaces: interactive coding surfaces (CLI, TUI, desktop, codex,
 # local, gateway) and programmatic callers (API server, webhooks, tools).
@@ -79,12 +136,13 @@ def verify_on_stop_enabled(config: dict[str, Any] | None = None) -> bool:
    """Return whether edit -> verify-before-finish behavior is enabled.

    Precedence: an explicit ``HERMES_VERIFY_ON_STOP`` env var wins, then an
-    explicit boolean ``agent.verify_on_stop`` config value, then a surface-aware
-    default. The config default is the sentinel ``"auto"`` (see
-    ``DEFAULT_CONFIG``), which resolves to ON for interactive coding surfaces
+    explicit ``agent.verify_on_stop`` config value. The config default is
+    ``False`` (see ``DEFAULT_CONFIG``) — verify-on-stop is OFF unless the user
+    opts in. The legacy ``"auto"`` sentinel is still honored for anyone who
+    sets it explicitly: it resolves to ON for interactive coding surfaces
    (CLI, TUI, desktop) and programmatic callers, and OFF for conversational
-    messaging surfaces (Telegram, Discord, etc.) where the verification
-    narrative would otherwise reach a human as chat noise.
+    messaging surfaces (Telegram, Discord, etc.). A missing/unknown value
+    falls back to OFF.
    """
    env = os.environ.get("HERMES_VERIFY_ON_STOP")
    if env is not None:
@@ -106,8 +164,11 @@ def verify_on_stop_enabled(config: dict[str, Any] | None = None) -> bool:
            return True
        if token in {"0", "false", "no", "off"}:
            return False
-    # "auto", missing, or any other value -> surface-aware default.
-    return not _session_is_messaging_surface()
+        if token == "auto":
+            # Explicit opt-in to the legacy surface-aware behavior.
+            return not _session_is_messaging_surface()
+    # Missing or unknown value -> OFF (the new default).
+    return False


 def _candidate_cwds(paths: Iterable[str]) -> list[Path]:
@@ -190,7 +251,10 @@ def build_verify_on_stop_nudge(
    max_attempts: int = 2,
 ) -> str | None:
    """Return a synthetic follow-up when edited code lacks fresh verification."""
-    paths = sorted({str(p) for p in changed_paths if p})
+    # Drop documentation/prose paths (markdown, skills, README, LICENSE, ...) —
+    # they carry no verifiable behavior, so a turn that touched only those has
+    # nothing to verify and must not nudge.
+    paths = sorted({str(p) for p in _filter_verifiable_paths(changed_paths)})
    if not paths or attempts >= max_attempts:
        return None

--- a/apps/desktop/README.md
+++ b/apps/desktop/README.md
@@ -85,7 +85,7 @@ Installers are built and uploaded to GitHub Releases manually. macOS/Windows sig

 ### How it works

-The packaged app ships the Electron shell and a native React chat surface. On first launch it can install the Hermes Agent runtime into `HERMES_HOME` (`~/.hermes`, or `%LOCALAPPDATA%\hermes` on Windows) — the **same layout a CLI install uses**, so the two are interchangeable. Backend resolution first honours `HERMES_DESKTOP_HERMES_ROOT`, then a completed managed install, then a probed `hermes` on `PATH` (unless `HERMES_DESKTOP_IGNORE_EXISTING=1` is set), and finally an explicit `HERMES_DESKTOP_HERMES` command override for packagers/troubleshooting. The renderer (React, in `src/`) talks to a `hermes dashboard` backend over the `tui_gateway`/dashboard APIs and reuses the agent runtime rather than embedding `hermes --tui`. The install, backend-resolution, and self-update logic all live in `electron/main.cjs`.
+The packaged app ships the Electron shell and a native React chat surface. On first launch it can install the Hermes Agent runtime into `HERMES_HOME` (`~/.hermes`, or `%LOCALAPPDATA%\hermes` on Windows) — the **same layout a CLI install uses**, so the two are interchangeable. Backend resolution first honours `HERMES_DESKTOP_HERMES_ROOT`, then a completed managed install, then a probed `hermes` on `PATH` (unless `HERMES_DESKTOP_IGNORE_EXISTING=1` is set), and finally an explicit `HERMES_DESKTOP_HERMES` command override for packagers/troubleshooting. The renderer (React, in `src/`) talks to a headless backend the app launches for you — a `hermes serve` process that serves the `tui_gateway` JSON-RPC/WebSocket API — through the framework-agnostic client in [`apps/shared`](../shared/) (the same client the web dashboard consumes), and reuses the agent runtime rather than embedding `hermes --tui`. The app is **self-contained**: it runs its own `hermes serve` backend and never opens or requires the web dashboard UI. (For backward compatibility, a runtime that predates the `serve` command automatically falls back to a headless `dashboard --no-open` — see `electron/backend-command.cjs` — so mid-upgrade installs never break.) The install, backend-resolution, and self-update logic all live in `electron/main.cjs`.

 ### Verification

--- a/apps/desktop/electron/backend-command.cjs
+++ b/apps/desktop/electron/backend-command.cjs
@@ -0,0 +1,51 @@
+'use strict'
+
+// Backend subcommand routing for the desktop-managed Hermes process.
+//
+// The desktop app launches its own headless backend via `hermes serve` — it
+// must NEVER depend on or launch the browser `dashboard`. But `serve` is a
+// newer subcommand: a runtime that predates it (an older managed install the
+// app hasn't updated yet, or an older `hermes` resolved from PATH) only knows
+// `dashboard --no-open`. To avoid bricking those users mid-upgrade we detect
+// whether the resolved runtime understands `serve` and, only when it does not,
+// fall back to the legacy `dashboard --no-open` invocation. Both produce the
+// exact same headless gateway; `serve` is just the decoupled name.
+//
+// These helpers are pure so they can be unit-tested without Electron.
+
+/**
+ * Build the canonical headless backend argv (always `serve`).
+ * @param {string} [profile] optional Hermes profile to pin via `--profile`.
+ */
+function serveBackendArgs(profile) {
+  const head = profile ? ['--profile', profile] : []
+  return [...head, 'serve', '--host', '127.0.0.1', '--port', '0']
+}
+
+/**
+ * Rewrite a resolved backend argv from `serve` to the legacy
+ * `dashboard --no-open` form, preserving every other argument (incl. a leading
+ * `-m hermes_cli.main` and any `--profile <name>`). Returns a copy; if there is
+ * no `serve` token the argv is returned unchanged.
+ */
+function dashboardFallbackArgs(args) {
+  const i = args.indexOf('serve')
+  if (i === -1) return args.slice()
+  return [...args.slice(0, i), 'dashboard', '--no-open', ...args.slice(i + 1)]
+}
+
+/**
+ * True when a runtime's `hermes_cli/subcommands/dashboard.py` source registers
+ * the `serve` subcommand. Matches `add_parser("serve"` / `add_parser('serve'`
+ * specifically so the substring "server" (e.g. "start_server", "web server")
+ * never produces a false positive.
+ */
+function sourceDeclaresServe(dashboardPySource) {
+  return /add_parser\(\s*["']serve["']/.test(String(dashboardPySource || ''))
+}
+
+module.exports = {
+  serveBackendArgs,
+  dashboardFallbackArgs,
+  sourceDeclaresServe,
+}
--- a/apps/desktop/electron/backend-command.test.cjs
+++ b/apps/desktop/electron/backend-command.test.cjs
@@ -0,0 +1,83 @@
+'use strict'
+
+const test = require('node:test')
+const assert = require('node:assert/strict')
+
+const {
+  serveBackendArgs,
+  dashboardFallbackArgs,
+  sourceDeclaresServe,
+} = require('./backend-command.cjs')
+
+test('serveBackendArgs builds a headless serve invocation', () => {
+  assert.deepEqual(serveBackendArgs(), [
+    'serve',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('serveBackendArgs pins a profile when provided', () => {
+  assert.deepEqual(serveBackendArgs('worker'), [
+    '--profile',
+    'worker',
+    'serve',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('dashboardFallbackArgs rewrites serve -> dashboard --no-open, keeping the -m prefix', () => {
+  const serve = ['-m', 'hermes_cli.main', 'serve', '--host', '127.0.0.1', '--port', '0']
+  assert.deepEqual(dashboardFallbackArgs(serve), [
+    '-m',
+    'hermes_cli.main',
+    'dashboard',
+    '--no-open',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('dashboardFallbackArgs preserves a --profile flag ahead of serve', () => {
+  const serve = ['-m', 'hermes_cli.main', '--profile', 'worker', 'serve', '--host', '127.0.0.1', '--port', '0']
+  assert.deepEqual(dashboardFallbackArgs(serve), [
+    '-m',
+    'hermes_cli.main',
+    '--profile',
+    'worker',
+    'dashboard',
+    '--no-open',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('dashboardFallbackArgs is a no-op (copy) when there is no serve token', () => {
+  const args = ['-m', 'hermes_cli.main', 'dashboard', '--no-open']
+  const out = dashboardFallbackArgs(args)
+  assert.deepEqual(out, args)
+  assert.notEqual(out, args, 'should return a copy, not the same reference')
+})
+
+test('sourceDeclaresServe detects the serve subparser registration', () => {
+  assert.equal(sourceDeclaresServe('subparsers.add_parser("serve", help="...")'), true)
+  assert.equal(sourceDeclaresServe("subparsers.add_parser('serve')"), true)
+  assert.equal(sourceDeclaresServe('subparsers.add_parser(\n        "serve",\n)'), true)
+})
+
+test('sourceDeclaresServe does not false-positive on the substring "server"', () => {
+  const oldSource = `
+    dashboard_parser = subparsers.add_parser("dashboard", help="Start the web UI dashboard")
+    from hermes_cli.web_server import start_server  # web server
+  `
+  assert.equal(sourceDeclaresServe(oldSource), false)
+})
--- a/apps/desktop/electron/backend-probes.cjs
+++ b/apps/desktop/electron/backend-probes.cjs
@@ -37,7 +37,18 @@ const { execFileSync } = require('node:child_process')
 const PROBE_TIMEOUT_MS = 5000

 /**
- * Return true iff `python -c "import hermes_cli"` exits 0.
+ * Return the Python snippet used to verify Hermes can import far enough to
+ * launch the CLI. Kept exported for tests so dependency regressions are
+ * caught without needing a real broken venv fixture.
+ *
+ * @returns {string}
+ */
+function hermesRuntimeImportProbe() {
+  return 'import yaml; import hermes_cli.config'
+}
+
+/**
+ * Return true iff the Hermes runtime import probe exits 0.
 *
 * Used to gate the "fallback to system Python with hermes_cli installed"
 * rung of resolveHermesBackend. Without this, a system Python 3.11-3.13
@@ -46,13 +57,20 @@ const PROBE_TIMEOUT_MS = 5000
 * site-packages -- and the resolver returns a backend that immediately
 * dies on spawn.
 *
+ * The probe intentionally imports hermes_cli.config, not just the top-level
+ * package: a broken/empty Windows launcher venv can still see the source tree
+ * through PYTHONPATH but lack PyYAML, then die on the first real CLI import.
+ *
 * @param {string} pythonPath - Absolute path to a python.exe / python.
+ * @param {object} [opts]
+ * @param {object} [opts.env] - Additional environment for the probe.
 * @returns {boolean}
 */
-function canImportHermesCli(pythonPath) {
+function canImportHermesCli(pythonPath, opts = {}) {
  if (!pythonPath) return false
  try {
-    execFileSync(pythonPath, ['-c', 'import hermes_cli'], {
+    execFileSync(pythonPath, ['-c', hermesRuntimeImportProbe()], {
+      env: { ...process.env, ...(opts.env || {}) },
      stdio: 'ignore',
      timeout: PROBE_TIMEOUT_MS,
      windowsHide: true
@@ -101,6 +119,7 @@ function verifyHermesCli(hermesCommand, opts = {}) {

 module.exports = {
  canImportHermesCli,
+  hermesRuntimeImportProbe,
  verifyHermesCli,
  PROBE_TIMEOUT_MS
 }
--- a/apps/desktop/electron/backend-probes.test.cjs
+++ b/apps/desktop/electron/backend-probes.test.cjs
@@ -11,7 +11,7 @@ const fs = require('node:fs')
 const os = require('node:os')
 const path = require('node:path')

-const { canImportHermesCli, verifyHermesCli } = require('./backend-probes.cjs')
+const { canImportHermesCli, hermesRuntimeImportProbe, verifyHermesCli } = require('./backend-probes.cjs')

 // Resolve the host's own Node binary -- guaranteed to be on disk and
 // runnable. We use it as both a stand-in for "a python that doesn't
@@ -40,6 +40,12 @@ test('canImportHermesCli returns false when binary does not exist', () => {
  assert.equal(canImportHermesCli(ghost), false)
 })

+test('hermes runtime import probe checks config dependencies', () => {
+  const probe = hermesRuntimeImportProbe()
+  assert.match(probe, /\bimport yaml\b/)
+  assert.match(probe, /\bimport hermes_cli\.config\b/)
+})
+
 test('verifyHermesCli returns false when command is falsy', () => {
  assert.equal(verifyHermesCli(''), false)
  assert.equal(verifyHermesCli(null), false)
--- a/apps/desktop/electron/git-review-ops.cjs
+++ b/apps/desktop/electron/git-review-ops.cjs
@@ -10,7 +10,26 @@ const { execFile } = require('node:child_process')
 const fs = require('node:fs/promises')
 const path = require('node:path')

-const simpleGit = require('simple-git')
+// `simple-git` is a pure-JS runtime dep that workspace dedup hoists into the
+// repo-root node_modules.  Packaged builds set `files:` in package.json, which
+// excludes node_modules from the asar, so the normal require() fails at launch
+// (issue #52735: "Cannot find module 'simple-git'").  We ship the dep's
+// closure under resources/native-deps/vendor/node_modules/ via extraResources
+// + scripts/stage-native-deps.cjs, and resolve from there when the hoisted
+// require() isn't reachable.  The `vendor/` nesting matters: electron-builder
+// drops a node_modules dir at the root of an extraResources copy but keeps a
+// nested one.  Dev mode never hits the fallback -- Node's normal lookup finds
+// the hoisted copy.
+let simpleGit
+try {
+  simpleGit = require('simple-git')
+} catch {
+  const resourcesPath = process.resourcesPath
+  if (!resourcesPath) {
+    throw new Error("git-review IPC: 'simple-git' not found and no resourcesPath to fall back to")
+  }
+  simpleGit = require(path.join(resourcesPath, 'native-deps', 'vendor', 'node_modules', 'simple-git'))
+}

 const { resolveRequestedPathForIpc } = require('./hardening.cjs')

--- a/apps/desktop/electron/main.cjs
+++ b/apps/desktop/electron/main.cjs
@@ -39,6 +39,7 @@ const { createLinkTitleWindow } = require('./link-title-window.cjs')
 const { probeGatewayWebSocket } = require('./gateway-ws-probe.cjs')
 const { adoptServedDashboardToken } = require('./dashboard-token.cjs')
 const { waitForDashboardPortAnnouncement } = require('./backend-ready.cjs')
+const { dashboardFallbackArgs, sourceDeclaresServe } = require('./backend-command.cjs')
 const { serializeJsonBody, setJsonRequestHeaders } = require('./oauth-net-request.cjs')
 const { fetchMarketplaceThemes, searchMarketplaceThemes } = require('./vscode-marketplace.cjs')
 const { buildDesktopBackendEnv, normalizeHermesHomeRoot } = require('./backend-env.cjs')
@@ -534,9 +535,10 @@ function getTitleBarOverlayOptions() {
    return { height: TITLEBAR_HEIGHT }
  }

-  // Windows + WSLg paint WCO natively; plain Linux disables it (frameless hidden
-  // titlebar still applies).
-  if (!IS_WINDOWS && !IS_WSL) {
+  // WSLg paints WCO via the RDP host's own min/max/close, so requesting
+  // an Electron overlay there just leaves a dead gap. Plain Linux (KDE,
+  // GNOME) can use the native overlay — let it through.
+  if (!IS_WINDOWS && IS_WSL) {
    return false
  }

@@ -790,7 +792,7 @@ let rendererReloadTimes = []
 // the renderer's "Reload and retry" path or by quitting the app.
 let bootstrapFailure = null
 // Latched non-bootstrap backend spawn failure — stops getConnection() from
-// respawning hermes dashboard children in a tight loop while boot is broken.
+// respawning hermes serve backend children in a tight loop while boot is broken.
 let backendStartFailure = null
 // Active first-launch install, so the renderer's Cancel button (and app quit)
 // can abort the in-flight install.sh/ps1 instead of leaving it running.
@@ -1284,8 +1286,14 @@ function findOnPath(command) {
  const pathEntries = String(process.env.PATH || '')
    .split(path.delimiter)
    .filter(Boolean)
+  // On Windows, try PATHEXT extensions BEFORE the bare (empty-extension) name.
+  // A real command must resolve via its .exe/.cmd (Windows command-resolution
+  // semantics consult PATHEXT); an extensionless file — e.g. a Git-Bash
+  // shell-script shim named `hermes` — must not shadow `hermes.cmd`/`hermes.exe`.
+  // The empty entry is kept LAST so callers that already include the extension
+  // (py.exe, pwsh.exe, powershell.exe) still resolve.
  const extensions = IS_WINDOWS
-    ? ['', ...(process.env.PATHEXT || '.COM;.EXE;.BAT;.CMD').split(';').filter(Boolean)]
+    ? [...(process.env.PATHEXT || '.COM;.EXE;.BAT;.CMD').split(';').filter(Boolean), '']
    : ['']

  for (const entry of pathEntries) {
@@ -1302,7 +1310,7 @@ function isCommandScript(command) {
  return IS_WINDOWS && /\.(cmd|bat)$/i.test(command || '')
 }

-function unwrapWindowsVenvHermesCommand(command, dashboardArgs) {
+function unwrapWindowsVenvHermesCommand(command, backendArgs) {
  if (!IS_WINDOWS || !command || isCommandScript(command)) return null

  const resolved = path.resolve(String(command))
@@ -1312,14 +1320,14 @@ function unwrapWindowsVenvHermesCommand(command, dashboardArgs) {
  if (path.basename(scriptsDir).toLowerCase() !== 'scripts') return null

  const venvRoot = path.dirname(scriptsDir)
-  const python = getNoConsoleVenvPython(venvRoot)
+  const python = getVenvPython(venvRoot)
  if (!fileExists(python)) return null

  const root = path.dirname(venvRoot)
  return {
-    label: `existing Hermes no-console Python at ${python}`,
+    label: `existing Hermes Python at ${python}`,
    command: python,
-    args: ['-m', 'hermes_cli.main', ...dashboardArgs],
+    args: ['-m', 'hermes_cli.main', ...backendArgs],
    bootstrap: false,
    env: buildDesktopBackendEnv({
      hermesHome: HERMES_HOME,
@@ -1327,11 +1335,72 @@ function unwrapWindowsVenvHermesCommand(command, dashboardArgs) {
      venvRoot
    }),
    kind: 'python',
-    readyFile: true,
+    // Surfaced so backendSupportsServe() can read this runtime's source for the
+    // `serve` capability check instead of falling back to a heavyweight probe.
+    root,
    shell: false
  }
 }

+// Does the resolved runtime understand the `serve` subcommand? The desktop
+// spawns `hermes serve`; runtimes older than serve only have `dashboard`. We
+// detect support so getBackendArgsForRuntime() can route old runtimes through
+// the legacy `dashboard --no-open` form instead of crashing on an unknown
+// subcommand (would brick every user mid-upgrade — #54568 follow-up).
+//
+// Fast path: read the runtime's own dashboard.py (instant, covers managed
+// installs, dev checkouts, and the Windows venv). Fallback: probe the CLI once
+// (covers a bare `hermes` resolved from PATH with no known source root). Result
+// is cached per resolved runtime so we probe at most once per backend.
+const _serveSupportCache = new Map()
+function backendSupportsServe(backend) {
+  if (!backend || !backend.command) return true
+  const key = `${backend.command}::${backend.root || ''}`
+  if (_serveSupportCache.has(key)) return _serveSupportCache.get(key)
+
+  let supported = null
+  if (backend.root) {
+    try {
+      const dashboardSource = path.join(backend.root, 'hermes_cli', 'subcommands', 'dashboard.py')
+      const src = fs.readFileSync(dashboardSource, 'utf8')
+      supported = sourceDeclaresServe(src)
+    } catch {
+      supported = null // source unreadable — fall through to the probe
+    }
+  }
+
+  if (supported === null) {
+    try {
+      const prefix = backend.args && backend.args[0] === '-m' ? backend.args.slice(0, 2) : []
+      execFileSync(backend.command, [...prefix, 'serve', '--help'], {
+        cwd: backend.root || undefined,
+        env: { ...process.env, HERMES_HOME, ...(backend.env || {}) },
+        timeout: 15000,
+        stdio: 'ignore',
+        // Honour backend.shell: Windows .cmd/.bat shims cannot be exec'd without a shell.
+        shell: Boolean(backend.shell),
+        windowsHide: true
+      })
+      supported = true
+    } catch {
+      supported = false
+    }
+  }
+
+  _serveSupportCache.set(key, supported)
+  rememberLog(
+    `[backend] \`serve\` ${supported ? 'supported' : 'unsupported → routing via legacy `dashboard`'} for ${backend.label || key}`
+  )
+  return supported
+}
+
+// Given a resolved backend whose args target `serve`, return the args the
+// runtime actually understands: unchanged when `serve` is supported, or
+// rewritten to `dashboard --no-open` for older runtimes.
+function getBackendArgsForRuntime(backend) {
+  return backendSupportsServe(backend) ? backend.args : dashboardFallbackArgs(backend.args)
+}
+
 function normalizeExecutablePathForCompare(commandPath) {
  if (!commandPath) return null

@@ -1552,64 +1621,26 @@ function getVenvPython(venvRoot) {
  return path.join(venvRoot, IS_WINDOWS ? path.join('Scripts', 'python.exe') : path.join('bin', 'python'))
 }

-function readVenvHome(venvRoot) {
-  try {
-    const cfg = fs.readFileSync(path.join(venvRoot, 'pyvenv.cfg'), 'utf8')
-    const match = cfg.match(/^home\s*=\s*(.+?)\s*$/im)
-    return match ? match[1].trim() : null
-  } catch {
-    return null
-  }
-}
-
-function getNoConsoleVenvPython(venvRoot) {
-  if (!IS_WINDOWS) return getVenvPython(venvRoot)
-
-  // Prefer the venv's own pythonw shim — it carries pyvenv.cfg / site-packages
-  // wiring. Falling back to the base uv/python.org pythonw.exe skips the venv
-  // and breaks imports (yaml, hermes_cli, …) even when PYTHONPATH is patched.
-  const venvPythonw = path.join(venvRoot, 'Scripts', 'pythonw.exe')
-  if (fileExists(venvPythonw)) return venvPythonw
-
-  const baseHome = readVenvHome(venvRoot)
-  if (baseHome) {
-    const basePythonw = path.join(baseHome, 'pythonw.exe')
-    if (fileExists(basePythonw)) return basePythonw
-  }
-
-  return venvPythonw
-}
-
-function toNoConsolePython(pythonPath) {
-  if (!IS_WINDOWS || !pythonPath) return pythonPath
-
-  const resolved = String(pythonPath)
-  if (/pythonw\.exe$/i.test(resolved)) return resolved
-
-  if (/python\.exe$/i.test(resolved)) {
-    const pythonw = path.join(path.dirname(resolved), 'pythonw.exe')
-    if (fileExists(pythonw)) return pythonw
-  }
-
-  return pythonPath
-}
-
-function applyWindowsNoConsoleSpawnHints(backend) {
-  if (!IS_WINDOWS || !backend?.command) return backend
-
-  const usesHermesModule =
-    backend.kind === 'python' ||
-    (Array.isArray(backend.args) && backend.args[0] === '-m' && backend.args[1] === 'hermes_cli.main')
-
-  if (!usesHermesModule) return backend
-
-  backend.command = toNoConsolePython(backend.command)
-  if (/pythonw\.exe$/i.test(path.basename(String(backend.command || '')))) {
-    backend.readyFile = true
-  }
-
-  return backend
-}
+// Windows console-window flashes are governed by the *parent's* console, not by
+// each child spawn. A GUI-subsystem parent (pythonw.exe) has no console, so every
+// console-subsystem child it spawns (git, gh, cmd, ...) must allocate its own —
+// which flashes a window. A console-subsystem parent (python.exe) instead owns a
+// single console that all of its children inherit, so none of them flash.
+//
+// Note this change adds no new creationflag: the backend spawn is ALREADY wrapped
+// in hiddenWindowsChildOptions() (windowsHide: true), but that setting is INERT
+// against pythonw.exe — a GUI-subsystem process has no console for it to act on.
+// Switching the backend to the venv's console python.exe is what makes the
+// existing wrapper load-bearing: with windowsHide the process comes up owning a
+// *windowless* console (verified at runtime — it has an attachable console whose
+// window handle is NULL), and its children inherit that one windowless console
+// instead of each allocating a visible one.
+//
+// This makes "no flashing windows" a property of the one backend launch rather
+// than a flag that has to be remembered at every descendant spawn site. Restoring
+// console python also restores stdout, so the backend announces its port on the
+// normal HERMES_DASHBOARD_READY stdout line and no ready-file side channel is
+// needed.

 function getVenvSitePackagesEntries(venvRoot) {
  const entries = []
@@ -1964,6 +1995,16 @@ async function readCommitLog(cwd, branch) {

 let updateInFlight = false

+// Set to true when the desktop is about to quit so a detached swap/install/
+// uninstall script can take over. On macOS, app.quit() closes windows but
+// window-all-closed deliberately keeps the process alive (standard Electron
+// macOS convention). Without this flag the process never exits — the detached
+// hand-off script spins its PID-wait for the full timeout, and the user sees a
+// blank app with no window (and an uninstall that appears to do nothing). When
+// set, window-all-closed calls app.quit() on every platform so the process
+// actually dies and the hand-off script can proceed immediately.
+let isQuittingForHandoff = false
+
 // Resolve the staged updater binary. The Tauri installer copies itself to
 // HERMES_HOME/hermes-setup.exe on a successful install (see
 // apps/bootstrap-installer paths::copy_self_to_hermes_home). That binary owns
@@ -2219,6 +2260,7 @@ async function applyUpdates(opts = {}) {
    // appears), THEN quit to release the venv shim. The updater rebuilds and
    // relaunches us when it's done. (#50419 — a 600ms quit looked like a crash
    // and lured users into the #50238 relaunch loop.)
+    isQuittingForHandoff = true
    setTimeout(() => {
      app.quit()
    }, UPDATE_HANDOFF_DWELL_MS)
@@ -2242,7 +2284,18 @@ async function handOffWindowsBootstrapRecovery(reason) {
    : configuredBranch || DEFAULT_UPDATE_BRANCH
  const venvBin = path.join(updateRoot, 'venv', IS_WINDOWS ? 'Scripts' : 'bin')
  const venvHermes = path.join(venvBin, IS_WINDOWS ? 'hermes.exe' : 'hermes')
-  const updaterArgs = fileExists(venvHermes) ? ['--update', '--branch', branch] : ['--repair', '--branch', branch]
+  const venvPython = path.join(venvBin, IS_WINDOWS ? 'python.exe' : 'python')
+  // Choose the gentle in-place --update when ANY real-install signal is present,
+  // not just the `hermes.exe` console-script shim. That shim is generated at the
+  // END of venv setup and is absent in exactly the interrupted/quarantined states
+  // this recovery exists to heal — gating on it alone forced the destructive
+  // --repair (full venv recreate) and drove reinstall loops. The venv interpreter
+  // and the bootstrap-complete marker are present earlier and are better signals.
+  const haveRealInstall =
+    fileExists(venvPython) ||
+    fileExists(venvHermes) ||
+    fileExists(path.join(updateRoot, '.hermes-bootstrap-complete'))
+  const updaterArgs = haveRealInstall ? ['--update', '--branch', branch] : ['--repair', '--branch', branch]

  await releaseBackendLockForUpdate(updateRoot)

@@ -2265,6 +2318,7 @@ async function handOffWindowsBootstrapRecovery(reason) {
  // Same dwell as the in-app update hand-off (#50419): give the updater's
  // window time to appear before we vanish, so the recovery doesn't look like
  // a crash and provoke a mid-recovery relaunch.
+  isQuittingForHandoff = true
  setTimeout(() => {
    app.quit()
  }, UPDATE_HANDOFF_DWELL_MS)
@@ -2344,14 +2398,14 @@ async function applyUpdatesPosixInApp() {
    PATH: pathWithHermesManagedNode(path.join(updateRoot, 'venv', 'bin'))
  }

-  // `hermes update` reaps stale `hermes dashboard` backends (a code update
+  // `hermes update` reaps stale `hermes serve` backends (a code update
  // leaves the running process serving old Python against the freshly-updated
  // JS bundle). But OUR backend is one of those processes, and killing it
  // mid-update produces the boot→kill→crash loop in #37532 — the desktop
  // already restarts its own backend via the rebuild+relaunch below, so the
  // reap must spare it. Hand the live backend's PID to the update process;
  // _kill_stale_dashboard_processes reads HERMES_DESKTOP_CHILD_PID and excludes
-  // it while still reaping any genuinely-orphaned dashboards. (#37532)
+  // it while still reaping any genuinely-orphaned backends. (#37532)
  // Exclude every desktop-managed backend (primary + all pool profiles) from
  // the update reaper. _kill_stale_dashboard_processes accepts a comma-separated
  // list (a single int still parses for back-compat).
@@ -2472,6 +2526,7 @@ async function applyUpdatesPosixInApp() {
          `[updates] launched linux relaunch: ${scriptPath} -> ${process.execPath} ` +
            `(args=${relaunchArgs.length}, env=${Object.keys(relaunchEnv).length})`
        )
+        isQuittingForHandoff = true
        setTimeout(() => app.quit(), UPDATE_HANDOFF_DWELL_MS)
        return { ok: true, handedOff: true }
      } catch (err) {
@@ -2577,6 +2632,7 @@ fi
  child.unref()
  rememberLog(`[updates] launched mac swap+relaunch: ${scriptPath} (${rebuiltApp} -> ${targetApp})`)

+  isQuittingForHandoff = true
  setTimeout(() => app.quit(), 600)
  return { ok: true, handedOff: true, rebuiltApp, targetApp }
 }
@@ -2607,6 +2663,24 @@ function readBootstrapMarker() {
  return readJson(BOOTSTRAP_COMPLETE_MARKER)
 }

+// Marker-independent: is the canonical install at ACTIVE_HERMES_ROOT actually
+// runnable right now? A complete CLI install (`install.sh --include-desktop`)
+// or a DMG launch over a prior CLI install satisfies this WITHOUT the desktop
+// ever having written the bootstrap marker -- so we must be able to recognise
+// "already installed" off the filesystem alone, not just the marker.
+function isActiveRuntimeUsable() {
+  const venvPython = getVenvPython(VENV_ROOT)
+  return (
+    isHermesSourceRoot(ACTIVE_HERMES_ROOT) &&
+    fileExists(venvPython) &&
+    canImportHermesCli(venvPython, {
+      env: {
+        PYTHONPATH: [ACTIVE_HERMES_ROOT, process.env.PYTHONPATH].filter(Boolean).join(path.delimiter)
+      }
+    })
+  )
+}
+
 function isBootstrapComplete() {
  const marker = readBootstrapMarker()
  if (!marker || typeof marker !== 'object') return false
@@ -2619,7 +2693,7 @@ function isBootstrapComplete() {
  // a runnable venv: an interrupted or split-home install can leave the marker
  // + checkout without a venv, and trusting that spawns a dead backend
  // ("gateway offline") instead of re-running bootstrap to repair it.
-  return isHermesSourceRoot(ACTIVE_HERMES_ROOT) && fileExists(getVenvPython(VENV_ROOT))
+  return isActiveRuntimeUsable()
 }

 function writeBootstrapMarker(payload) {
@@ -2782,60 +2856,60 @@ function writeDefaultProjectDir(dir) {
  }
 }

-function createPythonBackend(root, label, dashboardArgs, options = {}) {
+function createPythonBackend(root, label, backendArgs, options = {}) {
  const python = findPythonForRoot(root)
  if (!python) return null

  const venvRoot = path.join(root, 'venv')
  const venvPython = getVenvPython(venvRoot)
-  const command = IS_WINDOWS && fileExists(venvPython) ? getNoConsoleVenvPython(venvRoot) : toNoConsolePython(python)
+  const command = IS_WINDOWS && fileExists(venvPython) ? venvPython : python

-  return applyWindowsNoConsoleSpawnHints({
+  return {
    kind: 'python',
    label,
    command,
-    args: ['-m', 'hermes_cli.main', ...dashboardArgs],
+    args: ['-m', 'hermes_cli.main', ...backendArgs],
    env: buildDesktopBackendEnv({
      hermesHome: HERMES_HOME,
-      pythonPathEntries: [root],
+      pythonPathEntries: [root, ...getVenvSitePackagesEntries(venvRoot)],
      venvRoot
    }),
    root,
    bootstrap: Boolean(options.bootstrap),
    shell: false
-  })
+  }
 }

 // createActiveBackend — build a backend pointing at ACTIVE_HERMES_ROOT, the
 // canonical install location shared with the CLI installer. The venv at
 // VENV_ROOT may not exist yet on first run; bootstrap=true tells
 // ensureRuntime() to create / refresh it before launch.
-function createActiveBackend(dashboardArgs) {
+function createActiveBackend(backendArgs) {
  const venvPython = getVenvPython(VENV_ROOT)
-  const command = fileExists(venvPython) ? getNoConsoleVenvPython(VENV_ROOT) : toNoConsolePython(findSystemPython())
+  const command = fileExists(venvPython) ? venvPython : findSystemPython()

-  return applyWindowsNoConsoleSpawnHints({
+  return {
    kind: 'python',
    label: `Hermes at ${ACTIVE_HERMES_ROOT}`,
    command,
-    args: ['-m', 'hermes_cli.main', ...dashboardArgs],
+    args: ['-m', 'hermes_cli.main', ...backendArgs],
    env: buildDesktopBackendEnv({
      hermesHome: HERMES_HOME,
-      pythonPathEntries: [ACTIVE_HERMES_ROOT],
+      pythonPathEntries: [ACTIVE_HERMES_ROOT, ...getVenvSitePackagesEntries(VENV_ROOT)],
      venvRoot: VENV_ROOT
    }),
    root: ACTIVE_HERMES_ROOT,
    bootstrap: true,
    shell: false
-  })
+  }
 }

-function resolveHermesBackend(dashboardArgs) {
+function resolveHermesBackend(backendArgs) {
  // 1. Explicit override -- HERMES_DESKTOP_HERMES_ROOT points at a developer
  //    checkout. Honour it as-is (no bootstrap; the user is driving).
  const overrideRoot = process.env.HERMES_DESKTOP_HERMES_ROOT && path.resolve(process.env.HERMES_DESKTOP_HERMES_ROOT)
  if (overrideRoot && isHermesSourceRoot(overrideRoot)) {
-    const backend = createPythonBackend(overrideRoot, `Hermes source at ${overrideRoot}`, dashboardArgs)
+    const backend = createPythonBackend(overrideRoot, `Hermes source at ${overrideRoot}`, backendArgs)
    if (backend) return backend
  }

@@ -2844,7 +2918,7 @@ function resolveHermesBackend(dashboardArgs) {
  //    installed `hermes` on PATH so local Python edits are actually exercised.
  //    (In dev with no checkout, SOURCE_REPO_ROOT won't pass isHermesSourceRoot.)
  if (!IS_PACKAGED && isHermesSourceRoot(SOURCE_REPO_ROOT)) {
-    const backend = createPythonBackend(SOURCE_REPO_ROOT, `Hermes source at ${SOURCE_REPO_ROOT}`, dashboardArgs)
+    const backend = createPythonBackend(SOURCE_REPO_ROOT, `Hermes source at ${SOURCE_REPO_ROOT}`, backendArgs)
    if (backend) return backend
  }

@@ -2855,7 +2929,7 @@ function resolveHermesBackend(dashboardArgs) {
  //    to spawning hermes. Updates flow through the in-app update path
  //    (applyUpdates -> git pull) or `hermes update` from the CLI.
  if (isBootstrapComplete()) {
-    return createActiveBackend(dashboardArgs)
+    return createActiveBackend(backendArgs)
  }

  // 4. Existing `hermes` on PATH -- installed via install.ps1 / install.sh from
@@ -2888,7 +2962,7 @@ function resolveHermesBackend(dashboardArgs) {
    }

    if (hermesCommand) {
-      const unwrapped = unwrapWindowsVenvHermesCommand(hermesCommand, dashboardArgs)
+      const unwrapped = unwrapWindowsVenvHermesCommand(hermesCommand, backendArgs)
      if (unwrapped) {
        return unwrapped
      }
@@ -2903,10 +2977,10 @@ function resolveHermesBackend(dashboardArgs) {
      const shellForProbe = isCommandScript(hermesCommand)
      if (verifyHermesCli(hermesCommand, { shell: shellForProbe })) {
        return (
-          unwrapWindowsVenvHermesCommand(hermesCommand, dashboardArgs) || {
+          unwrapWindowsVenvHermesCommand(hermesCommand, backendArgs) || {
            label: `existing Hermes CLI at ${hermesCommand}`,
            command: hermesCommand,
-            args: dashboardArgs,
+            args: backendArgs,
            bootstrap: false,
            env: {},
            kind: 'command',
@@ -2934,15 +3008,15 @@ function resolveHermesBackend(dashboardArgs) {
    // failure, fall through to step 6 so the bootstrap runner pulls
    // a uv-managed 3.11 into %LOCALAPPDATA%\hermes\hermes-agent\venv.
    if (canImportHermesCli(python)) {
-      return applyWindowsNoConsoleSpawnHints({
+      return {
        kind: 'python',
        label: `installed hermes_cli module via ${python}`,
-        command: toNoConsolePython(python),
-        args: ['-m', 'hermes_cli.main', ...dashboardArgs],
+        command: python,
+        args: ['-m', 'hermes_cli.main', ...backendArgs],
        bootstrap: false,
        env: {},
        shell: false
-      })
+      }
    }
    rememberLog(`Ignoring system Python ${python}: hermes_cli is not importable; falling through to bootstrap.`)
  }
@@ -2961,7 +3035,7 @@ function resolveHermesBackend(dashboardArgs) {
    kind: 'bootstrap-needed',
    label: 'Hermes Agent not installed yet; bootstrap required',
    command: null,
-    args: dashboardArgs,
+    args: backendArgs,
    bootstrap: true,
    env: {},
    shell: false,
@@ -2976,7 +3050,7 @@ function resolveHermesBackend(dashboardArgs) {
 async function ensureRuntime(backend) {
  if (!backend.bootstrap) {
    await advanceBootProgress('runtime.external', `Using ${backend.label}`, 32)
-    return applyWindowsNoConsoleSpawnHints(backend)
+    return backend
  }

  // backend.kind === 'bootstrap-needed' means resolveHermesBackend couldn't
@@ -3118,7 +3192,7 @@ async function ensureRuntime(backend) {
    )
  }

-  backend.command = getNoConsoleVenvPython(VENV_ROOT)
+  backend.command = getVenvPython(VENV_ROOT)
  backend.label = `Hermes at ${ACTIVE_HERMES_ROOT} (venv: ${VENV_ROOT})`
  updateBootProgress({
    phase: 'runtime.ready',
@@ -3127,7 +3201,7 @@ async function ensureRuntime(backend) {
    running: true,
    error: null
  })
-  return applyWindowsNoConsoleSpawnHints(backend)
+  return backend
 }

 function fetchJson(url, token, options = {}) {
@@ -3788,7 +3862,7 @@ function getWindowButtonPosition() {
 }

 function getNativeOverlayWidth() {
-  return computeNativeOverlayWidth({ isWindows: IS_WINDOWS, isWsl: IS_WSL })
+  return computeNativeOverlayWidth({ isWindows: IS_WINDOWS, isWsl: IS_WSL, isMac: IS_MAC })
 }

 function getWindowState() {
@@ -5194,8 +5268,10 @@ async function spawnPoolBackend(profile, entry) {
  // --profile wins over the inherited HERMES_HOME env (see _apply_profile_override
  // step 3 in hermes_cli/main.py), so the child re-homes to this profile.
  // --port 0: the OS assigns an ephemeral port; the child announces it on stdout.
-  const dashboardArgs = ['--profile', profile, 'dashboard', '--no-open', '--host', '127.0.0.1', '--port', '0']
-  const backend = await ensureRuntime(resolveHermesBackend(dashboardArgs))
+  const backendArgs = ['--profile', profile, 'serve', '--host', '127.0.0.1', '--port', '0']
+  const backend = await ensureRuntime(resolveHermesBackend(backendArgs))
+  // Route old runtimes (no `serve`) through the legacy `dashboard --no-open`.
+  backend.args = getBackendArgsForRuntime(backend)
  const hermesCwd = resolveHermesCwd()
  const webDist = resolveWebDist()
  const readyFile = backend.readyFile ? makeDashboardReadyFile() : null
@@ -5411,7 +5487,7 @@ async function startHermes() {

    const token = crypto.randomBytes(32).toString('base64url')
    // --port 0: the OS assigns an ephemeral port; the child announces it on stdout.
-    const dashboardArgs = ['dashboard', '--no-open', '--host', '127.0.0.1', '--port', '0']
+    const backendArgs = ['serve', '--host', '127.0.0.1', '--port', '0']
    // Pin the desktop's chosen profile via the global --profile flag. This is
    // deterministic (it wins over the sticky ~/.hermes/active_profile file) and
    // resolves HERMES_HOME the same way `hermes -p <name>` does on the CLI. An
@@ -5419,10 +5495,12 @@ async function startHermes() {
    // unaffected.
    const activeProfile = readActiveDesktopProfile()
    if (activeProfile) {
-      dashboardArgs.unshift('--profile', activeProfile)
+      backendArgs.unshift('--profile', activeProfile)
    }
    await advanceBootProgress('backend.runtime', 'Resolving Hermes runtime', 28)
-    const backend = await ensureRuntime(resolveHermesBackend(dashboardArgs))
+    const backend = await ensureRuntime(resolveHermesBackend(backendArgs))
+    // Route old runtimes (no `serve`) through the legacy `dashboard --no-open`.
+    backend.args = getBackendArgsForRuntime(backend)
    const hermesCwd = resolveHermesCwd()
    const webDist = resolveWebDist()
    const readyFile = backend.readyFile ? makeDashboardReadyFile() : null
@@ -7323,6 +7401,7 @@ async function runDesktopUninstall(mode) {

  // Give the renderer a beat to show its "uninstalling…" state, then quit so
  // the venv python shim + app bundle unlock and the cleanup script can run.
+  isQuittingForHandoff = true
  setTimeout(() => app.quit(), 800)
  return { ok: true, mode, willRemoveAppBundle: Boolean(removeBundle), scriptPath }
 }
@@ -7528,5 +7607,11 @@ app.on('before-quit', () => {
 })

 app.on('window-all-closed', () => {
-  if (process.platform !== 'darwin') app.quit()
+  // macOS convention: keep the process alive in the Dock when the user closes
+  // the last window. But when we're handing off to a detached updater / swap /
+  // uninstall script, the process MUST exit so the script can replace or remove
+  // the bundle and relaunch — without this the script's PID-wait spins to its
+  // full timeout and the user is left with an invisible app (or an uninstall
+  // that appears to do nothing).
+  if (process.platform !== 'darwin' || isQuittingForHandoff) app.quit()
 })
--- a/apps/desktop/electron/titlebar-overlay-width.cjs
+++ b/apps/desktop/electron/titlebar-overlay-width.cjs
@@ -1,11 +1,24 @@
-// Pre-layout fallback for WCO right-edge reservation (--titlebar-tools-right).
-// Live width comes from navigator.windowControlsOverlay in the renderer.
+'use strict'

 const OVERLAY_FALLBACK_WIDTH = 144

-/** @param {{ isWindows?: boolean, isWsl?: boolean }} opts */
-function nativeOverlayWidth({ isWindows = false, isWsl = false } = {}) {
-  return isWindows || isWsl ? OVERLAY_FALLBACK_WIDTH : 0
+/**
+ * Static pre-layout reservation (px) for the right-side native window-controls
+ * overlay (min/max/close). Only a FALLBACK — once laid out the renderer reads
+ * the exact width from navigator.windowControlsOverlay
+ * (use-window-controls-overlay-width.ts) and uses this value only when the WCO
+ * API is unavailable.
+ *
+ * macOS uses traffic lights positioned via trafficLightPosition, not a WCO
+ * overlay, so it reserves nothing here. Every other platform (Windows, WSLg,
+ * plain Linux KDE/GNOME) reserves the fallback width; `isWindows` / `isWsl`
+ * are accepted for caller compatibility but no longer affect the result.
+ *
+ * @param {{ isWindows?: boolean, isWsl?: boolean, isMac?: boolean }} opts
+ */
+function nativeOverlayWidth({ isWindows = false, isWsl = false, isMac = false } = {}) {
+  if (isMac) return 0
+  return OVERLAY_FALLBACK_WIDTH
+}

 module.exports = { OVERLAY_FALLBACK_WIDTH, nativeOverlayWidth }
--- a/apps/desktop/electron/titlebar-overlay-width.test.cjs
+++ b/apps/desktop/electron/titlebar-overlay-width.test.cjs
@@ -18,10 +18,17 @@ test('WSLg paints the same WCO, so it reserves the same fallback width', () => {
  assert.equal(nativeOverlayWidth({ isWsl: true }), OVERLAY_FALLBACK_WIDTH)
 })

-test('plain Linux and macOS reserve nothing', () => {
-  assert.equal(nativeOverlayWidth({ isWindows: false, isWsl: false }), 0)
-  assert.equal(nativeOverlayWidth(), 0)
-  assert.equal(nativeOverlayWidth({}), 0)
+test('plain Linux paints the WCO too, so it reserves the fallback width', () => {
+  // Regression #53185: re-enabling the overlay on plain Linux (KDE/GNOME)
+  // without reserving its width left the native min/max/close buttons painting
+  // on top of the app's right-edge titlebar tools.
+  assert.equal(nativeOverlayWidth({ isWindows: false, isWsl: false }), OVERLAY_FALLBACK_WIDTH)
+  assert.equal(nativeOverlayWidth(), OVERLAY_FALLBACK_WIDTH)
+  assert.equal(nativeOverlayWidth({}), OVERLAY_FALLBACK_WIDTH)
+})
+
+test('macOS uses traffic lights, not a WCO overlay, so it reserves nothing', () => {
+  assert.equal(nativeOverlayWidth({ isMac: true }), 0)
 })

 test('the fallback width is a sane positive pixel value', () => {
--- a/apps/desktop/electron/windows-child-process.test.cjs
+++ b/apps/desktop/electron/windows-child-process.test.cjs
@@ -38,19 +38,40 @@ test('desktop background child processes opt into hidden Windows consoles', () =
  requireHiddenChildOptions(source, /hermesProcess = spawn\(\s*backend\.command,\s*backend\.args/)
  requireHiddenChildOptions(source, /spawn\(\s*py,\s*\['-m', 'hermes_cli\.main', 'uninstall', '--gui-summary'\]/)

-  assert.match(source, /function unwrapWindowsVenvHermesCommand\(command, dashboardArgs\)/)
-  assert.match(source, /existing Hermes no-console Python at/)
-  assert.match(source, /function getNoConsoleVenvPython\(venvRoot\)/)
-  assert.match(source, /function toNoConsolePython\(pythonPath\)/)
-  assert.match(source, /function applyWindowsNoConsoleSpawnHints\(backend\)/)
-  assert.match(source, /function readVenvHome\(venvRoot\)/)
-  assert.match(source, /path\.join\(venvRoot, 'Scripts', 'pythonw\.exe'\)/)
-  assert.match(source, /backendStartFailure/)
-  assert.match(source, /HERMES_DESKTOP_READY_FILE/)
-  assert.match(source, /readyFile: true/)
+  assert.match(source, /function unwrapWindowsVenvHermesCommand\(command, backendArgs\)/)
  assert.match(source, /function getVenvSitePackagesEntries\(venvRoot\)/)
  assert.match(source, /path\.join\(venvRoot, 'Lib', 'site-packages'\)/)
-  assert.match(source, /args: \['-m', 'hermes_cli\.main', \.\.\.dashboardArgs\]/)
+  assert.match(source, /args: \['-m', 'hermes_cli\.main', \.\.\.backendArgs\]/)
+})
+
+test('desktop backend launches console python so child consoles are inherited, not pythonw', () => {
+  const source = readElectronFile('main.cjs')
+
+  // The flash fix is structural: the backend runs as a console-subsystem
+  // python.exe under hiddenWindowsChildOptions() (-> CREATE_NO_WINDOW), so it
+  // owns ONE windowless console that every descendant spawn inherits. Launching
+  // it as GUI-subsystem pythonw.exe is what made each child allocate (and flash)
+  // its own console, so the backend command must never be pythonw.
+  assert.doesNotMatch(source, /pythonw\.exe'\)/, 'backend must not be launched via pythonw.exe')
+  assert.doesNotMatch(
+    source,
+    /function getNoConsoleVenvPython\b/,
+    'pythonw-conversion helper should be gone; console python is launched directly'
+  )
+  assert.doesNotMatch(
+    source,
+    /function applyWindowsNoConsoleSpawnHints\b/,
+    'pythonw spawn-hint rewriter should be gone'
+  )
+
+  // Console python restores stdout, so the port is announced on the normal
+  // HERMES_DASHBOARD_READY stdout line — no ready-file side channel is set.
+  assert.doesNotMatch(source, /readyFile: true/, 'no backend should opt into the pythonw ready-file path')
+
+  // Both desktop backend launches must still go through hiddenWindowsChildOptions
+  // so the single backend console is created windowless.
+  requireHiddenChildOptions(source, /spawn\(\s*backend\.command,\s*backend\.args/)
+  requireHiddenChildOptions(source, /hermesProcess = spawn\(\s*backend\.command,\s*backend\.args/)
 })

 test('intentional or interactive desktop child processes stay documented', () => {
@@ -68,5 +89,5 @@ test('bootstrap PowerShell runner hides Windows console children', () => {
  const source = readElectronFile('bootstrap-runner.cjs')

  assert.match(source, /function hiddenWindowsChildOptions\(options = \{\}\)/)
-  requireHiddenChildOptions(source, 'spawn(ps, fullArgs')
+  requireHiddenChildOptions(source, /spawn\(\s*ps,\s*fullArgs/)
 })
--- a/apps/desktop/electron/windows-hermes-resolution.test.cjs
+++ b/apps/desktop/electron/windows-hermes-resolution.test.cjs
@@ -0,0 +1,67 @@
+'use strict'
+
+// Regression guards for Windows `hermes` resolution in main.cjs.
+//
+// main.cjs has no module.exports, so these follow the repo's source-assertion
+// test pattern (see windows-child-process.test.cjs). They pin the two Windows
+// resolution bugs that caused desktop reinstall loops:
+//   1. findOnPath() tried the empty extension FIRST, so an extensionless
+//      Git-Bash `hermes` shim shadowed the real hermes.cmd/hermes.exe; the
+//      shim then failed the --version probe and the desktop fell through to a
+//      spurious bootstrap/repair.
+//   2. handOffWindowsBootstrapRecovery() chose --update vs the destructive
+//      --repair by checking ONLY venv\Scripts\hermes.exe (the console-script
+//      shim, written at the END of venv setup and absent in interrupted
+//      states), so it escalated to a full venv recreate even on healthy
+//      installs.
+
+const test = require('node:test')
+const assert = require('node:assert/strict')
+const fs = require('node:fs')
+const path = require('node:path')
+
+function readMain() {
+  return fs.readFileSync(path.join(__dirname, 'main.cjs'), 'utf8').replace(/\r\n/g, '\n')
+}
+
+test('findOnPath tries PATHEXT extensions before the bare (empty) name on Windows', () => {
+  const source = readMain()
+  // Fixed order: PATHEXT first, empty string LAST.
+  assert.match(
+    source,
+    /\(process\.env\.PATHEXT \|\| '\.COM;\.EXE;\.BAT;\.CMD'\)\.split\(';'\)\.filter\(Boolean\), ''\]/,
+    'extensions array must end with the empty string, not start with it'
+  )
+  // The buggy empty-first order must not return.
+  assert.doesNotMatch(
+    source,
+    /\['', \.\.\.\(process\.env\.PATHEXT/,
+    'empty-extension-first order regressed: an extensionless shim can shadow hermes.cmd/.exe'
+  )
+})
+
+test('Windows bootstrap recovery chooses --update when any real-install signal is present', () => {
+  const source = readMain()
+  assert.match(source, /const haveRealInstall =/, 'recovery must compute haveRealInstall')
+  assert.match(
+    source,
+    /fileExists\(venvPython\)/,
+    'recovery must accept the venv interpreter as a real-install signal'
+  )
+  assert.match(
+    source,
+    /\.hermes-bootstrap-complete/,
+    'recovery must accept the bootstrap-complete marker as a real-install signal'
+  )
+  assert.match(
+    source,
+    /updaterArgs = haveRealInstall \? \['--update'/,
+    'updaterArgs must gate on haveRealInstall'
+  )
+  // The old too-narrow check (only venv\Scripts\hermes.exe) must not return.
+  assert.doesNotMatch(
+    source,
+    /updaterArgs = fileExists\(venvHermes\) \?/,
+    'recovery regressed to gating only on the hermes.exe shim, which forces destructive --repair'
+  )
+})
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@@ -18,7 +18,7 @@
    "profile:main": "wait-on http://127.0.0.1:5174 && cross-env XCURSOR_SIZE=24 HERMES_DESKTOP_DEV_SERVER=http://127.0.0.1:5174 electron --inspect=9229 .",
    "profile:main:cpu": "wait-on http://127.0.0.1:5174 && cross-env XCURSOR_SIZE=24 NODE_OPTIONS=--cpu-prof HERMES_DESKTOP_DEV_SERVER=http://127.0.0.1:5174 electron .",
    "start": "npm run build && electron .",
-    "build": "node scripts/assert-root-install.cjs && node scripts/write-build-stamp.cjs && node scripts/stage-native-deps.cjs && tsc -b && vite build &&  node scripts/bundle-electron-main.mjs && npm run postbuild",
+    "build": "node scripts/assert-root-install.cjs && node scripts/write-build-stamp.cjs && node scripts/stage-native-deps.cjs && tsc -b && vite build && npm run postbuild",
    "postbuild": "node scripts/assert-dist-built.cjs",
    "prebuilder": "node scripts/patch-electron-builder-mac-binary.cjs",
    "builder": "cross-env NODE_OPTIONS=--max-old-space-size=16384 node scripts/run-electron-builder.cjs",
@@ -37,7 +37,7 @@
    "test:desktop:nsis": "node scripts/test-desktop.mjs nsis",
    "test:desktop:existing": "node scripts/test-desktop.mjs existing",
    "test:desktop:fresh": "node scripts/test-desktop.mjs fresh",
-    "test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/git-worktree-ops.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-count.test.cjs electron/update-rebuild.test.cjs electron/update-marker.test.cjs electron/update-relaunch.test.cjs electron/windows-user-env.test.cjs electron/wsl-clipboard-image.test.cjs electron/titlebar-overlay-width.test.cjs electron/window-state.test.cjs",
+    "test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/git-worktree-ops.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-count.test.cjs electron/update-rebuild.test.cjs electron/update-marker.test.cjs electron/update-relaunch.test.cjs electron/windows-user-env.test.cjs electron/wsl-clipboard-image.test.cjs electron/titlebar-overlay-width.test.cjs electron/window-state.test.cjs electron/windows-hermes-resolution.test.cjs",
    "typecheck": "tsc -p . --noEmit",
    "lint": "eslint src/ electron/",
    "lint:fix": "eslint src/ electron/ --fix",
@@ -73,6 +73,7 @@
    "@tanstack/react-virtual": "^3.13.24",
    "@vscode/codicons": "^0.0.45",
    "@xterm/addon-fit": "^0.11.0",
+    "@xterm/addon-serialize": "^0.14.0",
    "@xterm/addon-unicode11": "^0.9.0",
    "@xterm/addon-web-links": "^0.12.0",
    "@xterm/addon-webgl": "^0.19.0",
--- a/apps/desktop/scripts/bundle-electron-main.mjs
+++ b/apps/desktop/scripts/bundle-electron-main.mjs
@@ -1,33 +0,0 @@
-#!/usr/bin/env node
-// bundle-electron-main.mjs — bundles electron/main.cjs into a single
-// self-contained file so the nix build doesn't need to ship node_modules/.
-//
-// `electron` is provided by the runtime; `node-pty` is staged separately
-// via stage-native-deps.cjs.  `preload.cjs` is NOT require()'d by main —
-// Electron loads it via path.join(__dirname, 'preload.cjs') — so it stays
-// as a separate file and doesn't need bundling.
-import { build } from 'esbuild'
-import { resolve, dirname } from 'node:path'
-import { fileURLToPath } from 'node:url'
-import { renameSync } from 'node:fs'
-
-const here = dirname(fileURLToPath(import.meta.url))
-const root = resolve(here, '..')
-const entry = resolve(root, 'electron/main.cjs')
-const tmp = resolve(root, 'electron/main.bundled.cjs')
-
-await build({
-  entryPoints: [entry],
-  bundle: true,
-  platform: 'node',
-  format: 'cjs',
-  target: 'node20',
-  outfile: tmp,
-  external: ['electron', 'node-pty'],
-  logLevel: 'info'
-})
-
-// Overwrite the original with the bundled version.
-renameSync(tmp, entry)
-
-console.log(`bundled ${entry}`)
--- a/apps/desktop/scripts/stage-native-deps.cjs
+++ b/apps/desktop/scripts/stage-native-deps.cjs
@@ -66,6 +66,31 @@ const NATIVE_DEPS = [
  }
 ]

+// Pure-JS runtime dependencies that the packaged electron main require()s but
+// that workspace dedup hoists into the repo-root node_modules -- out of reach
+// of electron-builder's file collector, exactly like node-pty above.  Unlike
+// node-pty there is no native binary to select; we stage each package's whole
+// directory into build/native-deps/vendor/node_modules/<name> so the dep's own
+// internal require()s resolve against a real node_modules tree, and the
+// requiring file (electron/git-review-ops.cjs) falls back to that path via
+// process.resourcesPath when the normal require() fails.  See issue #52735
+// (packaged app crashed at launch on `Cannot find module 'simple-git'`).
+//
+// The closure is resolved at stage time by walking dependencies +
+// optionalDependencies, so a simple-git version bump that pulls in a new
+// transitive dep can't silently re-introduce the crash.
+//
+// Layout note: the closure lands in build/native-deps/vendor/node_modules/,
+// NOT build/native-deps/node_modules/.  electron-builder's file collector
+// hard-drops a `node_modules` directory that sits at the ROOT of an
+// extraResources copy (app-builder-lib/out/util/filter.js: `if (relative ===
+// "node_modules") return false`), but keeps a NESTED one.  Nesting under
+// `vendor/` makes node_modules a subdirectory so it survives packing; the
+// require() fallback in git-review-ops.cjs resolves the matching
+// vendor/node_modules path.
+const JS_DEP_ROOTS = ['simple-git']
+const JS_DEP_STAGE_ROOT = path.join(STAGE_ROOT, 'vendor', 'node_modules')
+
 function rmrf(target) {
  fs.rmSync(target, { recursive: true, force: true })
 }
@@ -148,12 +173,111 @@ function stageOne(spec) {
  console.log(`[stage-native-deps] ${path.relative(APP_ROOT, spec.to)}: ${copied} files`)
 }

+// Resolve a package's directory by name.  Resolution starts at the requiring
+// package's directory (fromDir), so a non-hoisted nested copy is found first,
+// then falls back to the repo root, where workspace dedup hoists everything.
+//
+// We deliberately do NOT use require.resolve(`${name}/package.json`): packages
+// with an "exports" map that doesn't list "./package.json" (e.g. simple-git
+// 3.x) make that subpath unresolvable under Node's exports enforcement
+// (ERR_PACKAGE_PATH_NOT_EXPORTED), which fails on CI even though it happened to
+// work locally.  Instead resolve the package's main entry (exports-aware) and
+// walk up to the directory whose package.json's "name" matches.
+function resolvePkgDir(name, fromDir) {
+  const searchPaths = [fromDir, REPO_ROOT, path.join(REPO_ROOT, 'node_modules')]
+  let entry
+  try {
+    entry = require.resolve(name, { paths: searchPaths })
+  } catch {
+    return null
+  }
+  // Walk up from the resolved entry file to the package root: the first
+  // ancestor dir whose package.json declares this package's name.
+  let dir = path.dirname(entry)
+  while (true) {
+    const pjPath = path.join(dir, 'package.json')
+    try {
+      const pj = JSON.parse(fs.readFileSync(pjPath, 'utf8'))
+      if (pj.name === name) {
+        return dir
+      }
+    } catch {
+      // no package.json here (or unreadable) — keep walking up
+    }
+    const parent = path.dirname(dir)
+    if (parent === dir) {
+      return null
+    }
+    dir = parent
+  }
+}
+
+// Walk dependencies + optionalDependencies from each root and return a Map of
+// package name -> resolved directory for the runtime closure.  Keyed by name,
+// so a dep reached via two paths is staged once (first resolved copy wins).
+function resolveJsClosure(roots) {
+  const closure = new Map() // name -> absolute package dir
+  const stack = roots.map(name => ({ name, fromDir: REPO_ROOT }))
+  while (stack.length) {
+    const { name, fromDir } = stack.pop()
+    if (closure.has(name)) continue
+    const dir = resolvePkgDir(name, fromDir)
+    if (!dir) {
+      throw new Error(
+        `stage-native-deps: could not resolve '${name}' for the simple-git ` +
+          `closure.  Run \`npm install\` at the workspace root first.`
+      )
+    }
+    closure.set(name, dir)
+    let pj
+    try {
+      pj = JSON.parse(fs.readFileSync(path.join(dir, 'package.json'), 'utf8'))
+    } catch {
+      continue
+    }
+    const deps = { ...(pj.dependencies || {}), ...(pj.optionalDependencies || {}) }
+    for (const depName of Object.keys(deps)) {
+      stack.push({ name: depName, fromDir: dir })
+    }
+  }
+  return closure
+}
+
+// Stage the resolved JS dependency closure into build/native-deps/vendor/node_modules/
+// so the packaged app (and the nix output) can require() it from
+// process.resourcesPath when the hoisted-root require() isn't reachable.  Each
+// package is copied whole (minus node_modules/ — the closure is flattened so
+// every dep already has its own top-level entry) into a real node_modules
+// layout, which keeps the deps' own internal require()s working unchanged.
+function stageJsClosure(roots) {
+  const closure = resolveJsClosure(roots)
+  rmrf(JS_DEP_STAGE_ROOT)
+  ensureDir(JS_DEP_STAGE_ROOT)
+  let staged = 0
+  for (const [name, fromDir] of closure) {
+    const dest = path.join(JS_DEP_STAGE_ROOT, name)
+    ensureDir(path.dirname(dest))
+    // Copy the package directory but skip any nested node_modules/ — the
+    // closure is flattened, so nested copies would just bloat the bundle.
+    fs.cpSync(fromDir, dest, {
+      recursive: true,
+      filter: src => path.basename(src) !== 'node_modules'
+    })
+    staged += 1
+  }
+  console.log(
+    `[stage-native-deps] vendor/node_modules/: ${staged} package(s) ` +
+      `(${[...closure.keys()].sort().join(', ')})`
+  )
+}
+
 function main() {
  rmrf(STAGE_ROOT)
  ensureDir(STAGE_ROOT)
  for (const spec of NATIVE_DEPS) {
    stageOne(spec)
  }
+  stageJsClosure(JS_DEP_ROOTS)
 }

 main()
--- a/apps/desktop/src/app/agents/index.tsx
+++ b/apps/desktop/src/app/agents/index.tsx
@@ -19,7 +19,7 @@ import {
  type SubagentStreamEntry
 } from '@/store/subagents'

-import { OverlayView } from '../overlays/overlay-view'
+import { Panel, PanelEmpty, PanelHeader } from '../overlays/panel'

 // Mirrors statusGlyph() in tool-fallback.tsx so subagent rows speak the
 // same visual vocabulary as the chat tool blocks.
@@ -86,18 +86,16 @@ export function AgentsView({ onClose }: AgentsViewProps) {
  const tree = useMemo(() => buildSubagentTree(allSubagents(subagentsBySession)), [subagentsBySession])

  return (
-    <OverlayView
-      closeLabel={t.agents.close}
-      contentClassName="px-5 pt-5 pb-4 sm:px-6"
-      onClose={onClose}
-      rootClassName="mx-auto max-w-3xl"
-    >
-      <header className="mb-3 shrink-0">
-        <h2 className="text-sm font-semibold text-foreground">{t.agents.title}</h2>
-        <p className="text-xs text-muted-foreground/80">{t.agents.subtitle}</p>
-      </header>
-      <SubagentTree tree={tree} />
-    </OverlayView>
+    <Panel closeLabel={t.agents.close} onClose={onClose}>
+      {tree.length === 0 ? (
+        <PanelEmpty description={t.agents.emptyDesc} icon="hubot" title={t.agents.emptyTitle} />
+      ) : (
+        <>
+          <PanelHeader subtitle={t.agents.subtitle} title={t.agents.title} />
+          <SubagentTree tree={tree} />
+        </>
+      )}
+    </Panel>
  )
 }

--- a/apps/desktop/src/app/chat/composer/status-stack/status-row.tsx
+++ b/apps/desktop/src/app/chat/composer/status-stack/status-row.tsx
@@ -1,10 +1,9 @@
-import { Fragment, memo, type ReactNode, useState } from 'react'
+import { Fragment, memo, type ReactNode } from 'react'

+import { openAgentTerminal } from '@/app/right-sidebar/terminal/terminals'
 import { StatusRow } from '@/components/chat/status-row'
-import { TerminalOutput } from '@/components/chat/terminal-output'
 import { Button } from '@/components/ui/button'
 import { Codicon } from '@/components/ui/codicon'
-import { DisclosureCaret } from '@/components/ui/disclosure-caret'
 import { GlyphSpinner } from '@/components/ui/glyph-spinner'
 import { Tip } from '@/components/ui/tooltip'
 import { type Translations, useI18n } from '@/i18n'
@@ -82,7 +81,6 @@ interface StatusItemRowProps {
 export const StatusItemRow = memo(function StatusItemRow({ item, onDismiss, onOpen, onStop }: StatusItemRowProps) {
  const { t } = useI18n()
  const s = t.statusStack
-  const [outputOpen, setOutputOpen] = useState(false)
  const failed = item.state === 'failed'
  const running = item.state === 'running'

@@ -94,8 +92,10 @@ export const StatusItemRow = memo(function StatusItemRow({ item, onDismiss, onOp
      : null

  const canOpen = item.type === 'subagent' && !!onOpen
-  const hasOutput = item.type === 'background' && !!item.output
-  const onActivate = canOpen ? onOpen : hasOutput ? () => setOutputOpen(open => !open) : undefined
+
+  // Background rows link to their read-only terminal tab; subagents open their session.
+  const onActivate =
+    item.type === 'background' ? () => openAgentTerminal(item.id, item.title) : canOpen ? onOpen : undefined

  return (
    <Fragment>
@@ -146,9 +146,7 @@ export const StatusItemRow = memo(function StatusItemRow({ item, onDismiss, onOp
            {s.exit(item.exitCode)}
          </span>
        )}
-        {hasOutput && <DisclosureCaret className="shrink-0 text-muted-foreground/45" open={outputOpen} size="0.8em" />}
      </StatusRow>
-      {hasOutput && outputOpen && <TerminalOutput className="mx-auto mb-1 max-w-[90%]" text={item.output!} />}
    </Fragment>
  )
 })
--- a/apps/desktop/src/app/chat/hooks/use-composer-actions.ts
+++ b/apps/desktop/src/app/chat/hooks/use-composer-actions.ts
@@ -5,6 +5,7 @@ import { droppedFileInlineRef } from '@/app/chat/composer/inline-refs'
 import { formatRefValue } from '@/components/assistant-ui/directive-text'
 import { useI18n } from '@/i18n'
 import { attachmentId, contextPath, pathLabel } from '@/lib/chat-runtime'
+import { readDesktopFileDataUrl, selectDesktopPaths } from '@/lib/desktop-fs'
 import {
  addComposerAttachment,
  type ComposerAttachment,
@@ -262,7 +263,7 @@ export function useComposerActions({ activeSessionId, currentCwd, requestGateway

  const pickContextPaths = useCallback(
    async (kind: 'file' | 'folder') => {
-      const paths = await window.hermesDesktop?.selectPaths({
+      const paths = await selectDesktopPaths({
        title: kind === 'file' ? 'Add files as context' : 'Add folders as context',
        defaultPath: currentCwd || undefined,
        directories: kind === 'folder'
@@ -347,7 +348,7 @@ export function useComposerActions({ activeSessionId, currentCwd, requestGateway
      attachToMain(baseAttachment)

      try {
-        const previewUrl = await window.hermesDesktop?.readFileDataUrl(filePath)
+        const previewUrl = await readDesktopFileDataUrl(filePath)

        if (previewUrl) {
          addComposerAttachment({ ...baseAttachment, previewUrl })
@@ -395,7 +396,7 @@ export function useComposerActions({ activeSessionId, currentCwd, requestGateway
  )

  const pickImages = useCallback(async () => {
-    const paths = await window.hermesDesktop?.selectPaths({
+    const paths = await selectDesktopPaths({
      title: copy.attachImages,
      defaultPath: currentCwd || undefined,
      filters: [
--- a/apps/desktop/src/app/chat/sidebar/index.tsx
+++ b/apps/desktop/src/app/chat/sidebar/index.tsx
@@ -1149,7 +1149,8 @@ export function ChatSidebar({

  const showSessionSkeletons = sessionsLoading && sortedSessions.length === 0

-  const showSessionSections = showSessionSkeletons || sortedSessions.length > 0
+  const showSessionSections =
+    showSessionSkeletons || sortedSessions.length > 0 || projectModel.length > 0

  // Each reorderable list reports its OWN new id order; persisting is a direct,
  // typed write — no id-prefix sniffing to figure out which level moved.
@@ -1537,7 +1538,7 @@ export function ChatSidebar({
          </div>
        )}

-        {contentVisible && !showSessionSections && <div className="min-h-0 flex-1" />}
+        {contentVisible && !showSessionSections && <SidebarBlankState onNewProject={openProjectCreate} />}

        {contentVisible && (
          <div className="shrink-0 px-0.5 pb-1 pt-0.5">
@@ -1618,6 +1619,29 @@ function SidebarSessionSkeletons() {
  )
 }

+function SidebarBlankState({ onNewProject }: { onNewProject: () => void }) {
+  const { t } = useI18n()
+  const s = t.sidebar
+
+  return (
+    <div className="grid min-h-0 flex-1 place-items-center px-4 text-center">
+      <div className="flex flex-col items-center gap-2">
+        <Codicon className="text-(--ui-text-quaternary)" name="root-folder" size="1.25rem" />
+        <p className="text-xs text-(--ui-text-tertiary)">{s.noSessions}</p>
+        <Button
+          className="mt-0.5 text-(--ui-text-secondary)"
+          onClick={onNewProject}
+          size="sm"
+          variant="ghost"
+        >
+          <Codicon name="add" size="0.75rem" />
+          {s.projects.newButton}
+        </Button>
+      </div>
+    </div>
+  )
+}
+
 function SidebarPinnedEmptyState() {
  const { t } = useI18n()

--- a/apps/desktop/src/app/chat/sidebar/project-dialog.tsx
+++ b/apps/desktop/src/app/chat/sidebar/project-dialog.tsx
@@ -87,21 +87,25 @@ export function ProjectDialog() {
  }

  const pickFolder = async () => {
-    const dir = await pickProjectFolder()
+    try {
+      const dir = await pickProjectFolder()

-    if (!dir) {
-      return
+      if (!dir) {
+        return
+      }
+
+      const projectId = state?.projectId
+
+      if (mode === 'add-folder' && projectId) {
+        await runSubmit(() => addProjectFolder(projectId, dir))
+
+        return
+      }
+
+      setFolders(prev => (prev.includes(dir) ? prev : [...prev, dir]))
+    } catch (err) {
+      notifyError(err, p.createFailed)
    }
-
-    const projectId = state?.projectId
-
-    if (mode === 'add-folder' && projectId) {
-      await runSubmit(() => addProjectFolder(projectId, dir))
-
-      return
-    }
-
-    setFolders(prev => (prev.includes(dir) ? prev : [...prev, dir]))
  }

  const submit = async () => {
@@ -145,7 +149,10 @@ export function ProjectDialog() {

  return (
    <Dialog onOpenChange={onOpenChange} open={open}>
-      <DialogContent className="max-w-md">
+      <DialogContent
+        className="max-w-md"
+        onInteractOutside={event => event.preventDefault()}
+      >
        <DialogHeader>
          <DialogTitle>{title}</DialogTitle>
          {mode === 'create' && <DialogDescription>{p.createDesc}</DialogDescription>}
--- a/apps/desktop/src/app/chat/sidebar/projects/model.ts
+++ b/apps/desktop/src/app/chat/sidebar/projects/model.ts
@@ -3,6 +3,7 @@ import { useEffect, useMemo, useState } from 'react'

 import type { HermesGitWorktree } from '@/global'
 import type { SessionInfo } from '@/hermes'
+import { desktopGit } from '@/lib/desktop-git'
 import { mapPool } from '@/lib/pool'
 import { $sidebarWorkspaceCollapsedIds, toggleWorkspaceNodeCollapsed } from '@/store/layout'
 import { $worktreeRefreshToken } from '@/store/projects'
@@ -88,7 +89,7 @@ export function useRepoWorktreeMap(
  const refreshToken = useStore($worktreeRefreshToken)

  useEffect(() => {
-    const git = window.hermesDesktop?.git
+    const git = desktopGit()

    if (!enabled || !repoPaths.length || !git?.worktreeList) {
      setMap({})
--- a/apps/desktop/src/app/command-center/index.tsx
+++ b/apps/desktop/src/app/command-center/index.tsx
@@ -9,7 +9,16 @@ import { getActionStatus, getLogs, getStatus, getUsageAnalytics, restartGateway,
 import type { ActionStatusResponse, AnalyticsResponse, StatusResponse } from '@/hermes'
 import { useI18n } from '@/i18n'
 import { sessionTitle } from '@/lib/chat-runtime'
-import { Activity, AlertCircle, BarChart3, Bookmark, BookmarkFilled, Download, Pin, Trash2 } from '@/lib/icons'
+import {
+  Activity,
+  AlertCircle,
+  BarChart3,
+  Bookmark,
+  BookmarkFilled,
+  Download,
+  MessageCircle,
+  Trash2
+} from '@/lib/icons'
 import { exportSession } from '@/lib/session-export'
 import { cn } from '@/lib/utils'
 import { upsertDesktopActionTask } from '@/store/activity'
@@ -263,7 +272,7 @@ export function CommandCenterView({ initialSection, onClose, onDeleteSession, on
          {SECTIONS.map(value => (
            <OverlayNavItem
              active={section === value}
-              icon={value === 'sessions' ? Pin : value === 'system' ? Activity : BarChart3}
+              icon={value === 'sessions' ? MessageCircle : value === 'system' ? Activity : BarChart3}
              key={value}
              label={cc.sections[value]}
              onClick={() => setSection(value)}
@@ -361,7 +370,7 @@ export function CommandCenterView({ initialSection, onClose, onDeleteSession, on
            />
          ) : (
            <div className="grid min-h-0 flex-1 grid-rows-[auto_minmax(0,1fr)] gap-4">
-              <div className="border-b border-(--ui-stroke-tertiary) pb-4">
+              <div>
                {status ? (
                  <div className="grid gap-2">
                    <div className="flex items-start justify-between gap-3">
@@ -406,7 +415,7 @@ export function CommandCenterView({ initialSection, onClose, onDeleteSession, on
                )}
              </div>

-              <div className="flex min-h-0 flex-col">
+              <div className="flex min-h-0 flex-col pt-2">
                <div className="mb-2 flex items-center justify-between">
                  <span className="text-[0.625rem] font-medium uppercase tracking-[0.08em] text-(--ui-text-tertiary)">
                    {cc.recentLogs}
@@ -503,7 +512,7 @@ function UsagePanel({ error, loading, onRefresh, period, usage }: UsagePanelProp
        </span>
      )}

-      <div className="grid grid-cols-2 gap-x-4 gap-y-4 border-b border-(--ui-stroke-tertiary) pb-5 sm:grid-cols-3">
+      <div className="grid grid-cols-2 gap-x-4 gap-y-4 py-2 sm:grid-cols-3">
        <UsageStat label={cc.statSessions} value={formatInteger(totals.total_sessions)} />
        <UsageStat label={cc.statApiCalls} value={formatInteger(totals.total_api_calls)} />
        <UsageStat
@@ -563,7 +572,7 @@ function UsagePanel({ error, loading, onRefresh, period, usage }: UsagePanelProp
        )}
      </section>

-      <div className="grid min-h-0 gap-x-8 gap-y-5 border-t border-(--ui-stroke-tertiary) pt-5 sm:grid-cols-2">
+      <div className="grid min-h-0 gap-x-8 gap-y-5 pt-1 sm:grid-cols-2">
        <UsageList
          emptyLabel={cc.noModelUsage}
          rows={byModel.slice(0, 6).map(entry => ({
--- a/apps/desktop/src/app/cron/index.tsx
+++ b/apps/desktop/src/app/cron/index.tsx
@@ -14,7 +14,6 @@ import {
  DialogTitle
 } from '@/components/ui/dialog'
 import { Input } from '@/components/ui/input'
-import { SearchField } from '@/components/ui/search-field'
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select'
 import { Textarea } from '@/components/ui/textarea'
 import {
@@ -30,14 +29,28 @@ import {
  updateCronJob
 } from '@/hermes'
 import { type Translations, useI18n } from '@/i18n'
-import { AlertTriangle, Clock } from '@/lib/icons'
-import { cn } from '@/lib/utils'
+import { AlertTriangle } from '@/lib/icons'
 import { $cronFocusJobId, $cronJobs, setCronFocusJobId, setCronJobs, updateCronJobs } from '@/store/cron'
 import { notify, notifyError } from '@/store/notifications'

 import { useRefreshHotkey } from '../hooks/use-refresh-hotkey'
-import { OverlayMain, OverlayNewButton, OverlaySidebar, OverlaySplitLayout } from '../overlays/overlay-split-layout'
-import { OverlayView } from '../overlays/overlay-view'
+import {
+  Panel,
+  PanelAction,
+  PanelAddButton,
+  PanelBlock,
+  PanelBody,
+  PanelDetail,
+  PanelEmpty,
+  PanelHeader,
+  PanelList,
+  PanelListRow,
+  PanelMeta,
+  PanelPill,
+  type PanelPillTone,
+  PanelRowMenu,
+  PanelSectionLabel
+} from '../overlays/panel'
 import type { SetStatusbarItemGroup } from '../shell/statusbar-controls'

 import { jobState, jobTitle, STATE_DOT } from './job-state'
@@ -56,7 +69,7 @@ const SCHEDULE_OPTIONS: ReadonlyArray<ScheduleOption> = [
  { value: 'custom' }
 ]

-const STATE_TONE: Record<string, 'good' | 'muted' | 'warn' | 'bad'> = {
+const STATE_TONE: Record<string, PanelPillTone> = {
  enabled: 'good',
  scheduled: 'good',
  running: 'good',
@@ -66,13 +79,6 @@ const STATE_TONE: Record<string, 'good' | 'muted' | 'warn' | 'bad'> = {
  completed: 'muted'
 }

-const PILL_TONE: Record<'good' | 'muted' | 'warn' | 'bad', string> = {
-  good: 'bg-primary/10 text-primary',
-  muted: 'bg-muted text-muted-foreground',
-  warn: 'bg-amber-500/10 text-amber-600 dark:text-amber-300',
-  bad: 'bg-destructive/10 text-destructive'
-}
-
 const asText = (value: unknown): string => (typeof value === 'string' ? value : '')

 const truncate = (value: string, max = 80): string => (value.length > max ? `${value.slice(0, max)}…` : value)
@@ -321,7 +327,7 @@ export function CronView({ onClose, onOpenSession, setStatusbarItemGroup: _setSt

    pendingScrollRef.current = null
    requestAnimationFrame(() => {
-      document.querySelector(`[data-cron-row="${CSS.escape(target)}"]`)?.scrollIntoView({ block: 'nearest' })
+      document.querySelector(`[data-panel-row="${CSS.escape(target)}"]`)?.scrollIntoView({ block: 'nearest' })
    })
  }, [selectedJob])

@@ -406,60 +412,66 @@ export function CronView({ onClose, onOpenSession, setStatusbarItemGroup: _setSt
  }

  return (
-    <OverlayView closeLabel={c.close} onClose={onClose}>
+    <Panel closeLabel={c.close} onClose={onClose}>
      {loading && jobs.length === 0 ? (
        <PageLoader label={c.loading} />
+      ) : totalCount === 0 ? (
+        <PanelEmpty
+          action={
+            <Button onClick={() => setEditor({ mode: 'create' })} size="sm">
+              {c.newCron}
+            </Button>
+          }
+          description={c.emptyDescNew}
+          icon="watch"
+          title={c.emptyTitleNew}
+        />
      ) : (
-        <OverlaySplitLayout>
-          <OverlaySidebar>
-            <OverlayNewButton label={c.newCron} onClick={() => setEditor({ mode: 'create' })} />
-            {totalCount > 0 && (
-              <SearchField
-                aria-label={c.search}
-                containerClassName="mb-1 w-full px-2"
-                onChange={setQuery}
-                placeholder={c.search}
-                value={query}
-              />
-            )}
-            {visibleJobs.map(job => (
-              <CronJobListRow
-                active={selectedJob?.id === job.id}
-                c={c}
-                job={job}
-                key={job.id}
-                onSelect={() => setSelectedJobId(job.id)}
-              />
-            ))}
-            {visibleJobs.length === 0 && (
-              <p className="px-2 py-4 text-center text-xs text-muted-foreground">
-                {totalCount === 0 ? c.emptyTitleNew : c.emptyTitleSearch}
-              </p>
-            )}
-          </OverlaySidebar>
+        <>
+          <PanelHeader subtitle={c.count(totalCount)} title={c.title} />
+          <PanelBody>
+            <PanelList
+              onSearchChange={setQuery}
+              searchLabel={c.search}
+              searchPlaceholder={c.search}
+              searchValue={query}
+            >
+              {visibleJobs.map(job => (
+                <CronJobListRow
+                  active={selectedJob?.id === job.id}
+                  job={job}
+                  key={job.id}
+                  menu={
+                    <PanelRowMenu
+                      items={[
+                        { icon: 'edit', label: c.edit, onSelect: () => setEditor({ mode: 'edit', job }) },
+                        { icon: 'trash', label: t.common.delete, onSelect: () => setPendingDelete(job), tone: 'danger' }
+                      ]}
+                    />
+                  }
+                  onSelect={() => setSelectedJobId(job.id)}
+                />
+              ))}
+              {visibleJobs.length === 0 && (
+                <p className="px-2 py-4 text-center text-xs text-muted-foreground">{c.emptyTitleSearch}</p>
+              )}
+              <PanelAddButton label={c.newCron} onClick={() => setEditor({ mode: 'create' })} />
+            </PanelList>

-          <OverlayMain className="px-0">
            {selectedJob ? (
              <CronJobDetail
                busy={busyJobId === selectedJob.id}
                c={c}
                job={selectedJob}
-                onDelete={() => setPendingDelete(selectedJob)}
-                onEdit={() => setEditor({ mode: 'edit', job: selectedJob })}
                onOpenSession={onOpenSession}
                onPauseResume={() => void handlePauseResume(selectedJob)}
                onTrigger={() => void handleTrigger(selectedJob)}
              />
            ) : (
-              <div className="grid h-full place-items-center px-6 py-12 text-center text-sm text-muted-foreground">
-                <div>
-                  <Clock className="mx-auto size-6 text-muted-foreground/60" />
-                  <p className="mt-3">{totalCount === 0 ? c.emptyDescNew : c.emptyDescSearch}</p>
-                </div>
-              </div>
+              <PanelEmpty description={c.emptyDescSearch} icon="search" />
            )}
-          </OverlayMain>
-        </OverlaySplitLayout>
+          </PanelBody>
+        </>
      )}

      <CronEditorDialog editor={editor} onClose={() => setEditor({ mode: 'closed' })} onSave={handleEditorSave} />
@@ -488,42 +500,32 @@ export function CronView({ onClose, onOpenSession, setStatusbarItemGroup: _setSt
          </DialogFooter>
        </DialogContent>
      </Dialog>
-    </OverlayView>
+    </Panel>
  )
 }

 function CronJobListRow({
  active,
-  c,
  job,
+  menu,
  onSelect
 }: {
  active: boolean
-  c: Translations['cron']
  job: CronJob
+  menu?: React.ReactNode
  onSelect: () => void
 }) {
  const state = jobState(job)

  return (
-    <button
-      className={cn(
-        'flex w-full flex-col items-start gap-0.5 rounded-md px-2 py-1.5 text-left transition-colors',
-        active ? 'bg-accent text-foreground' : 'text-foreground/85 hover:bg-accent/60'
-      )}
-      data-cron-row={job.id}
-      onClick={onSelect}
-      type="button"
-    >
-      <span className="flex w-full items-center gap-2">
-        <span
-          aria-hidden="true"
-          className={cn('size-1.5 shrink-0 rounded-full', STATE_DOT[state] ?? 'bg-muted-foreground')}
-        />
-        <span className="min-w-0 flex-1 truncate text-sm font-medium">{jobTitle(job)}</span>
-      </span>
-      <span className="truncate pl-3.5 text-[0.66rem] text-muted-foreground">{jobScheduleDisplay(job)}</span>
-    </button>
+    <PanelListRow
+      active={active}
+      dotClassName={STATE_DOT[state] ?? 'bg-muted-foreground'}
+      menu={menu}
+      onSelect={onSelect}
+      rowKey={job.id}
+      title={jobTitle(job)}
+    />
  )
 }

@@ -531,8 +533,6 @@ function CronJobDetail({
  busy,
  c,
  job,
-  onDelete,
-  onEdit,
  onOpenSession,
  onPauseResume,
  onTrigger
@@ -540,8 +540,6 @@ function CronJobDetail({
  busy: boolean
  c: Translations['cron']
  job: CronJob
-  onDelete: () => void
-  onEdit: () => void
  onOpenSession?: (sessionId: string) => void
  onPauseResume: () => void
  onTrigger: () => void
@@ -552,69 +550,49 @@ function CronJobDetail({
  const prompt = jobPrompt(job)

  return (
-    <div className="flex h-full min-h-0 flex-col">
-      <div className="min-h-0 flex-1 overflow-y-auto">
-        <div className="mx-auto max-w-2xl space-y-6 px-6 py-6">
-          <header className="space-y-3">
-            <div className="flex flex-wrap items-start justify-between gap-3">
-              <div className="min-w-0 space-y-1">
-                <div className="flex flex-wrap items-center gap-2">
-                  <h3 className="text-xl font-semibold tracking-tight">{jobTitle(job)}</h3>
-                  <StatePill tone={STATE_TONE[state] ?? 'muted'}>{c.states[state] ?? state}</StatePill>
-                  {deliver && deliver !== DEFAULT_DELIVER && (
-                    <StatePill tone="muted">{c.deliveryLabels[deliver] ?? deliver}</StatePill>
-                  )}
-                </div>
-                <div className="flex flex-wrap items-center gap-x-4 gap-y-1 text-[0.7rem] text-muted-foreground">
-                  <span className="inline-flex items-center gap-1">
-                    <Clock className="size-3" />
-                    {jobScheduleDisplay(job)}
-                  </span>
-                  <span>
-                    {c.last} {formatTime(job.last_run_at)}
-                  </span>
-                  <span>
-                    {c.next} {formatTime(job.next_run_at)}
-                  </span>
-                </div>
-              </div>
-              <div className="flex shrink-0 items-center gap-1">
-                <Button disabled={busy} onClick={onPauseResume} size="sm" variant="outline">
-                  <Codicon name={isPaused ? 'play' : 'debug-pause'} size="0.875rem" />
-                  {isPaused ? c.resumeTitle : c.pauseTitle}
-                </Button>
-                <Button disabled={busy} onClick={onTrigger} size="sm" variant="outline">
-                  <Codicon name="zap" size="0.875rem" />
-                  {c.triggerNow}
-                </Button>
-                <Button onClick={onEdit} size="sm" variant="outline">
-                  <Codicon name="edit" size="0.875rem" />
-                  {c.edit}
-                </Button>
-                <Button
-                  className="text-muted-foreground hover:bg-destructive/10 hover:text-destructive"
-                  onClick={onDelete}
-                  size="sm"
-                  variant="ghost"
-                >
-                  <Codicon name="trash" size="0.875rem" />
-                </Button>
-              </div>
-            </div>
-
-            {prompt && <p className="line-clamp-3 text-xs text-muted-foreground">{prompt}</p>}
-            {job.last_error && (
-              <p className="inline-flex items-start gap-1 text-[0.7rem] text-destructive">
-                <AlertTriangle className="mt-px size-3 shrink-0" />
-                <span className="line-clamp-2">{job.last_error}</span>
-              </p>
-            )}
-          </header>
-
-          <CronJobRuns c={c} jobId={job.id} onOpenSession={onOpenSession} />
+    <PanelDetail>
+      <header className="space-y-3">
+        <div className="flex flex-wrap items-start justify-between gap-3">
+          <div className="flex min-w-0 flex-wrap items-center gap-2">
+            <h3 className="text-[0.95rem] font-semibold tracking-tight text-foreground">{jobTitle(job)}</h3>
+            <PanelPill tone={STATE_TONE[state] ?? 'muted'}>{c.states[state] ?? state}</PanelPill>
+          </div>
+          <div className="flex shrink-0 items-center gap-0.5">
+            <PanelAction disabled={busy} icon={isPaused ? 'play' : 'debug-pause'} onClick={onPauseResume}>
+              {isPaused ? c.resumeTitle : c.pauseTitle}
+            </PanelAction>
+            <PanelAction disabled={busy} icon="zap" onClick={onTrigger}>
+              {c.triggerNow}
+            </PanelAction>
+          </div>
        </div>
-      </div>
-    </div>
+
+        <PanelMeta
+          rows={[
+            { label: c.frequencyLabel, value: jobScheduleDisplay(job) },
+            { label: c.last.replace(/[:：]\s*$/, ''), value: formatTime(job.last_run_at) },
+            { label: c.next.replace(/[:：]\s*$/, ''), value: formatTime(job.next_run_at) },
+            { label: c.deliverLabel, value: c.deliveryLabels[deliver] ?? deliver }
+          ]}
+        />
+
+        {job.last_error ? (
+          <div className="flex items-start gap-1.5 rounded bg-destructive/10 p-2 text-[0.7rem] text-destructive">
+            <AlertTriangle className="mt-px size-3 shrink-0" />
+            <span className="min-w-0 break-words">{job.last_error}</span>
+          </div>
+        ) : null}
+      </header>
+
+      {prompt ? (
+        <section className="space-y-1.5">
+          <PanelSectionLabel>{c.promptLabel}</PanelSectionLabel>
+          <PanelBlock>{prompt}</PanelBlock>
+        </section>
+      ) : null}
+
+      <CronJobRuns c={c} jobId={job.id} onOpenSession={onOpenSession} />
+    </PanelDetail>
  )
 }

@@ -685,10 +663,10 @@ function CronJobRuns({

  return (
    <div>
-      <div className="mb-1.5 text-[0.62rem] font-medium uppercase tracking-wide text-muted-foreground">
+      <PanelSectionLabel className="mb-1.5">
        {c.runHistory}
        {runs && runs.length > 0 ? ` · ${runs.length}` : ''}
-      </div>
+      </PanelSectionLabel>
      {runs === null ? (
        <div className="flex items-center gap-1.5 py-1 text-xs text-muted-foreground">
          <Codicon name="loading" size="0.75rem" spinning />
@@ -699,13 +677,13 @@ function CronJobRuns({
        <div className="flex flex-col gap-px">
          {runs.map(run => (
            <button
-              className="flex items-center justify-between gap-3 rounded-md px-2 py-1 text-left text-xs hover:bg-(--chrome-action-hover) focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring/40"
+              className="flex items-center justify-between gap-3 rounded-md px-2 py-1 text-left text-xs transition-colors duration-100 hover:bg-(--ui-row-hover-background) focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring/40"
              key={run.id}
              onClick={() => onOpenSession?.(run.id)}
              type="button"
            >
-              <span className="truncate text-foreground">{run.title?.trim() || run.preview?.trim() || run.id}</span>
-              <span className="shrink-0 text-[0.62rem] text-muted-foreground tabular-nums">
+              <span className="truncate text-foreground/85">{run.title?.trim() || run.preview?.trim() || run.id}</span>
+              <span className="shrink-0 text-[0.62rem] text-muted-foreground/55 tabular-nums">
                {formatRunTime(run.last_active || run.started_at)}
              </span>
            </button>
@@ -716,16 +694,6 @@ function CronJobRuns({
  )
 }

-function StatePill({ children, tone }: { children: string; tone: keyof typeof PILL_TONE }) {
-  return (
-    <span
-      className={cn('inline-flex items-center rounded-full px-1.5 py-0.5 text-[0.64rem] capitalize', PILL_TONE[tone])}
-    >
-      {children}
-    </span>
-  )
-}
-
 function CronEditorDialog({
  editor,
  onClose,
--- a/apps/desktop/src/app/desktop-controller.tsx
+++ b/apps/desktop/src/app/desktop-controller.tsx
@@ -10,6 +10,7 @@ import { GatewayConnectingOverlay } from '@/components/gateway-connecting-overla
 import { Pane, PaneMain } from '@/components/pane-shell'
 import { RemoteDisplayBanner } from '@/components/remote-display-banner'
 import { useMediaQuery } from '@/hooks/use-media-query'
+import { isFocusWithin } from '@/lib/keybinds/combo'
 import { cn } from '@/lib/utils'
 import { useSkinCommand } from '@/themes/use-skin-command'

@@ -124,9 +125,12 @@ import { ModelVisibilityOverlay } from './model-visibility-overlay'
 import { PetGenerateOverlay } from './pet-generate/pet-generate-overlay'
 import { RightSidebarPane } from './right-sidebar'
 import { FileActionDialogs } from './right-sidebar/file-actions'
+import { RemoteFolderPicker } from './right-sidebar/files/remote-picker'
 import { ReviewPane } from './right-sidebar/review'
 import { $terminalTakeover } from './right-sidebar/store'
-import { PersistentTerminal, TerminalSlot } from './right-sidebar/terminal/persistent'
+import { TerminalPaneChrome } from './right-sidebar/terminal/chrome'
+import { PersistentTerminal } from './right-sidebar/terminal/persistent'
+import { closeActiveTerminal } from './right-sidebar/terminal/terminals'
 import { CRON_ROUTE, NEW_CHAT_ROUTE, routeSessionId, sessionRoute, SETTINGS_ROUTE } from './routes'
 import { SessionPickerOverlay } from './session-picker-overlay'
 import { SessionSwitcher } from './session-switcher'
@@ -387,11 +391,25 @@ export function DesktopController() {

  useEffect(() => {
    const onKeyDown = (event: KeyboardEvent) => {
-      if (!$filePreviewTarget.get() && !$previewTarget.get()) {
+      if (event.altKey || event.shiftKey || (!event.metaKey && !event.ctrlKey) || event.key.toLowerCase() !== 'w') {
        return
      }

-      if ((event.metaKey || event.ctrlKey) && !event.altKey && !event.shiftKey && event.key.toLowerCase() === 'w') {
+      // Terminal focused: ⌘W closes the active terminal. Ctrl+W is left untouched
+      // for the shell's werase, and nothing else may steal ⌘/Ctrl+W from a
+      // focused terminal (so it never closes a preview tab out from under it).
+      if (isFocusWithin('[data-terminal]')) {
+        if (event.metaKey && !event.ctrlKey) {
+          event.preventDefault()
+          event.stopPropagation()
+          closeActiveTerminal()
+        }
+
+        return
+      }
+
+      // Otherwise ⌘/Ctrl+W closes the active preview tab when one is open.
+      if ($filePreviewTarget.get() || $previewTarget.get()) {
        event.preventDefault()
        event.stopPropagation()
        closeActiveRightRailTab()
@@ -580,7 +598,7 @@ export function DesktopController() {
    }
  }, [])

-  const { gatewayLogLines, inferenceStatus, statusSnapshot } = useStatusSnapshot(gatewayState, requestGateway)
+  const { inferenceStatus, statusSnapshot } = useStatusSnapshot(gatewayState, requestGateway)

  const updateActiveSessionRuntimeInfo = useCallback(
    (info: { branch?: string; cwd?: string }) => {
@@ -1060,7 +1078,6 @@ export function DesktopController() {
    commandCenterOpen,
    extraLeftItems: statusbarItemGroups.flat.left,
    extraRightItems: statusbarItemGroups.flat.right,
-    gatewayLogLines,
    gatewayState,
    inferenceStatus,
    openAgents,
@@ -1095,11 +1112,13 @@ export function DesktopController() {
    />
  )

-  // One PTY-backed terminal mounted forever; <TerminalSlot /> placeholders decide
-  // where it shows. Lives in main's stacking context (not the root overlay layer)
-  // so pane resize handles still paint above it. Toggling never rebuilds the shell.
+  // The persistent xterm layer (one host per terminal tab), CSS-overlaid onto the
+  // pane's <TerminalSlot />. Lives in main's stacking context (not the root overlay
+  // layer) so pane resize handles still paint above it. Terminals own their state
+  // (incl. a snapshotted cwd) independent of the session, so switching sessions
+  // never rebuilds or closes them; toggling the pane never rebuilds the shells.
  const mainOverlays = (
-    <PersistentTerminal cwd={currentCwd} onAddSelectionToChat={composer.addTerminalSelectionAttachment} />
+    <PersistentTerminal onAddSelectionToChat={composer.addTerminalSelectionAttachment} />
  )

  const overlays = (
@@ -1127,6 +1146,7 @@ export function DesktopController() {
      <PetGenerateOverlay />
      <SessionSwitcher />
      <FileActionDialogs />
+      <RemoteFolderPicker />

      {settingsOpen && (
        <Suspense fallback={null}>
@@ -1329,7 +1349,7 @@ export function DesktopController() {
          terminalAsRow ? 'border-l border-(--ui-stroke-secondary) pt-0' : 'pt-(--titlebar-height)'
        )}
      >
-        <TerminalSlot />
+        <TerminalPaneChrome />
      </div>
    </Pane>
  )
--- a/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts
+++ b/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts
@@ -1,10 +1,10 @@
+import { isGatewayReauthRequired, resolveGatewayWsUrl } from '@hermes/shared'
 import { useEffect, useRef } from 'react'

 import type { HermesConnection } from '@/global'
 import { HermesGateway } from '@/hermes'
 import { translateNow } from '@/i18n'
 import { desktopDefaultCwd } from '@/lib/desktop-fs'
-import { isGatewayReauthRequired, resolveGatewayWsUrl } from '@/lib/gateway-ws-url'
 import {
  $desktopBoot,
  applyDesktopBootProgress,
--- a/apps/desktop/src/app/gateway/hooks/use-gateway-request.ts
+++ b/apps/desktop/src/app/gateway/hooks/use-gateway-request.ts
@@ -1,8 +1,8 @@
+import { isGatewayReauthRequired, resolveGatewayWsUrl } from '@hermes/shared'
 import { useStore } from '@nanostores/react'
 import { useCallback, useEffect, useRef } from 'react'

 import type { HermesGateway } from '@/hermes'
-import { isGatewayReauthRequired, resolveGatewayWsUrl } from '@/lib/gateway-ws-url'
 import { $gateway, ensureActiveGatewayOpen, isActivePrimary } from '@/store/gateway'
 import { $activeGatewayProfile } from '@/store/profile'
 import { $gatewayState, setConnection } from '@/store/session'
--- a/apps/desktop/src/app/hooks/use-keybinds.ts
+++ b/apps/desktop/src/app/hooks/use-keybinds.ts
@@ -2,6 +2,7 @@ import { useEffect, useRef } from 'react'
 import { useNavigate } from 'react-router-dom'

 import { $terminalTakeover, setTerminalTakeover } from '@/app/right-sidebar/store'
+import { closeActiveTerminal, createTerminal, cycleTerminal } from '@/app/right-sidebar/terminal/terminals'
 import { PANE_TOGGLE_REVEAL_EVENT } from '@/components/pane-shell'
 import { matchesQuery } from '@/hooks/use-media-query'
 import { PROFILE_SLOT_COUNT, SESSION_SLOT_COUNT } from '@/lib/keybinds/actions'
@@ -164,6 +165,17 @@ export function useKeybinds(deps: KeybindRuntimeDeps): void {
    'view.toggleReview': toggleReview,
    'view.showFiles': showFiles,
    'view.showTerminal': () => setTerminalTakeover(!$terminalTakeover.get()),
+    // Create first so the pane's open-effect ensure sees a non-empty set and
+    // doesn't also spawn one — net effect is exactly one fresh terminal.
+    'view.newTerminal': () => {
+      createTerminal()
+      setTerminalTakeover(true)
+    },
+    // Switch / close only act while the pane is open (no focus-scoping here, so
+    // this stands in for "terminal is showing").
+    'view.nextTerminal': () => $terminalTakeover.get() && cycleTerminal(1),
+    'view.prevTerminal': () => $terminalTakeover.get() && cycleTerminal(-1),
+    'view.closeTerminal': () => $terminalTakeover.get() && closeActiveTerminal(),
    'view.flipPanes': togglePanesFlipped,

    'appearance.toggleMode': () => setMode(resolvedMode === 'dark' ? 'light' : 'dark'),
--- a/apps/desktop/src/app/messaging/index.test.tsx
+++ b/apps/desktop/src/app/messaging/index.test.tsx
@@ -0,0 +1,89 @@
+// @vitest-environment jsdom
+import { cleanup, fireEvent, render, screen, waitFor } from '@testing-library/react'
+import { MemoryRouter } from 'react-router-dom'
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+
+import type { MessagingPlatformInfo } from '@/types/hermes'
+
+const getMessagingPlatforms = vi.fn()
+const updateMessagingPlatform = vi.fn()
+const openExternalLink = vi.fn()
+
+vi.mock('@/hermes', () => ({
+  getMessagingPlatforms: () => getMessagingPlatforms(),
+  updateMessagingPlatform: (id: string, body: unknown) => updateMessagingPlatform(id, body)
+}))
+
+vi.mock('@/lib/external-link', () => ({
+  openExternalLink: (href: string) => openExternalLink(href)
+}))
+
+vi.mock('@/store/notifications', () => ({
+  notify: vi.fn(),
+  notifyError: vi.fn()
+}))
+
+vi.mock('@/store/system-actions', () => ({
+  runGatewayRestart: vi.fn()
+}))
+
+function platform(patch: Partial<MessagingPlatformInfo> = {}): MessagingPlatformInfo {
+  return {
+    configured: false,
+    description: 'A platform.',
+    docs_url: '',
+    enabled: false,
+    env_vars: [],
+    gateway_running: true,
+    id: 'teams',
+    name: 'Microsoft Teams',
+    state: 'disabled',
+    ...patch
+  }
+}
+
+beforeEach(() => {
+  updateMessagingPlatform.mockResolvedValue({ ok: true, platform: 'teams' })
+})
+
+afterEach(() => {
+  cleanup()
+  vi.clearAllMocks()
+})
+
+async function renderMessaging() {
+  const { MessagingView } = await import('./index')
+
+  return render(
+    <MemoryRouter>
+      <MessagingView />
+    </MemoryRouter>
+  )
+}
+
+describe('MessagingView setup-guide link', () => {
+  it('hides the setup-guide button for a plugin platform with no docs URL', async () => {
+    // Teams (and other plugin platforms) ship an empty docs_url. Rendering an
+    // anchor with href="" let Electron resolve it to the app's own packaged
+    // index.html and fail with an OS "file not found" dialog. The button must
+    // simply not appear when there is no guide to open.
+    getMessagingPlatforms.mockResolvedValue({ platforms: [platform({ docs_url: '' })] })
+
+    await renderMessaging()
+
+    expect((await screen.findAllByText('Microsoft Teams')).length).toBeGreaterThan(0)
+    expect(screen.queryByText('Open setup guide')).toBeNull()
+  })
+
+  it('opens a real docs URL through the validated external opener', async () => {
+    const docsUrl = 'https://hermes-agent.nousresearch.com/docs/user-guide/messaging/teams'
+    getMessagingPlatforms.mockResolvedValue({ platforms: [platform({ docs_url: docsUrl })] })
+
+    await renderMessaging()
+
+    const link = await screen.findByText('Open setup guide')
+    expect(fireEvent.click(link)).toBe(false) // false: onClick called preventDefault, so no native navigation
+
+    await waitFor(() => expect(openExternalLink).toHaveBeenCalledWith(docsUrl))
+  })
+})
--- a/apps/desktop/src/app/messaging/index.tsx
+++ b/apps/desktop/src/app/messaging/index.tsx
@@ -14,6 +14,7 @@ import {
  updateMessagingPlatform
 } from '@/hermes'
 import { type Translations, useI18n } from '@/i18n'
+import { openExternalLink } from '@/lib/external-link'
 import { AlertTriangle, ExternalLink, Save, Trash2 } from '@/lib/icons'
 import { cn } from '@/lib/utils'
 import { notify, notifyError } from '@/store/notifications'
@@ -404,14 +405,31 @@ function PlatformDetail({
            <p className="mt-1 text-[length:var(--conversation-caption-font-size)] leading-(--conversation-caption-line-height) text-(--ui-text-tertiary)">
              {introCopy(platform, m)}
            </p>
-            <div className="mt-3">
-              <Button asChild size="sm" variant="textStrong">
-                <a href={platform.docs_url} rel="noreferrer" target="_blank">
-                  {m.openSetupGuide}
-                  <ExternalLink className="size-3.5" />
-                </a>
-              </Button>
-            </div>
+            {platform.docs_url && (
+              <div className="mt-3">
+                <Button asChild size="sm" variant="textStrong">
+                  <a
+                    href={platform.docs_url}
+                    onClick={event => {
+                      // Route through the validated external opener instead of
+                      // letting Electron resolve the anchor. A packaged build's
+                      // empty/relative href resolves to the app's own
+                      // index.html file path, which shell.openPath then fails to
+                      // open ("file not found"). Plugin platforms (Teams, etc.)
+                      // ship no docs_url, so this guard + handler keeps the
+                      // button from ever pointing at a local bundle path.
+                      event.preventDefault()
+                      openExternalLink(platform.docs_url)
+                    }}
+                    rel="noreferrer"
+                    target="_blank"
+                  >
+                    {m.openSetupGuide}
+                    <ExternalLink className="size-3.5" />
+                  </a>
+                </Button>
+              </div>
+            )}
          </section>

          <section>
--- a/apps/desktop/src/app/overlays/overlay-chrome.tsx
+++ b/apps/desktop/src/app/overlays/overlay-chrome.tsx
@@ -1,26 +1,11 @@
-import type { ButtonHTMLAttributes, ComponentProps, ReactNode } from 'react'
+import type { ButtonHTMLAttributes, ReactNode } from 'react'

 import { cn } from '@/lib/utils'

-export const overlayCardClass =
-  'rounded-lg border border-[color-mix(in_srgb,var(--dt-border)_52%,transparent)] bg-[color-mix(in_srgb,var(--dt-card)_72%,transparent)] shadow-[inset_0_0.0625rem_0_color-mix(in_srgb,white_34%,transparent)]'
-
-interface OverlayCardProps extends ComponentProps<'div'> {
-  children: ReactNode
-}
-
 interface OverlayActionButtonProps extends ButtonHTMLAttributes<HTMLButtonElement> {
  tone?: 'default' | 'danger' | 'subtle'
 }

-export function OverlayCard({ children, className, ...props }: OverlayCardProps) {
-  return (
-    <div className={cn(overlayCardClass, className)} {...props}>
-      {children}
-    </div>
-  )
-}
-
 export function OverlayActionButton({
  children,
  className,
--- a/apps/desktop/src/app/overlays/overlay-search-input.tsx
+++ b/apps/desktop/src/app/overlays/overlay-search-input.tsx
@@ -1,33 +0,0 @@
-import type { RefObject } from 'react'
-
-import { SearchField } from '@/components/ui/search-field'
-
-interface OverlaySearchInputProps {
-  containerClassName?: string
-  inputRef?: RefObject<HTMLInputElement | null>
-  loading?: boolean
-  onChange: (value: string) => void
-  placeholder: string
-  value: string
-}
-
-// Borderless underline search — matches the tools/skills page (PageSearchShell).
-export function OverlaySearchInput({
-  containerClassName,
-  inputRef,
-  loading = false,
-  onChange,
-  placeholder,
-  value
-}: OverlaySearchInputProps) {
-  return (
-    <SearchField
-      containerClassName={containerClassName}
-      inputRef={inputRef}
-      loading={loading}
-      onChange={onChange}
-      placeholder={placeholder}
-      value={value}
-    />
-  )
-}
--- a/apps/desktop/src/app/overlays/overlay-split-layout.tsx
+++ b/apps/desktop/src/app/overlays/overlay-split-layout.tsx
@@ -1,7 +1,5 @@
 import type { ReactNode } from 'react'

-import { Button } from '@/components/ui/button'
-import { Codicon } from '@/components/ui/codicon'
 import type { IconComponent } from '@/lib/icons'
 import { cn } from '@/lib/utils'

@@ -50,9 +48,10 @@ export function OverlaySidebar({ children, className }: OverlaySidebarProps) {
  return (
    <aside
      className={cn(
-        // pt clears the floating titlebar/header; the bg itself fills from the
-        // card's top edge so there's no surface-colored gap above the sidebar.
-        'flex min-h-0 flex-col gap-0.5 overflow-y-auto bg-(--ui-sidebar-surface-background) px-2.5 pb-3 pt-[calc(var(--titlebar-height)+1rem)]',
+        // pt clears the in-card close button (the OverlayView now insets the
+        // whole card below the OS titlebar); the bg fills from the card's top
+        // edge so there's no surface-colored gap above the sidebar.
+        'flex min-h-0 flex-col gap-0.5 overflow-y-auto bg-(--ui-sidebar-surface-background) px-2.5 pb-3 pt-[calc(var(--titlebar-height)/2+1rem)]',
        className
      )}
    >
@@ -65,7 +64,7 @@ export function OverlayMain({ children, className }: OverlayMainProps) {
  return (
    <main
      className={cn(
-        'flex min-h-0 flex-1 flex-col overflow-hidden bg-transparent pb-3 pt-[calc(var(--titlebar-height)+1rem)]',
+        'flex min-h-0 flex-1 flex-col overflow-hidden bg-transparent pb-3 pt-[calc(var(--titlebar-height)/2+1rem)]',
        PAGE_INSET_X,
        className
      )}
@@ -75,31 +74,6 @@ export function OverlayMain({ children, className }: OverlayMainProps) {
  )
 }

-// Boxless "+ New …" action that tops an OverlaySidebar list (profiles, cron, …).
-// The text variant underlines on hover, which also strokes the icon glyph — so
-// we keep the button itself underline-free and underline only the label span.
-export function OverlayNewButton({
-  icon = 'add',
-  label,
-  onClick
-}: {
-  icon?: string
-  label: string
-  onClick: () => void
-}) {
-  return (
-    <Button
-      className="group mb-1 w-full justify-start gap-2 text-muted-foreground hover:bg-transparent hover:text-foreground"
-      onClick={onClick}
-      size="sm"
-      variant="ghost"
-    >
-      <Codicon name={icon} />
-      <span className="underline-offset-4 group-hover:underline">{label}</span>
-    </Button>
-  )
-}
-
 export function OverlayNavItem({ active, icon: Icon, label, nested, onClick, trailing }: OverlayNavItemProps) {
  return (
    <button
--- a/apps/desktop/src/app/overlays/overlay-view.tsx
+++ b/apps/desktop/src/app/overlays/overlay-view.tsx
@@ -49,7 +49,15 @@ export function OverlayView({

  return (
    <div
-      className="fixed inset-0 z-50 bg-black/22 p-3 backdrop-blur-[0.125rem] sm:p-6"
+      className={cn(
+        'fixed inset-0 z-50 bg-black/22 backdrop-blur-[0.125rem]',
+        // Equidistant inset on every side. The top value is driven by the
+        // titlebar height so the card clears the OS traffic-lights vertically;
+        // since the card top already sits below them, the left needs no extra
+        // inset — keeping all sides equal so the card is ~full-width at any size.
+        'p-[calc(var(--titlebar-height)+0.625rem)]',
+        'sm:p-[calc(var(--titlebar-height)+0.875rem)]'
+      )}
      onClick={event => {
        if (event.target === event.currentTarget) {
          closeOverlay()
--- a/apps/desktop/src/app/overlays/panel.tsx
+++ b/apps/desktop/src/app/overlays/panel.tsx
@@ -0,0 +1,377 @@
+import type { ReactNode } from 'react'
+
+import { Button } from '@/components/ui/button'
+import { Codicon } from '@/components/ui/codicon'
+import { DropdownMenu, DropdownMenuContent, DropdownMenuItem, DropdownMenuTrigger } from '@/components/ui/dropdown-menu'
+import { SearchField } from '@/components/ui/search-field'
+import { translateNow } from '@/i18n'
+import { cn } from '@/lib/utils'
+
+import { OverlayView } from './overlay-view'
+
+// Overlay "panel" primitive — the centered, capped card + framed chrome lifted
+// straight from the trace / agents overlay so every non-settings overlay (cron,
+// profiles, …) speaks the same visual language: tight type scale, muted
+// opacities, NO container borders (rows separate via the row-hover/active bg
+// vars + gaps, exactly like the trace waterfall labels).
+//
+// Compose it as:
+//   <Panel onClose>
+//     <PanelHeader title subtitle actions={…} />
+//     <PanelBody>                 // master/detail row
+//       <PanelList>…</PanelList>
+//       <PanelDetail>…</PanelDetail>
+//     </PanelBody>
+//   </Panel>
+//
+// Single-column views drop their content straight after the header.
+
+interface PanelProps {
+  children: ReactNode
+  // Root layout override (the card already fills the equidistant inset).
+  className?: string
+  closeLabel?: string
+  contentClassName?: string
+  onClose: () => void
+}
+
+// Overlay shell for panel views; closeLabel defaults to the translated 'common.close'.
+export function Panel(props: PanelProps) {
+  const { children, className, closeLabel = translateNow('common.close'), contentClassName, onClose } = props
+
+  // Top pad aligns the header title's center with the floating close button
+  // (which sits at 0.1875rem + titlebar/2, -translate-y-1/2). The X is
+  // absolute so it costs no layout space — the header rides up next to it.
+  const contentClasses = cn(
+    'flex h-full min-h-0 flex-col px-4 pb-4 pt-[calc(var(--titlebar-height)/2-0.4375rem)] sm:px-5',
+    contentClassName
+  )
+  const rootClasses = cn('flex h-full w-full flex-col', className)
+
+  return (
+    <OverlayView
+      closeLabel={closeLabel}
+      contentClassName={contentClasses}
+      onClose={onClose}
+      rootClassName={rootClasses}
+    >
+      {children}
+    </OverlayView>
+  )
+}
+
+interface PanelHeaderProps {
+  actions?: ReactNode // Right-aligned controls (search, "+ New", segmented control, …).
+  subtitle?: ReactNode
+  title: ReactNode
+}
+
+// Header row: title with an optional truncated subtitle, plus optional trailing actions.
+export function PanelHeader(props: PanelHeaderProps) {
+  return (
+    <header className="mb-3 flex shrink-0 items-start justify-between gap-3">
+      <div className="min-w-0">
+        <h2 className="text-sm font-semibold text-foreground">{props.title}</h2>
+        {props.subtitle ? <p className="truncate text-xs text-muted-foreground/80">{props.subtitle}</p> : null}
+      </div>
+      {props.actions ? <div className="flex shrink-0 items-center gap-1.5">{props.actions}</div> : null}
+    </header>
+  )
+}
+
+export function PanelBody(props: { children: ReactNode; className?: string }) {
+  return <div className={cn('flex min-h-0 flex-1 gap-5 overflow-hidden', props.className)}>{props.children}</div>
+}
+
+interface PanelListProps {
+  children: ReactNode
+  className?: string
+  // Pass an onSearchChange to bake a full-bleed filter field in above the items
+  // (pinned; the rows scroll under it). Controlled via searchValue.
+  onSearchChange?: (value: string) => void
+  searchLabel?: string
+  searchPlaceholder?: string
+  searchValue?: string
+}
+
+// Left master list. Dense + borderless, like the trace waterfall's label tree:
+// single-line rows that touch, separated from the detail only by the body gap.
+// The column is a fixed w-52 and only its inner row container scrolls. When
+// onSearchChange is given, a SearchField renders above that container; its
+// aria-label falls back from searchLabel to searchPlaceholder to ''.
+export function PanelList(props: PanelListProps) {
+  const { children, className, onSearchChange, searchLabel, searchPlaceholder, searchValue } = props
+
+  // No change handler means no filter field at all.
+  const searchField = onSearchChange ? (
+    <SearchField
+      aria-label={searchLabel ?? searchPlaceholder ?? ''}
+      containerClassName="mb-1 w-full shrink-0"
+      onChange={onSearchChange}
+      placeholder={searchPlaceholder ?? ''}
+      value={searchValue ?? ''}
+    />
+  ) : null
+
+  return (
+    <div className={cn('flex w-52 shrink-0 flex-col', className)}>
+      {searchField}
+      <div className="flex min-h-0 flex-1 flex-col overflow-y-auto overscroll-contain">{children}</div>
+    </div>
+  )
+}
+
+interface PanelListRowProps {
+  active: boolean
+  // Leading status dot color class (e.g. 'bg-emerald-500'); omit for none.
+  dotClassName?: string
+  // Leading codicon glyph name (used when there's no lead/dot).
+  icon?: string
+  // Custom leading element (colored swatch, avatar, …). Wins over dot/icon.
+  lead?: ReactNode
+  // Trailing per-row kebab menu (pass a <PanelRowMenu/>). Reveals on hover/focus.
+  menu?: ReactNode
+  // Short always-visible trailing meta (a tag/time, like the trace label's duration).
+  meta?: ReactNode
+  onSelect: () => void
+  rowKey?: string
+  title: ReactNode
+}
+
+// The row is a plain container rather than a <button> so the select target
+// and the kebab menu can sit side by side without nesting interactive
+// elements. Hover/active backgrounds live on the wrapper so the row highlights as one.
+export function PanelListRow(props: PanelListRowProps) {
+  const { active, dotClassName, icon, lead, menu, meta, onSelect, rowKey, title } = props
+
+  // Leading slot precedence: explicit lead, then status dot, then codicon, else nothing.
+  let leading: ReactNode = lead
+
+  if (leading === null || leading === undefined) {
+    if (dotClassName) {
+      leading = <span aria-hidden="true" className={cn('size-1.5 shrink-0 rounded-full', dotClassName)} />
+    } else if (icon) {
+      leading = <Codicon className="shrink-0 text-muted-foreground/55" name={icon} size="0.85rem" />
+    } else {
+      leading = null
+    }
+  }
+
+  return (
+    <div
+      className={cn(
+        'group/row relative flex h-7 w-full items-center rounded-md text-[0.78rem] transition-colors duration-100 ease-out',
+        active
+          ? 'bg-(--ui-row-active-background) text-foreground'
+          : 'text-(--ui-text-secondary) hover:bg-(--ui-row-hover-background) hover:text-foreground'
+      )}
+      data-panel-row={rowKey}
+    >
+      <button
+        className="flex h-full min-w-0 flex-1 items-center gap-2 rounded-md pl-2 pr-1 text-left"
+        onClick={onSelect}
+        type="button"
+      >
+        {leading}
+        <span className="min-w-0 flex-1 truncate font-medium text-foreground/85">{title}</span>
+      </button>
+      {meta ? <span className="shrink-0 pr-2 text-[0.62rem] tabular-nums text-muted-foreground/45">{meta}</span> : null}
+      {menu ? <div className="shrink-0 pr-1">{menu}</div> : null}
+    </div>
+  )
+}
+
+export interface PanelMenuItem {
+  disabled?: boolean
+  icon?: string
+  label: string
+  onSelect: () => void
+  tone?: 'danger' | 'default'
+}
+
+// Per-row "⋮" actions menu: a ghost kebab trigger that stays at opacity-0 until
+// the row is hovered, the trigger is focused, or the menu is open. Renders
+// nothing when `items` is empty (e.g. the default profile has no actions).
+// NOTE(review): the 'Actions' default label is not translated — confirm intent.
+export function PanelRowMenu({ items, label = 'Actions' }: { items: PanelMenuItem[]; label?: string }) {
+  if (!items.length) {
+    return null
+  }
+
+  const menuItems = items.map(item => (
+    <DropdownMenuItem
+      disabled={item.disabled}
+      key={item.label}
+      onSelect={item.onSelect}
+      variant={item.tone === 'danger' ? 'destructive' : undefined}
+    >
+      {item.icon ? <Codicon name={item.icon} size="0.875rem" /> : null}
+      <span>{item.label}</span>
+    </DropdownMenuItem>
+  ))
+
+  return (
+    <DropdownMenu>
+      <DropdownMenuTrigger asChild>
+        <Button
+          aria-label={label}
+          className="size-5 rounded-[4px] bg-transparent text-(--ui-text-tertiary) opacity-0 transition-colors duration-100 hover:bg-(--ui-control-active-background) hover:text-foreground focus-visible:opacity-100 focus-visible:ring-0 group-hover/row:opacity-100 data-[state=open]:bg-(--ui-control-active-background) data-[state=open]:text-foreground data-[state=open]:opacity-100 [&_svg]:size-3.5!"
+          size="icon"
+          title={label}
+          variant="ghost"
+        >
+          <Codicon name="kebab-vertical" size="0.875rem" />
+        </Button>
+      </DropdownMenuTrigger>
+      <DropdownMenuContent align="end" className="w-40" sideOffset={6}>{menuItems}</DropdownMenuContent>
+    </DropdownMenu>
+  )
+}
+
+// Scrollable detail column. It flexes to take the remaining body width (there
+// is no right rail here, unlike the trace inspector) and stacks its children.
+export function PanelDetail(props: { children: ReactNode; className?: string }) {
+  return (
+    <div className={cn('min-h-0 flex-1 overflow-y-auto overscroll-contain', props.className)}>
+      <div className="space-y-4 pb-6 pl-1 pr-2">{props.children}</div>
+    </div>
+  )
+}
+
+interface PanelEmptyProps {
+  action?: ReactNode
+  description?: ReactNode
+  icon?: string // Codicon glyph name (e.g. 'hubot', 'warning', 'loading~spin').
+  title?: ReactNode
+}
+
+// Centered placeholder: icon (default 'inbox') with optional title, description, and action.
+export function PanelEmpty({ action, description, icon = 'inbox', title }: PanelEmptyProps) {
+  return (
+    <div className="grid flex-1 place-items-center px-6 py-10 text-center">
+      <div className="flex flex-col items-center gap-2">
+        <Codicon className="text-muted-foreground/50" name={icon} size="1.25rem" />
+        {title ? <p className="text-sm font-medium text-foreground/90">{title}</p> : null}
+        {description ? (
+          <p className="max-w-sm text-xs leading-relaxed text-muted-foreground/70">{description}</p>
+        ) : null}
+        {action ? <div className="mt-2">{action}</div> : null}
+      </div>
+    </div>
+  )
+}
+
+// Small uppercase caption (0.6rem, wide tracking, muted) for titling a section;
+// caller classes are appended after the defaults.
+export function PanelSectionLabel({ children, className }: { children: ReactNode; className?: string }) {
+  const classes = cn('text-[0.6rem] font-medium uppercase tracking-wider text-muted-foreground/50', className)
+
+  return <div className={classes}>{children}</div>
+}
+
+// Inspector-style key/value grid (mirrors the trace span inspector's <dl>).
+export interface PanelMetaRow {
+  label: ReactNode
+  value: ReactNode
+}
+
+// Renders rows as dt/dd pairs in a two-column grid (5rem label column).
+// NOTE(review): string labels double as React keys, so two rows with the same label would collide.
+export function PanelMeta({ className, rows }: { className?: string; rows: PanelMetaRow[] }) {
+  const pairs = rows.map((entry, index) => (
+    <div className="contents" key={typeof entry.label === 'string' ? entry.label : index}>
+      <dt className="truncate text-muted-foreground/55">{entry.label}</dt>
+      <dd className="min-w-0 break-words text-foreground/85">{entry.value}</dd>
+    </div>
+  ))
+
+  return <dl className={cn('grid grid-cols-[5rem_1fr] gap-x-2 gap-y-1 text-[0.7rem]', className)}>{pairs}</dl>
+}
+
+// Monospace content block (job prompt, etc.) — mirrors the inspector's
+// input/output <pre> blocks: subtle bg, no border. Whitespace is preserved
+// but long lines wrap (whitespace-pre-wrap + break-words), and the block
+// scrolls once its content exceeds the max-h-48 cap.
+export function PanelBlock({ children, className }: { children: ReactNode; className?: string }) {
+  // Caller classes are appended after the defaults.
+  // The default classes are kept as one string so the cap/wrap rules stay together.
+  const classes = cn(
+    'max-h-48 overflow-auto whitespace-pre-wrap break-words rounded bg-foreground/5 p-2.5 text-[0.68rem] leading-relaxed text-foreground/80',
+    className
+  )
+
+  return <pre className={classes}>{children}</pre>
+}
+
+export type PanelPillTone = 'bad' | 'good' | 'muted' | 'warn'
+
+const PILL_TONE: Record<PanelPillTone, string> = {
+  bad: 'bg-destructive/10 text-destructive',
+  good: 'bg-primary/10 text-primary',
+  muted: 'bg-foreground/10 text-muted-foreground',
+  warn: 'bg-amber-500/10 text-amber-600 dark:text-amber-300'
+}
+
+// Small rounded status pill. `tone` selects the background/text color pair
+// from PILL_TONE and defaults to 'muted'. The base classes include
+// `capitalize`, so the text transform applies to whatever the children
+// render — including literal strings passed in by callers.
+export function PanelPill({ children, tone = 'muted' }: { children: ReactNode; tone?: PanelPillTone }) {
+  const classes = cn(
+    'inline-flex items-center rounded-full px-1.5 py-0.5 text-[0.62rem] font-medium capitalize',
+    PILL_TONE[tone]
+  )
+
+  return <span className={classes}>{children}</span>
+}
+
+// Self-describing centered "+" that sits as the LAST item in a PanelList.
+// It is icon-only: the label is exposed through aria-label and title, with
+// no visible text. `icon` defaults to 'add'; the button spans the full list
+// width at row height (h-7).
+export function PanelAddButton(props: {
+  icon?: string
+  label: string
+  onClick: () => void
+}) {
+  const { icon = 'add', label, onClick } = props
+
+  return (
+    <Button
+      aria-label={label}
+      className="h-7 w-full shrink-0 justify-center text-muted-foreground/70 hover:bg-(--ui-row-hover-background) hover:text-foreground"
+      onClick={onClick}
+      size="sm"
+      title={label}
+      variant="ghost"
+    >
+      <Codicon name={icon} size="0.875rem" />
+    </Button>
+  )
+}
+
+// Visible ghost action for a detail header (cron pause/resume/trigger, …):
+// a small button that shows a codicon followed by its children. `disabled`
+// and `onClick` are forwarded straight to the underlying Button, and `icon`
+// is required (there is no default glyph).
+export function PanelAction(props: {
+  children: ReactNode
+  disabled?: boolean
+  icon: string
+  onClick: () => void
+}) {
+  const { children, disabled, icon, onClick } = props
+
+  return (
+    <Button
+      className="gap-1.5 text-muted-foreground hover:bg-(--ui-row-hover-background) hover:text-foreground"
+      disabled={disabled}
+      onClick={onClick}
+      size="sm"
+      variant="ghost"
+    >
+      <Codicon name={icon} size="0.875rem" />
+      {children}
+    </Button>
+  )
+}
--- a/apps/desktop/src/app/profiles/index.tsx
+++ b/apps/desktop/src/app/profiles/index.tsx
@@ -1,8 +1,10 @@
+import { useStore } from '@nanostores/react'
 import type * as React from 'react'
 import { useCallback, useEffect, useMemo, useRef, useState } from 'react'

 import { PageLoader } from '@/components/page-loader'
 import { Button } from '@/components/ui/button'
+import { Codicon } from '@/components/ui/codicon'
 import {
  Dialog,
  DialogContent,
@@ -18,21 +20,34 @@ import {
  createProfile,
  deleteProfile,
  getProfiles,
-  getProfileSetupCommand,
  getProfileSoul,
  type ProfileInfo,
  renameProfile,
  updateProfileSoul
 } from '@/hermes'
 import { useI18n } from '@/i18n'
-import { AlertTriangle, Pencil, Save, Terminal, Trash2, Users } from '@/lib/icons'
+import { AlertTriangle, Save } from '@/lib/icons'
+import { profileColorSoft, resolveProfileColor } from '@/lib/profile-color'
 import { slug } from '@/lib/sanitize'
 import { cn } from '@/lib/utils'
 import { notify, notifyError } from '@/store/notifications'
+import { $profileColors } from '@/store/profile'

 import { useRefreshHotkey } from '../hooks/use-refresh-hotkey'
-import { OverlayMain, OverlayNewButton, OverlaySidebar, OverlaySplitLayout } from '../overlays/overlay-split-layout'
-import { OverlayView } from '../overlays/overlay-view'
+import {
+  Panel,
+  PanelAddButton,
+  PanelBody,
+  PanelDetail,
+  PanelEmpty,
+  PanelHeader,
+  PanelList,
+  PanelListRow,
+  PanelMeta,
+  PanelPill,
+  PanelRowMenu,
+  PanelSectionLabel
+} from '../overlays/panel'

 const PROFILE_NAME_RE = /^[a-z0-9][a-z0-9_-]{0,63}$/

@@ -49,7 +64,9 @@ export function ProfilesView({ onClose }: ProfilesViewProps) {
  const p = t.profiles
  const [profiles, setProfiles] = useState<null | ProfileInfo[]>(null)
  const [selectedName, setSelectedName] = useState<null | string>(null)
+  const [query, setQuery] = useState('')
  const [createOpen, setCreateOpen] = useState(false)
+  const [pendingRename, setPendingRename] = useState<null | ProfileInfo>(null)
  const [pendingDelete, setPendingDelete] = useState<null | ProfileInfo>(null)
  const [deleting, setDeleting] = useState(false)

@@ -83,6 +100,18 @@ export function ProfilesView({ onClose }: ProfilesViewProps) {
    return profiles.find(p => p.name === selectedName) ?? profiles[0] ?? null
  }, [profiles, selectedName])

+  const visibleProfiles = useMemo(() => {
+    const q = query.trim().toLowerCase()
+
+    if (!profiles || !q) {
+      return profiles ?? []
+    }
+
+    return profiles.filter(
+      profile => profile.name.toLowerCase().includes(q) || (profile.model ?? '').toLowerCase().includes(q)
+    )
+  }, [profiles, query])
+
  const handleCreate = useCallback(
    async (name: string, cloneFrom: null | string) => {
      const trimmed = name.trim()
@@ -140,46 +169,79 @@ export function ProfilesView({ onClose }: ProfilesViewProps) {
  }, [p, pendingDelete, refresh])

  return (
-    <OverlayView closeLabel={p.close} onClose={onClose}>
+    <Panel closeLabel={p.close} onClose={onClose}>
      {!profiles ? (
        <PageLoader label={p.loading} />
+      ) : profiles.length === 0 ? (
+        <PanelEmpty
+          action={
+            <Button onClick={() => setCreateOpen(true)} size="sm">
+              {p.newProfile}
+            </Button>
+          }
+          description={p.createDesc}
+          icon="organization"
+          title={p.noProfiles}
+        />
      ) : (
-        <OverlaySplitLayout>
-          <OverlaySidebar>
-            <OverlayNewButton label={p.newProfile} onClick={() => setCreateOpen(true)} />
-            {profiles.map(profile => (
-              <ProfileRow
-                active={selected?.name === profile.name}
-                key={profile.name}
-                onSelect={() => setSelectedName(profile.name)}
-                profile={profile}
-              />
-            ))}
-            {profiles.length === 0 && (
-              <p className="px-2 py-4 text-center text-xs text-muted-foreground">{p.noProfiles}</p>
-            )}
-          </OverlaySidebar>
+        <>
+          <PanelHeader subtitle={p.count(profiles.length)} title={p.title} />
+          <PanelBody>
+            <PanelList
+              onSearchChange={setQuery}
+              searchLabel={p.search}
+              searchPlaceholder={p.search}
+              searchValue={query}
+            >
+              {visibleProfiles.map(profile => (
+                <ProfileRow
+                  active={selected?.name === profile.name}
+                  key={profile.name}
+                  menu={
+                    <PanelRowMenu
+                      items={
+                        profile.is_default
+                          ? []
+                          : [
+                              { icon: 'edit', label: p.rename, onSelect: () => setPendingRename(profile) },
+                              {
+                                icon: 'trash',
+                                label: t.common.delete,
+                                onSelect: () => setPendingDelete(profile),
+                                tone: 'danger'
+                              }
+                            ]
+                      }
+                    />
+                  }
+                  onSelect={() => setSelectedName(profile.name)}
+                  profile={profile}
+                />
+              ))}
+              <PanelAddButton label={p.newProfile} onClick={() => setCreateOpen(true)} />
+            </PanelList>

-          <OverlayMain className="px-0">
            {selected ? (
-              <ProfileDetail
-                key={selected.name}
-                onDelete={() => setPendingDelete(selected)}
-                onRename={newName => handleRename(selected.name, newName)}
-                profile={selected}
-              />
+              <ProfileDetail key={selected.name} profile={selected} />
            ) : (
-              <div className="grid h-full place-items-center px-6 py-12 text-center text-sm text-muted-foreground">
-                <div>
-                  <Users className="mx-auto size-6 text-muted-foreground/60" />
-                  <p className="mt-3">{p.selectPrompt}</p>
-                </div>
-              </div>
+              <PanelEmpty description={p.selectPrompt} icon="account" />
            )}
-          </OverlayMain>
-        </OverlaySplitLayout>
+          </PanelBody>
+        </>
      )}

+      <RenameProfileDialog
+        currentName={pendingRename?.name ?? ''}
+        onClose={() => setPendingRename(null)}
+        onRename={async newName => {
+          if (pendingRename) {
+            await handleRename(pendingRename.name, newName)
+            setPendingRename(null)
+          }
+        }}
+        open={pendingRename !== null}
+      />
+
      <CreateProfileDialog
        onClose={() => setCreateOpen(false)}
        onCreate={async (name, cloneFrom) => handleCreate(name, cloneFrom)}
@@ -213,150 +275,106 @@ export function ProfilesView({ onClose }: ProfilesViewProps) {
          </DialogFooter>
        </DialogContent>
      </Dialog>
-    </OverlayView>
+    </Panel>
  )
 }

-function ProfileRow({ active, onSelect, profile }: { active: boolean; onSelect: () => void; profile: ProfileInfo }) {
-  const { t } = useI18n()
-  const p = t.profiles
-
-  return (
-    <button
-      className={cn(
-        'flex w-full flex-col items-start gap-0.5 rounded-md px-2 py-1.5 text-left transition-colors',
-        active ? 'bg-accent text-foreground' : 'text-foreground/85 hover:bg-accent/60'
-      )}
-      onClick={onSelect}
-      type="button"
-    >
-      <span className="flex w-full items-center justify-between gap-2">
-        <span className="truncate text-sm font-medium">{profile.name}</span>
-        {profile.is_default && <span className="text-[0.6rem] text-primary">{p.default}</span>}
-      </span>
-      <span className="text-[0.66rem] text-muted-foreground">
-        {p.skills(profile.skill_count)}
-        {profile.has_env ? ` · ${p.env}` : ''}
-      </span>
-    </button>
-  )
-}
-
-function ProfileDetail({
-  onDelete,
-  onRename,
+function ProfileRow({
+  active,
+  menu,
+  onSelect,
  profile
 }: {
-  onDelete: () => void
-  onRename: (newName: string) => Promise<void>
+  active: boolean
+  menu?: React.ReactNode
+  onSelect: () => void
  profile: ProfileInfo
 }) {
-  const { t } = useI18n()
-  const p = t.profiles
-  const [renameOpen, setRenameOpen] = useState(false)
-  const [copying, setCopying] = useState(false)
-
-  const handleCopySetup = useCallback(async () => {
-    setCopying(true)
-
-    try {
-      const { command } = await getProfileSetupCommand(profile.name)
-      await navigator.clipboard.writeText(command)
-      notify({ kind: 'success', title: p.setupCopied, message: command })
-    } catch (err) {
-      notifyError(err, p.failedCopy)
-    } finally {
-      setCopying(false)
-    }
-  }, [p, profile.name])
+  const colors = useStore($profileColors)

  return (
-    <div className="flex h-full min-h-0 flex-col">
-      <div className="min-h-0 flex-1 overflow-y-auto">
-        <div className="mx-auto max-w-2xl space-y-6 px-6 py-6">
-          <header className="space-y-3">
-            <div className="flex flex-wrap items-start justify-between gap-3">
-              <div className="min-w-0">
-                <div className="flex flex-wrap items-center gap-2">
-                  <h3 className="text-xl font-semibold tracking-tight">{profile.name}</h3>
-                  {profile.is_default && (
-                    <span className="rounded-full bg-primary/10 px-2 py-0.5 text-[0.65rem] font-medium text-primary">
-                      {p.defaultBadge}
-                    </span>
-                  )}
-                  {profile.has_env && (
-                    <span className="rounded-full bg-muted px-2 py-0.5 text-[0.65rem] font-medium text-muted-foreground">
-                      .env
-                    </span>
-                  )}
-                </div>
-                <p className="mt-1 font-mono text-[0.7rem] text-muted-foreground" title={profile.path}>
-                  {profile.path}
-                </p>
-              </div>
-              <div className="flex shrink-0 items-center gap-1">
-                {!profile.is_default && (
-                  <Button onClick={() => setRenameOpen(true)} size="sm" variant="outline">
-                    <Pencil />
-                    {p.rename}
-                  </Button>
-                )}
-                <Button disabled={copying} onClick={() => void handleCopySetup()} size="sm" variant="outline">
-                  <Terminal />
-                  {copying ? p.copying : p.copySetup}
-                </Button>
-                {!profile.is_default && (
-                  <Button
-                    className="text-muted-foreground hover:bg-destructive/10 hover:text-destructive"
-                    onClick={onDelete}
-                    size="sm"
-                    variant="ghost"
-                  >
-                    <Trash2 />
-                    {t.common.delete}
-                  </Button>
-                )}
-              </div>
-            </div>
-
-            <dl className="grid gap-2 text-xs sm:grid-cols-2">
-              <DetailRow label={p.modelLabel}>
-                {profile.model ? (
-                  <>
-                    <span className="font-mono">{profile.model}</span>
-                    {profile.provider && <span className="text-muted-foreground"> · {profile.provider}</span>}
-                  </>
-                ) : (
-                  <span className="text-muted-foreground">{p.notSet}</span>
-                )}
-              </DetailRow>
-              <DetailRow label={p.skillsLabel}>{profile.skill_count}</DetailRow>
-            </dl>
-          </header>
-
-          <SoulEditor profileName={profile.name} />
-        </div>
-      </div>
-
-      <RenameProfileDialog
-        currentName={profile.name}
-        onClose={() => setRenameOpen(false)}
-        onRename={async newName => {
-          await onRename(newName)
-          setRenameOpen(false)
-        }}
-        open={renameOpen}
-      />
-    </div>
+    <PanelListRow
+      active={active}
+      lead={
+        <ProfileGlyph
+          color={resolveProfileColor(profile.name, colors)}
+          isDefault={profile.is_default}
+          name={profile.name}
+        />
+      }
+      menu={menu}
+      onSelect={onSelect}
+      rowKey={profile.name}
+      title={profile.name}
+    />
  )
 }

-function DetailRow({ children, label }: { children: React.ReactNode; label: string }) {
+// Leading glyph for a profile row, mirroring the sidebar rail: the default
+// profile renders the `home` codicon; every other profile renders a small
+// tinted square holding its initial, drawn in the profile's color when set.
+function ProfileGlyph(props: { color: null | string; isDefault: boolean; name: string }) {
+  const { color, isDefault, name } = props
+
+  if (isDefault) {
+    return <Codicon className="shrink-0 text-muted-foreground/70" name="home" size="0.9rem" />
+  }
+
+  // First ASCII letter or digit of the name, uppercased; '?' when there is none.
+  const firstAlnum = name.replace(/[^a-z0-9]/gi, '').charAt(0)
+  const initial = firstAlnum.toUpperCase() || '?'
+  const fallbackHue = 'var(--ui-text-quaternary)'
+  const swatch = { backgroundColor: profileColorSoft(color ?? fallbackHue, 22), color: color ?? undefined }
+
   return (
-    <div className="flex flex-wrap items-baseline gap-2">
-      <dt className="text-[0.65rem] font-semibold uppercase tracking-[0.12em] text-muted-foreground">{label}</dt>
-      <dd className="text-sm text-foreground">{children}</dd>
-    </div>
+    <span
+      aria-hidden="true"
+      className="grid size-4 shrink-0 place-items-center rounded-[3px] text-[0.5rem] font-semibold uppercase leading-none"
+      style={swatch}
+    >
+      {initial}
+    </span>
+  )
+}
+
+// Profile summary + SOUL.md editor. '.env' is wrapped in normal-case so PanelPill's capitalize can't render ".Env".
+function ProfileDetail({ profile }: { profile: ProfileInfo }) {
+  const p = useI18n().t.profiles
+
+  return (
+    <PanelDetail>
+      <header className="space-y-3">
+        <div className="min-w-0">
+          <div className="flex flex-wrap items-center gap-2">
+            <h3 className="text-[0.95rem] font-semibold tracking-tight text-foreground">{profile.name}</h3>
+            {profile.is_default && <PanelPill tone="good">{p.defaultBadge}</PanelPill>}
+            {profile.has_env && <PanelPill tone="muted"><span className="normal-case">.env</span></PanelPill>}
+          </div>
+          <p className="mt-1 truncate font-mono text-[0.66rem] text-muted-foreground/55" title={profile.path}>
+            {profile.path}
+          </p>
+        </div>
+
+        <PanelMeta
+          rows={[
+            {
+              label: p.modelLabel,
+              value: profile.model ? (
+                <span className="font-mono">
+                  {profile.model}
+                  {profile.provider ? <span className="text-muted-foreground/55"> · {profile.provider}</span> : null}
+                </span>
+              ) : (
+                <span className="text-muted-foreground/55">{p.notSet}</span>
+              )
+            },
+            { label: p.skillsLabel, value: profile.skill_count }
+          ]}
+        />
+      </header>
+
+      <SoulEditor profileName={profile.name} />
+    </PanelDetail>
   )
 }

@@ -419,7 +437,7 @@ function SoulEditor({ profileName }: { profileName: string }) {
    <section className="space-y-2">
      <div className="flex flex-wrap items-baseline justify-between gap-2">
        <div>
-          <h4 className="text-[0.7rem] font-semibold uppercase tracking-[0.14em] text-muted-foreground">SOUL.md</h4>
+          <PanelSectionLabel className="text-[0.7rem] tracking-[0.14em]">SOUL.md</PanelSectionLabel>
          <p className="text-xs text-muted-foreground">{p.soulDesc}</p>
        </div>
        {dirty && <span className="text-[0.65rem] text-muted-foreground">{p.unsavedChanges}</span>}
@@ -429,7 +447,7 @@ function SoulEditor({ profileName }: { profileName: string }) {
        <PageLoader className="min-h-44" label={p.loadingSoul} />
      ) : (
        <Textarea
-          className="min-h-72 font-mono text-xs leading-5"
+          className="min-h-48 font-mono text-xs leading-5"
          onChange={event => setContent(event.target.value)}
          placeholder={isEmpty ? p.emptySoul : undefined}
          value={content}
@@ -437,7 +455,7 @@ function SoulEditor({ profileName }: { profileName: string }) {
      )}

      {error && (
-        <div className="flex items-start gap-2 rounded-md border border-destructive/30 bg-destructive/10 px-3 py-2 text-xs text-destructive">
+        <div className="flex items-start gap-2 rounded bg-destructive/10 px-3 py-2 text-xs text-destructive">
          <AlertTriangle className="mt-0.5 size-3.5 shrink-0" />
          <span>{error}</span>
        </div>
--- a/apps/desktop/src/app/right-sidebar/files/remote-picker.tsx
+++ b/apps/desktop/src/app/right-sidebar/files/remote-picker.tsx
@@ -120,14 +120,14 @@ export function RemoteFolderPicker() {

  return (
    <Dialog onOpenChange={open => !open && close()} open={Boolean(pending)}>
-      <DialogContent className="max-w-lg gap-0 overflow-hidden p-0">
-        <div className="border-b border-border/70 px-4 py-3">
+      <DialogContent className="flex h-[min(36rem,calc(100vh-4rem))] max-w-lg flex-col gap-0 overflow-hidden p-0">
+        <div className="shrink-0 border-b border-border/70 px-4 py-3">
          <DialogTitle className="text-sm">{pending?.title || r.remotePickerTitle}</DialogTitle>
          <DialogDescription className="mt-1 text-xs">{r.remotePickerDescription}</DialogDescription>
        </div>

-        <div className="flex min-h-[22rem] flex-col">
-          <div className="flex flex-wrap items-center gap-1 border-b border-border/50 px-3 py-2 text-xs text-muted-foreground">
+        <div className="flex min-h-0 flex-1 flex-col">
+          <div className="shrink-0 flex flex-wrap items-center gap-1 border-b border-border/50 px-3 py-2 text-xs text-muted-foreground">
            {crumbs.map((crumb, index) => (
              <button
                className={cn(
@@ -166,7 +166,7 @@ export function RemoteFolderPicker() {
          </div>
        </div>

-        <div className="flex items-center justify-between gap-2 border-t border-border/70 px-4 py-3">
+        <div className="shrink-0 flex items-center justify-between gap-2 border-t border-border/70 px-4 py-3">
          <div className="min-w-0 truncate text-xs text-muted-foreground">{currentPath}</div>
          <div className="flex shrink-0 items-center gap-2">
            <Button onClick={() => close()} size="sm" variant="ghost">
--- a/apps/desktop/src/app/right-sidebar/index.tsx
+++ b/apps/desktop/src/app/right-sidebar/index.tsx
@@ -16,7 +16,6 @@ import { $currentCwd } from '@/store/session'

 import { SidebarPanelLabel } from '../shell/sidebar-label'

-import { RemoteFolderPicker } from './files/remote-picker'
 import { ProjectTree } from './files/tree'
 import { useProjectTree } from './files/use-project-tree'

@@ -82,8 +81,6 @@ export function RightSidebarPane({ onActivateFile, onActivateFolder }: RightSide
          : 'border-l shadow-[inset_0.0625rem_0_0_color-mix(in_srgb,white_18%,transparent)]'
      )}
    >
-      <RemoteFolderPicker />
-
      <FilesystemTab
        canCollapse={canCollapse}
        collapseNonce={collapseNonce}
--- a/apps/desktop/src/app/right-sidebar/terminal/agent-terminal-stream.ts
+++ b/apps/desktop/src/app/right-sidebar/terminal/agent-terminal-stream.ts
@@ -0,0 +1,100 @@
+// Live agent-terminal output, pushed from the backend as `agent.terminal.output`
+// events (see tui_gateway `_wire_agent_terminal_output`). Chunks route straight
+// to the matching read-only xterm, keyed by process id — no polling, no tail
+// truncation. A capped per-proc backlog lets a tab opened mid-stream replay what
+// it missed, and lets a closed-then-reopened tab restore its history.
+
+type Writer = (chunk: string) => void
+
+const writers = new Map<string, Writer>()
+const backlog = new Map<string, string>()
+const commandHeaders = new Map<string, string>()
+const lastSnapshots = new Map<string, string>()
+const seededCommands = new Set<string>()
+
+const MAX_BACKLOG = 256_000
+
+/** Register `write` as procId's live sink and replay any buffered backlog into
+ *  it. Returns an unregister that no-ops once a newer writer has replaced it. */
+export function registerAgentTerminalWriter(procId: string, write: Writer): () => void {
+  writers.set(procId, write)
+
+  const history = backlog.get(procId)
+
+  if (history) {
+    write(history)
+  }
+
+  return () => {
+    if (writers.get(procId) === write) {
+      writers.delete(procId)
+    }
+  }
+}
+
+/** Append a streamed chunk: ignore empty ids/chunks, keep only the newest
+ *  MAX_BACKLOG chars buffered for future opens, and forward to a live writer. */
+export function writeAgentTerminalChunk(procId: string, chunk: string): void {
+  if (!procId || !chunk) {
+    return
+  }
+
+  const next = (backlog.get(procId) ?? '') + chunk
+  backlog.set(procId, next.length > MAX_BACKLOG ? next.slice(-MAX_BACKLOG) : next)
+  writers.get(procId)?.(chunk)
+}
+
+/** Write a one-time `$ <command>` header for the proc so an agent terminal never
+ *  opens as an empty void while stdout is pending; repeats and blanks are no-ops. */
+export function seedAgentTerminalCommand(procId: string, command: string): void {
+  const trimmed = command.trim()
+
+  if (!procId || !trimmed || seededCommands.has(procId)) {
+    return
+  }
+
+  seededCommands.add(procId)
+  const header = `$ ${trimmed}\r\n`
+  commandHeaders.set(procId, header)
+  writeAgentTerminalChunk(procId, header)
+}
+
+/** Ingest a full output snapshot from process.list/status-stack: the fallback
+ *  for older/not-yet-restarted gateways and a seed for tabs opened after output
+ *  exists. Appends only the delta when it extends the last snapshot or backlog;
+ *  resets to it otherwise. No prior snapshot ('') never counts as a prefix. */
+export function syncAgentTerminalSnapshot(procId: string, output: string): void {
+  if (!procId || !output) {
+    return
+  }
+
+  const current = backlog.get(procId) ?? ''
+  const header = commandHeaders.get(procId) ?? ''
+  const body = header && current.startsWith(header) ? current.slice(header.length) : current
+  const previous = lastSnapshots.get(procId) ?? ''
+
+  if (output === previous || output === body || body.endsWith(output)) {
+    lastSnapshots.set(procId, output)
+
+    return
+  }
+
+  if (previous && output.startsWith(previous)) {
+    writeAgentTerminalChunk(procId, output.slice(previous.length))
+    lastSnapshots.set(procId, output)
+
+    return
+  }
+
+  if (output.startsWith(body)) {
+    writeAgentTerminalChunk(procId, output.slice(body.length))
+    lastSnapshots.set(procId, output)
+
+    return
+  }
+
+  const next = `${header}${output}`.slice(-MAX_BACKLOG)
+  lastSnapshots.set(procId, output)
+  backlog.set(procId, next)
+  writers.get(procId)?.(`\x1bc${next}`)
+}
--- a/apps/desktop/src/app/right-sidebar/terminal/buffer.ts
+++ b/apps/desktop/src/app/right-sidebar/terminal/buffer.ts
@@ -19,17 +19,32 @@ export interface TerminalReadOptions {

 type Reader = (opts: TerminalReadOptions) => TerminalReadResult

-// The persistent terminal is a singleton (one xterm mounted forever), so a
-// module-level slot is enough — set while the session is live, cleared on
-// dispose. The gateway `terminal.read.request` handler reads through this.
-let activeReader: Reader | null = null
+// Each live terminal registers a reader keyed by its id; a single `activeId`
+// (driven by the tab selection) decides which one the agent's `read_terminal`
+// tool sees. Keying by id keeps switching race-free — a deactivating tab's
+// cleanup can't null out the tab that just activated.
+const readers = new Map<string, Reader>()
+let activeId: string | null = null

-export function setActiveTerminalReader(reader: Reader | null): void {
-  activeReader = reader
+/** Register a live terminal's reader; returns an idempotent unregister. */
+export function registerTerminalReader(id: string, reader: Reader): () => void {
+  readers.set(id, reader)
+
+  return () => {
+    if (readers.get(id) === reader) {
+      readers.delete(id)
+    }
+  }
+}
+
+export function setActiveTerminalId(id: string | null): void {
+  activeId = id
 }

 export function readActiveTerminal(opts: TerminalReadOptions = {}): TerminalReadResult | null {
-  return activeReader ? activeReader(opts) : null
+  const reader = activeId === null ? null : readers.get(activeId)
+
+  return reader ? reader(opts) : null
 }

 export function makeTerminalReader(term: Terminal): Reader {
--- a/apps/desktop/src/app/right-sidebar/terminal/chrome.tsx
+++ b/apps/desktop/src/app/right-sidebar/terminal/chrome.tsx
@@ -0,0 +1,24 @@
+import { useStore } from '@nanostores/react'
+
+import { TerminalSlot } from './persistent'
+import { TerminalRail } from './rail'
+import { $terminals } from './terminals'
+
+/** Pane-side terminal chrome: the body slot (which the persistent overlay chases)
+ *  plus the always-on tab rail. Lives in the real pane DOM — NOT the z-4 terminal
+ *  overlay — so the rail sits above the collapsed sidebars' z-30 hover-reveal
+ *  triggers (z-40, like the thread timeline) and suppresses them while hovered.
+ *  The rail is always shown when a terminal exists (even one), so every tab keeps
+ *  its close affordance; closing the last one hides the pane (reopen re-creates). */
+export function TerminalPaneChrome() {
+  const terminals = useStore($terminals)
+
+  return (
+    <div className="flex min-h-0 min-w-0 flex-1">
+      <div className="relative flex min-h-0 min-w-0 flex-1 flex-col">
+        <TerminalSlot />
+      </div>
+      {terminals.length > 0 && <TerminalRail />}
+    </div>
+  )
+}
--- a/apps/desktop/src/app/right-sidebar/terminal/index.tsx
+++ b/apps/desktop/src/app/right-sidebar/terminal/index.tsx
@@ -1,88 +0,0 @@
-import '@xterm/xterm/css/xterm.css'
-
-import { Button } from '@/components/ui/button'
-import { Codicon } from '@/components/ui/codicon'
-import { KbdCombo } from '@/components/ui/kbd'
-import { Loader } from '@/components/ui/loader'
-import { Tip } from '@/components/ui/tooltip'
-import { useI18n } from '@/i18n'
-
-import { SidebarPanelLabel } from '../../shell/sidebar-label'
-import { setTerminalTakeover } from '../store'
-
-import { useTerminalSession } from './use-terminal-session'
-
-interface TerminalTabProps {
-  cwd: string
-  onAddSelectionToChat: (text: string, label?: string) => void
-}
-
-export function TerminalTab({ cwd, onAddSelectionToChat }: TerminalTabProps) {
-  const { t } = useI18n()
-
-  const { addSelectionToChat, hostRef, selection, selectionStyle, shellName, status } = useTerminalSession({
-    cwd,
-    onAddSelectionToChat
-  })
-
-  const label = t.rightSidebar.terminalHide
-
-  return (
-    <div className="relative flex min-h-0 min-w-0 flex-1 flex-col">
-      <div className="flex h-8 shrink-0 items-center gap-2 px-2.5">
-        <SidebarPanelLabel className="text-(--ui-text-secondary)!">{shellName}</SidebarPanelLabel>
-        <Tip label={label}>
-          <Button
-            aria-label={label}
-            className="ml-auto size-6 rounded-md text-(--ui-text-secondary)!"
-            onClick={() => setTerminalTakeover(false)}
-            size="icon"
-            type="button"
-            variant="ghost"
-          >
-            <Codicon name="close" size="0.875rem" />
-          </Button>
-        </Tip>
-      </div>
-      <div className="relative min-h-0 flex-1 bg-(--ui-editor-surface-background) p-2">
-        {status === 'starting' && (
-          <div className="pointer-events-none absolute inset-0 z-10 grid place-items-center">
-            <Loader
-              className="size-8 text-(--ui-text-tertiary)"
-              pathSteps={180}
-              strokeScale={0.68}
-              type="spiral-search"
-            />
-          </div>
-        )}
-        {selection.trim() && (
-          <div className="absolute z-50 flex items-center gap-1" style={selectionStyle ?? { right: 12, top: 8 }}>
-            <Button
-              className="h-6 rounded-md px-2 text-[0.68rem] shadow-md backdrop-blur-md"
-              onClick={event => event.preventDefault()}
-              onMouseDown={event => {
-                event.preventDefault()
-                event.stopPropagation()
-                addSelectionToChat()
-              }}
-              type="button"
-              variant="secondary"
-            >
-              {t.rightSidebar.addToChat}
-              <KbdCombo className="ml-1 opacity-70" combo="mod+l" size="sm" />
-            </Button>
-          </div>
-        )}
-        {/* Outer div paints terminal inset; inner div is the xterm host so the
-            canvas sizes to the content area and p-2 stays as terminal padding.
-            Screen/viewport inherit the live skin surface so the terminal blends
-            with the app and follows light/dark; the xterm canvas itself is
-            painted the resolved surface color in use-terminal-session. */}
-        <div
-          className="h-full min-h-0 overflow-hidden text-(--ui-text-secondary) [&_.xterm]:h-full [&_.xterm-screen]:bg-(--ui-editor-surface-background)! [&_.xterm-viewport]:bg-(--ui-editor-surface-background)!"
-          ref={hostRef}
-        />
-      </div>
-    </div>
-  )
-}
--- a/apps/desktop/src/app/right-sidebar/terminal/instance.tsx
+++ b/apps/desktop/src/app/right-sidebar/terminal/instance.tsx
@@ -0,0 +1,102 @@
+import '@xterm/xterm/css/xterm.css'
+
+import { Button } from '@/components/ui/button'
+import { KbdCombo } from '@/components/ui/kbd'
+import { Loader } from '@/components/ui/loader'
+import { useI18n } from '@/i18n'
+import { cn } from '@/lib/utils'
+
+import { reportTerminalShell } from './terminals'
+import { useAgentTerminal } from './use-agent-terminal'
+import { useTerminalSession } from './use-terminal-session'
+
+// Absolute-stacked so inactive tabs keep layout size (a display:none host goes
+// 0×0 and renders garbled on re-show); visibility toggles which one is seen.
+const INSTANCE_CLASS = 'absolute inset-0 flex flex-col bg-(--ui-editor-surface-background) px-2 pb-2 pt-0'
+
+interface TerminalInstanceProps {
+  id: string
+  cwd: string
+  active: boolean
+  onAddSelectionToChat: (text: string, label?: string) => void
+  reviveBuffer?: string
+}
+
+/** One persistent xterm+PTY. Every open tab stays mounted (so its shell and
+ *  scrollback survive tab switches); only the active one is shown. */
+export function TerminalInstance({ id, active, cwd, onAddSelectionToChat, reviveBuffer }: TerminalInstanceProps) {
+  const { t } = useI18n()
+
+  const { addSelectionToChat, hostRef, selection, selectionStyle, status } = useTerminalSession({
+    id,
+    cwd,
+    active,
+    onAddSelectionToChat,
+    reviveBuffer,
+    onShell: shell => reportTerminalShell(id, shell)
+  })
+
+  return (
+    <div
+      className={cn(INSTANCE_CLASS, active ? 'visible' : 'invisible pointer-events-none')}
+      // Focus-scope marker so isFocusWithin('[data-terminal]') can route ⌘W here.
+      data-terminal=""
+    >
+      {status === 'starting' && (
+        <div className="pointer-events-none absolute inset-0 z-10 grid place-items-center">
+          <Loader className="size-8 text-(--ui-text-tertiary)" pathSteps={180} strokeScale={0.68} type="spiral-search" />
+        </div>
+      )}
+      {selection.trim() && (
+        <div className="absolute z-50 flex items-center gap-1" style={selectionStyle ?? { right: 12, top: 8 }}>
+          <Button
+            className="h-6 rounded-md px-2 text-[0.68rem] shadow-md backdrop-blur-md"
+            onClick={event => event.preventDefault()}
+            onMouseDown={event => {
+              event.preventDefault()
+              event.stopPropagation()
+              addSelectionToChat()
+            }}
+            type="button"
+            variant="secondary"
+          >
+            {t.rightSidebar.addToChat}
+            <KbdCombo className="ml-1 opacity-70" combo="mod+l" size="sm" />
+          </Button>
+        </div>
+      )}
+      {/* Outer div paints the terminal inset; inner div is the xterm host so the
+          canvas sizes to the content area and px-2/pb-2 stays as terminal padding. */}
+      <div
+        className="h-full min-h-0 overflow-hidden text-(--ui-text-secondary) [&_.xterm]:h-full [&_.xterm-screen]:bg-(--ui-editor-surface-background)! [&_.xterm-viewport]:bg-(--ui-editor-surface-background)!"
+        ref={hostRef}
+      />
+    </div>
+  )
+}
+
+interface AgentTerminalInstanceProps {
+  active: boolean
+  id: string
+  procId: string
+}
+
+/** Read-only mirror of an agent background process — a write-only xterm streamed
+ *  live from the backend output (no PTY, no input). */
+export function AgentTerminalInstance({ active, id, procId }: AgentTerminalInstanceProps) {
+  const { hostRef } = useAgentTerminal({ active, id, procId })
+
+  return (
+    <div
+      className={cn(INSTANCE_CLASS, active ? 'visible' : 'invisible pointer-events-none')}
+      // Same focus-scope marker as the user terminal so isFocusWithin('[data-terminal]')
+      // routes ⌘W here and closes the focused agent tab (not a preview).
+      data-terminal=""
+    >
+      <div
+        className="h-full min-h-0 overflow-hidden text-(--ui-text-secondary) [&_.xterm]:h-full [&_.xterm-screen]:bg-(--ui-editor-surface-background)! [&_.xterm-viewport]:bg-(--ui-editor-surface-background)!"
+        ref={hostRef}
+      />
+    </div>
+  )
+}
--- a/apps/desktop/src/app/right-sidebar/terminal/persistent.tsx
+++ b/apps/desktop/src/app/right-sidebar/terminal/persistent.tsx
@@ -4,7 +4,8 @@ import { type CSSProperties, useEffect, useLayoutEffect, useRef, useState } from

 import { $terminalTakeover } from '../store'

-import { TerminalTab } from './index'
+import { ensureTerminal } from './terminals'
+import { TerminalWorkspace } from './workspace'

 /**
 * One xterm Terminal mounted at the layout root and CSS-overlayed onto
@@ -40,7 +41,6 @@ export function TerminalSlot({ className = SLOT_CLASS }: { className?: string })
 }

 interface PersistentTerminalProps {
-  cwd: string
  onAddSelectionToChat: (text: string, label?: string) => void
 }

@@ -54,12 +54,26 @@ interface Rect {
 const sameRect = (a: Rect | null, b: Rect) =>
  !!a && a.top === b.top && a.left === b.left && a.width === b.width && a.height === b.height

-export function PersistentTerminal({ cwd, onAddSelectionToChat }: PersistentTerminalProps) {
+export function PersistentTerminal({ onAddSelectionToChat }: PersistentTerminalProps) {
  const slot = useStore($slot)
  const terminalTakeover = useStore($terminalTakeover)
  const [rect, setRect] = useState<Rect | null>(null)
  const [ready, setReady] = useState(false)

+  // VS Code parity: once the pane has ever been opened, keep the terminals
+  // mounted — and their shells alive — even while hidden. Hiding the pane just
+  // collapses the slot, so the overlay below goes invisible; nothing is torn
+  // down. Only an explicit per-tab close kills a PTY. Re-opening re-ensures one
+  // terminal exists (covers having closed the last tab).
+  const [mounted, setMounted] = useState(false)
+
+  useEffect(() => {
+    if (terminalTakeover && ready) {
+      setMounted(true)
+      ensureTerminal()
+    }
+  }, [terminalTakeover, ready])
+
  useLayoutEffect(() => {
    if (!slot) {
      setRect(null)
@@ -114,12 +128,12 @@ export function PersistentTerminal({ cwd, onAddSelectionToChat }: PersistentTerm
    contain: 'layout size paint'
  }

-  // Defer mount until the terminal sidebar is open and the slot has real dims.
-  // Booting xterm/node-pty at 0×0 starts the shell at 80×24 and spawns a
-  // visible conhost on Windows even when the pane is collapsed.
+  // Defer the FIRST mount until the pane is open and the slot has real dims —
+  // booting xterm/node-pty at 0×0 starts the shell at 80×24 and spawns a visible
+  // conhost on Windows. After that `mounted` latches: shells persist while hidden.
  return (
    <div aria-hidden={!visible} style={style}>
-      {terminalTakeover && ready && <TerminalTab cwd={cwd} onAddSelectionToChat={onAddSelectionToChat} />}
+      {mounted && <TerminalWorkspace onAddSelectionToChat={onAddSelectionToChat} />}
    </div>
  )
 }
--- a/apps/desktop/src/app/right-sidebar/terminal/rail.tsx
+++ b/apps/desktop/src/app/right-sidebar/terminal/rail.tsx
@@ -0,0 +1,177 @@
+import { useStore } from '@nanostores/react'
+
+import { Codicon } from '@/components/ui/codicon'
+import {
+  ContextMenu,
+  ContextMenuContent,
+  ContextMenuItem,
+  ContextMenuSeparator,
+  ContextMenuTrigger
+} from '@/components/ui/context-menu'
+import { Tip } from '@/components/ui/tooltip'
+import { useI18n } from '@/i18n'
+import { formatCombo } from '@/lib/keybinds/combo'
+import { cn } from '@/lib/utils'
+import { $bindings } from '@/store/keybinds'
+
+import { setTerminalTakeover } from '../store'
+
+import {
+  $activeTerminalId,
+  $terminals,
+  closeAllTerminals,
+  closeOtherTerminals,
+  closeTerminal,
+  createTerminal,
+  selectTerminal,
+  type TerminalEntry
+} from './terminals'
+
+const RAIL_ACTION =
+  'grid size-6 place-items-center rounded text-(--ui-text-tertiary) transition-colors hover:bg-(--chrome-action-hover) hover:text-foreground focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-sidebar-ring [-webkit-app-region:no-drag]'
+
+/** Tooltip label with a trailing hotkey hint (the user's live binding). */
+function hintLabel(text: string, combo?: string) {
+  return combo ? (
+    <span className="flex items-center gap-2">
+      <span>{text}</span>
+      <span className="opacity-55">{formatCombo(combo)}</span>
+    </span>
+  ) : (
+    text
+  )
+}
+
+/** Thin icon "bookmark" strip blended into the terminal surface, shown whenever a
+ *  terminal exists. Each square is a tab (name + hotkey on hover); close via the
+ *  shell's `exit`, middle-click, or the context menu. */
+export function TerminalRail() {
+  const { t } = useI18n()
+  const terminals = useStore($terminals)
+  const activeId = useStore($activeTerminalId)
+  const bindings = useStore($bindings)
+  const toggleHint = bindings['view.showTerminal']?.[0]
+  const newHint = bindings['view.newTerminal']?.[0]
+
+  return (
+    <div
+      className="group/rail relative z-40 flex h-full w-9 shrink-0 flex-col items-center border-l border-(--ui-stroke-quaternary) bg-(--ui-editor-surface-background)"
+      // The rail sits at the pane's outer edge, under the collapsed sidebars'
+      // hover-reveal triggers; mark it so those triggers go pointer-transparent
+      // while it's hovered (see the suppression rules in styles.css) and a reach
+      // for a tab can't drag in the file-browser/review panel.
+      data-suppress-pane-reveal=""
+    >
+      <ul
+        aria-label={t.rightSidebar.terminalsAria}
+        className="flex min-h-0 flex-1 flex-col items-center gap-0.5 self-stretch overflow-y-auto overflow-x-hidden overscroll-contain py-1 [-ms-overflow-style:none] [scrollbar-width:none] [&::-webkit-scrollbar]:hidden"
+        role="tablist"
+      >
+        {terminals.map((term, index) => (
+          <TerminalRailItem
+            active={term.id === activeId}
+            canCloseOthers={terminals.length > 1}
+            index={index}
+            key={term.id}
+            term={term}
+            toggleHint={toggleHint}
+          />
+        ))}
+        <li className="flex w-full justify-center">
+          <Tip label={hintLabel(t.rightSidebar.terminalNew, newHint)} side="left">
+            <button
+              aria-label={t.rightSidebar.terminalNew}
+              className={cn(RAIL_ACTION, 'size-7 text-(--ui-text-quaternary)')}
+              onClick={() => createTerminal()}
+              type="button"
+            >
+              <Codicon name="add" size="0.8125rem" />
+            </button>
+          </Tip>
+        </li>
+      </ul>
+
+      <div className="flex shrink-0 flex-col items-center pb-1.5">
+        <Tip label={t.rightSidebar.terminalHide} side="left">
+          <button
+            aria-label={t.rightSidebar.terminalHide}
+            className={cn(RAIL_ACTION, 'opacity-0 transition-opacity group-hover/rail:opacity-100')}
+            onClick={() => setTerminalTakeover(false)}
+            type="button"
+          >
+            <Codicon name="chevron-down" size="0.8125rem" />
+          </button>
+        </Tip>
+      </div>
+    </div>
+  )
+}
+
+interface TerminalRailItemProps {
+  active: boolean
+  canCloseOthers: boolean
+  index: number
+  term: TerminalEntry
+  toggleHint?: string
+}
+
+function TerminalRailItem({ active, canCloseOthers, index, term, toggleHint }: TerminalRailItemProps) {
+  const { t } = useI18n()
+  const label = `${index + 1}. ${term.title}`
+
+  return (
+    <ContextMenu>
+      <ContextMenuTrigger asChild>
+        <li className="relative flex w-full justify-center [-webkit-app-region:no-drag]">
+          {active && (
+            <span
+              aria-hidden="true"
+              className="absolute inset-y-0.5 right-0 w-0.5 rounded-l-sm bg-(--ui-stroke-primary)"
+            />
+          )}
+          <Tip label={hintLabel(label, toggleHint)} side="left">
+            <button
+              aria-label={label}
+              aria-selected={active}
+              className={cn(
+                'grid size-7 place-items-center rounded-md transition-colors',
+                active
+                  ? 'bg-(--chrome-action-hover) text-foreground'
+                  : 'text-(--ui-text-tertiary) hover:bg-(--chrome-action-hover) hover:text-foreground'
+              )}
+              onAuxClick={event => {
+                if (event.button === 1) {
+                  event.preventDefault()
+                  closeTerminal(term.id)
+                }
+              }}
+              onClick={() => selectTerminal(term.id)}
+              onMouseDown={event => {
+                if (event.button === 1) {
+                  event.preventDefault()
+                }
+              }}
+              role="tab"
+              type="button"
+            >
+              <Codicon
+                className={cn(term.kind === 'agent' && !active && 'text-primary')}
+                name={term.kind === 'agent' ? 'agent' : 'terminal'}
+                size="0.875rem"
+              />
+            </button>
+          </Tip>
+        </li>
+      </ContextMenuTrigger>
+      <ContextMenuContent>
+        <ContextMenuItem onSelect={() => closeTerminal(term.id)}>{t.common.close}</ContextMenuItem>
+        <ContextMenuItem disabled={!canCloseOthers} onSelect={() => closeOtherTerminals(term.id)}>
+          {t.rightSidebar.terminalCloseOthers}
+        </ContextMenuItem>
+        <ContextMenuItem onSelect={closeAllTerminals}>{t.rightSidebar.terminalCloseAll}</ContextMenuItem>
+        <ContextMenuSeparator />
+        <ContextMenuItem onSelect={() => setTerminalTakeover(false)}>{t.rightSidebar.terminalHide}</ContextMenuItem>
+      </ContextMenuContent>
+    </ContextMenu>
+  )
+}
--- a/apps/desktop/src/app/right-sidebar/terminal/terminals.test.ts
+++ b/apps/desktop/src/app/right-sidebar/terminal/terminals.test.ts
@@ -0,0 +1,90 @@
+import { atom } from 'nanostores'
+import { beforeEach, describe, expect, it, vi } from 'vitest'
+
+const STORAGE_KEY = 'hermes.desktop.terminals.v1'
+
+async function loadTerminalStore() {
+  vi.doMock('@/store/session', () => ({
+    $currentCwd: atom('/workspace')
+  }))
+
+  return import('./terminals')
+}
+
+describe('terminal store persistence', () => {
+  beforeEach(() => {
+    window.localStorage.clear()
+    vi.resetModules()
+  })
+
+  it('restores user tabs, active tab, and history on module load', async () => {
+    window.localStorage.setItem(
+      STORAGE_KEY,
+      JSON.stringify({
+        activeTerminalId: 'term-two',
+        terminals: [
+          { auto: false, cwd: '/repo/one', id: 'term-one', reviveBuffer: 'last output', title: 'zsh' },
+          { auto: true, cwd: '/repo/two', id: 'term-two', title: 'Terminal' }
+        ]
+      })
+    )
+
+    const { $activeTerminalId, $terminals } = await loadTerminalStore()
+
+    expect($activeTerminalId.get()).toBe('term-two')
+    expect($terminals.get()).toEqual([
+      { auto: false, cwd: '/repo/one', id: 'term-one', kind: 'user', reviveBuffer: 'last output', title: 'zsh' },
+      { auto: true, cwd: '/repo/two', id: 'term-two', kind: 'user', title: 'Terminal' }
+    ])
+  })
+
+  it('persists user tabs and history synchronously, skipping agent mirrors', async () => {
+    const { createTerminal, ensureAgentTerminal, renameTerminal, selectTerminal, updateTerminalReviveBuffer } =
+      await loadTerminalStore()
+
+    const userId = createTerminal('/repo')
+    renameTerminal(userId, 'server')
+    updateTerminalReviveBuffer(userId, 'recent scrollback')
+    ensureAgentTerminal('proc-1', 'background task')
+    selectTerminal(userId)
+
+    // No flush/tick: persistence is synchronous, so the snapshot is already on
+    // disk (this is what makes app-quit restore reliable).
+    expect(JSON.parse(window.localStorage.getItem(STORAGE_KEY) ?? '{}')).toEqual({
+      activeTerminalId: userId,
+      terminals: [{ auto: false, cwd: '/repo', id: userId, reviveBuffer: 'recent scrollback', title: 'server' }]
+    })
+  })
+
+  it('never attaches a revive buffer to an agent tab', async () => {
+    const { $terminals, ensureAgentTerminal, updateTerminalReviveBuffer } = await loadTerminalStore()
+
+    const agentId = ensureAgentTerminal('proc-1', 'background task')!
+    updateTerminalReviveBuffer(agentId, 'should be ignored')
+
+    expect($terminals.get().find(term => term.id === agentId)?.reviveBuffer).toBeUndefined()
+    expect(window.localStorage.getItem(STORAGE_KEY)).toBeNull()
+  })
+
+  it('tail-trims an oversized revive buffer to stay under the storage budget', async () => {
+    const { $terminals, createTerminal, updateTerminalReviveBuffer } = await loadTerminalStore()
+
+    const userId = createTerminal('/repo')
+    const huge = 'x'.repeat(60_000)
+    updateTerminalReviveBuffer(userId, huge)
+
+    const stored = $terminals.get().find(term => term.id === userId)?.reviveBuffer ?? ''
+    expect(stored.length).toBe(48_000)
+    expect(stored).toBe(huge.slice(-48_000))
+  })
+
+  it('clears remembered tabs when all terminals close', async () => {
+    const { closeAllTerminals, createTerminal } = await loadTerminalStore()
+
+    createTerminal('/repo')
+    expect(window.localStorage.getItem(STORAGE_KEY)).not.toBeNull()
+
+    closeAllTerminals()
+    expect(window.localStorage.getItem(STORAGE_KEY)).toBeNull()
+  })
+})
--- a/apps/desktop/src/app/right-sidebar/terminal/terminals.ts
+++ b/apps/desktop/src/app/right-sidebar/terminal/terminals.ts
@@ -0,0 +1,330 @@
+import { atom, computed } from 'nanostores'
+
+import { readKey, writeKey } from '@/lib/storage'
+import { $currentCwd } from '@/store/session'
+
+import { setTerminalTakeover } from '../store'
+
+import { seedAgentTerminalCommand } from './agent-terminal-stream'
+
+/** One in-app terminal tab. `id` is the renderer-side handle (distinct from the
+ *  PTY session id the main process mints); each user tab owns its own shell. */
+export interface TerminalEntry {
+  id: string
+  /** Display label. `auto` adopts the resolved shell name until the user renames. */
+  title: string
+  auto: boolean // false for agent tabs and after a user rename
+  /** Working directory, snapshotted once at creation. Terminals live outside
+   *  session/project state — the only thing they inherit is this initial cwd
+   *  (the project root if opened in one, else the backend's default). Switching
+   *  sessions never moves or recreates a terminal. */
+  cwd: string // '' for agent tabs
+  /** Serialized xterm scrollback from the last session, replayed on relaunch so
+   *  the tab reopens with its recent history (VS Code parity). Processes are NOT
+   *  revived — a fresh shell starts beneath the restored buffer. Captured live
+   *  for user tabs only; agent mirrors stay runtime-only. */
+  reviveBuffer?: string
+  /** `user` = interactive PTY shell. `agent` = read-only mirror of an agent
+   *  background process (`terminal(background=true)`), keyed by `procId`. */
+  kind: 'user' | 'agent'
+  procId?: string // present on agent tabs only
+}
+
+interface PersistedTerminalEntry { // stored shape of a user tab: TerminalEntry minus `kind` / `procId`
+  auto: boolean
+  cwd: string
+  id: string
+  reviveBuffer?: string // left out when there is no (or an empty) buffer
+  title: string
+}
+
+interface PersistedTerminalState { // JSON document kept under TERMINALS_STORAGE_KEY
+  activeTerminalId: null | string // id of one of `terminals`, or null when there are none
+  terminals: PersistedTerminalEntry[]
+}
+
+const TERMINALS_STORAGE_KEY = 'hermes.desktop.terminals.v1'
+
+// Cap a single tab's replayed history so the persisted layout can't blow the
+// localStorage quota. Roughly mirrors VS Code's persistentSessionScrollback
+// default (100 lines) once the serialized escape codes are counted in.
+const MAX_REVIVE_BUFFER_CHARS = 48_000
+
+function sanitizePersistedTerminal(value: unknown): PersistedTerminalEntry | null {
+  // Only plain objects can be entries; primitives, null, and arrays are rejected.
+  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
+    return null
+  }
+
+  const { auto, cwd, id, reviveBuffer, title } = value as Record<string, unknown>
+  const cleanId = typeof id === 'string' ? id.trim() : ''
+  const cleanTitle = typeof title === 'string' ? title.trim() : ''
+
+  // The id is the tab's identity, so an entry without one is dropped outright.
+  if (cleanId === '') {
+    return null
+  }
+
+  return {
+    auto: typeof auto === 'boolean' ? auto : true,
+    cwd: typeof cwd === 'string' ? cwd : '',
+    id: cleanId,
+    ...(typeof reviveBuffer === 'string' && reviveBuffer ? { reviveBuffer } : {}),
+    title: cleanTitle || 'Terminal'
+  }
+}
+
+function loadPersistedTerminals(): PersistedTerminalState {
+  const empty: PersistedTerminalState = { activeTerminalId: null, terminals: [] }
+  const stored = readKey(TERMINALS_STORAGE_KEY)
+
+  // Anything unreadable (missing key, corrupt JSON, wrong shape) means "no tabs".
+  try {
+    const decoded: unknown = stored ? JSON.parse(stored) : null
+
+    if (typeof decoded !== 'object' || decoded === null || Array.isArray(decoded)) {
+      return empty
+    }
+
+    const { activeTerminalId: wanted, terminals: rawList } = decoded as Record<string, unknown>
+    const terminals: PersistedTerminalEntry[] = []
+
+    for (const candidate of Array.isArray(rawList) ? rawList : []) {
+      const entry = sanitizePersistedTerminal(candidate)
+
+      if (entry) {
+        terminals.push(entry)
+      }
+    }
+
+    // Honor the stored selection only if that tab survived sanitizing.
+    const selected = terminals.find(term => term.id === wanted)?.id ?? terminals[0]?.id ?? null
+
+    return { activeTerminalId: selected, terminals }
+  } catch {
+    return empty
+  }
+}
+
+// Called synchronously on every store change (the app-wide convention — see
+// panes.ts / layout.ts), so the latest snapshot is already written when the
+// renderer tears down and app quit needs no unload hook.
+function persistTerminals(list: readonly TerminalEntry[], activeTerminalId: null | string) {
+  const terminals: PersistedTerminalEntry[] = []
+
+  // Agent mirrors are runtime-only; only user tabs are remembered.
+  for (const { auto, cwd, id, kind, reviveBuffer, title } of list) {
+    if (kind === 'user') {
+      terminals.push({ auto, cwd, id, ...(reviveBuffer ? { reviveBuffer } : {}), title })
+    }
+  }
+
+  if (terminals.length === 0) {
+    writeKey(TERMINALS_STORAGE_KEY, null)
+
+    return
+  }
+
+  const remembered = terminals.some(term => term.id === activeTerminalId)
+  const active = remembered ? activeTerminalId : (terminals[0]?.id ?? null)
+  writeKey(TERMINALS_STORAGE_KEY, JSON.stringify({ activeTerminalId: active, terminals }))
+}
+
+const restored = loadPersistedTerminals() // read once at module load; seeds both atoms below
+
+export const $terminals = atom<readonly TerminalEntry[]>(
+  restored.terminals.map(term => ({ ...term, kind: 'user' as const })) // only user tabs are ever persisted
+)
+export const $activeTerminalId = atom<string | null>(restored.activeTerminalId)
+
+$terminals.subscribe(list => persistTerminals(list, $activeTerminalId.get())) // re-persist on any list change
+$activeTerminalId.subscribe(active => persistTerminals($terminals.get(), active)) // …and on any selection change
+
+export const $activeTerminal = computed(
+  [$terminals, $activeTerminalId],
+  (list, id) => list.find(term => term.id === id) ?? null // null when the selected id matches no tab
+)
+
+const newId = () => // crypto.randomUUID() when available, else a `term-<time>-<random>` base-36 fallback
+  globalThis.crypto?.randomUUID?.() ?? `term-${Date.now().toString(36)}-${Math.random().toString(36).slice(2)}`
+
+/** Open a new user tab at the end of the list and make it the active one. The
+ *  cwd defaults to the current session cwd, read once here. Returns the new id. */
+export function createTerminal(cwd: string = $currentCwd.get()): string {
+  const entry: TerminalEntry = { id: newId(), title: 'Terminal', auto: true, cwd, kind: 'user' }
+  $terminals.set($terminals.get().concat(entry))
+  $activeTerminalId.set(entry.id)
+
+  return entry.id
+}
+
+// Procs we've already surfaced a tab for — so closing an agent tab doesn't
+// resurrect it on the next poll. Entries are only ever added in this module.
+const surfacedProcs = new Set<string>()
+
+const findByProc = (procId: string) => $terminals.get().find(term => term.procId === procId) // first open tab for the proc
+
+/** Surface an agent background process as a read-only tab, at most once. Returns
+ *  the tab id, or null when it was surfaced before and the user has closed it. */
+export function ensureAgentTerminal(procId: string, title: string): string | null {
+  const openId = findByProc(procId)?.id
+
+  if (openId !== undefined) {
+    return openId
+  }
+
+  if (surfacedProcs.has(procId)) {
+    return null
+  }
+
+  const entry: TerminalEntry = { id: newId(), title: title || 'agent', auto: false, cwd: '', kind: 'agent', procId }
+  surfacedProcs.add(procId)
+  $terminals.set([...$terminals.get(), entry])
+
+  return entry.id
+}
+
+/** Focus the tab for an agent process (the status-stack link), recreating it if closed; opens the pane. */
+export function openAgentTerminal(procId: string, title: string): void {
+  surfacedProcs.add(procId)
+  seedAgentTerminalCommand(procId, title)
+  const reuseId = findByProc(procId)?.id
+  const tabId = reuseId || newId()
+
+  if (!reuseId) {
+    const mirror: TerminalEntry = { id: tabId, title: title || 'agent', auto: false, cwd: '', kind: 'agent', procId }
+    $terminals.set([...$terminals.get(), mirror])
+  }
+
+  $activeTerminalId.set(tabId)
+  setTerminalTakeover(true)
+}
+
+/** Keep the pane from opening empty: any open tab (an agent tab included) counts, so none is added. */
+export function ensureTerminal(): void {
+  if ($terminals.get().length > 0) {
+    return
+  }
+
+  createTerminal()
+}
+
+export function selectTerminal(id: string): void {
+  if ($terminals.get().some(term => term.id === id)) {
+    $activeTerminalId.set(id) // only reached for ids of open tabs; unknown ids are silently ignored
+  }
+}
+
+/** Step the selection one tab forward (+1) or back (-1), wrapping at both ends.
+ *  With fewer than two tabs there is nowhere to go, so nothing changes. */
+export function cycleTerminal(direction: 1 | -1): void {
+  const tabs = $terminals.get()
+  const count = tabs.length
+
+  if (count >= 2) {
+    const activeId = $activeTerminalId.get()
+    const found = tabs.findIndex(entry => entry.id === activeId)
+    // A missing or stale selection counts from the first tab.
+    const from = found < 0 ? 0 : found
+
+    $activeTerminalId.set(tabs[(from + direction + count) % count].id)
+  }
+}
+
+/** Remove tab `id`; unknown ids are a no-op. An active tab hands focus to the tab
+ *  that followed it, else the one before; removing the last tab closes the pane. */
+export function closeTerminal(id: string): void {
+  const before = $terminals.get()
+  const slot = before.findIndex(entry => entry.id === id)
+
+  if (slot === -1) {
+    return
+  }
+
+  const remaining = before.filter(entry => entry.id !== id)
+  $terminals.set(remaining)
+
+  if ($activeTerminalId.get() === id) {
+    $activeTerminalId.set((remaining[slot] ?? remaining[slot - 1])?.id ?? null)
+  }
+
+  if (remaining.length === 0) {
+    setTerminalTakeover(false)
+  }
+}
+
+/** Drop the read-only tab that mirrors background process `procId`; reached from
+ *  the agent through the desktop-gated `close_terminal` tool → `terminal.close`.
+ *  Only the view goes away: the process is not killed here, `surfacedProcs` stops
+ *  the tab from auto-resurfacing, and the status-stack row can reopen it on demand.
+ *  Returns true when a tab was closed, false when there was none to close. */
+export function closeAgentTerminalByProc(procId: string): boolean {
+  for (const entry of $terminals.get()) {
+    if (entry.kind === 'agent' && entry.procId === procId) {
+      closeTerminal(entry.id)
+
+      return true
+    }
+  }
+
+  return false
+}
+
+/** Close whichever tab is currently active; a no-op when nothing is selected. */
+export function closeActiveTerminal(): void {
+  const activeId = $activeTerminalId.get()
+  if (activeId) {
+    closeTerminal(activeId)
+  }
+}
+
+/** Close every tab, clear the selection, and release the pane. Does nothing when
+ *  no tab is open, so an already-empty store triggers no updates. */
+export function closeAllTerminals(): void {
+  if ($terminals.get().length > 0) {
+    $terminals.set([])
+    $activeTerminalId.set(null)
+    setTerminalTakeover(false)
+  }
+}
+
+/** Close every tab except `id` and focus it; an unknown id changes nothing. */
+export function closeOtherTerminals(id: string): void {
+  const survivor = $terminals.get().find(entry => entry.id === id)
+  if (survivor !== undefined) {
+    $terminals.set([survivor])
+    $activeTerminalId.set(survivor.id)
+  }
+}
+
+/** Record user tab `id`'s latest scrollback, tail-trimmed to the storage cap, for
+ *  replay on next launch. Agent tabs, unknown ids, and unchanged buffers are
+ *  no-ops, so a repeated snapshot causes no store update or storage rewrite. */
+export function updateTerminalReviveBuffer(id: string, reviveBuffer: string): void {
+  const capped = reviveBuffer.slice(-MAX_REVIVE_BUFFER_CHARS)
+  const isTarget = (term: TerminalEntry) => term.id === id && term.kind === 'user'
+
+  if ($terminals.get().some(term => isTarget(term) && term.reviveBuffer !== capped)) {
+    $terminals.set($terminals.get().map(term => (isTarget(term) ? { ...term, reviveBuffer: capped } : term)))
+  }
+}
+
+/** Give tab `id` a user-chosen title and stop auto-labeling it (`auto: false`).
+ *  A blank title keeps the current label but still turns auto-labeling off. */
+export function renameTerminal(id: string, title: string): void {
+  const label = title.trim()
+  const pin = (term: TerminalEntry) => ({ ...term, title: label || term.title, auto: false })
+  $terminals.set($terminals.get().map(term => (term.id === id ? pin(term) : term)))
+}
+
+/** Adopt the shell name reported by a live terminal as its tab label — but only
+ *  while the tab is still auto-labeled, so a user-chosen title is never replaced.
+ *  Blank names are ignored. */
+export function reportTerminalShell(id: string, shell: string): void {
+  const label = shell.trim()
+
+  if (label !== '') {
+    const relabel = (term: TerminalEntry) => (term.id === id && term.auto ? { ...term, title: label } : term)
+    $terminals.set($terminals.get().map(relabel))
+  }
+}
--- a/apps/desktop/src/app/right-sidebar/terminal/use-agent-terminal.ts
+++ b/apps/desktop/src/app/right-sidebar/terminal/use-agent-terminal.ts
@@ -0,0 +1,142 @@
+import { FitAddon } from '@xterm/addon-fit'
+import { Unicode11Addon } from '@xterm/addon-unicode11'
+import { WebLinksAddon } from '@xterm/addon-web-links'
+import { WebglAddon } from '@xterm/addon-webgl'
+import { Terminal } from '@xterm/xterm'
+import { useEffect, useRef } from 'react'
+
+import { useTheme } from '@/themes/context'
+
+import { registerAgentTerminalWriter } from './agent-terminal-stream'
+import { makeTerminalReader, registerTerminalReader } from './buffer'
+import { resolveSurfaceColor, terminalTheme } from './selection'
+
+// Read-only xterm for an agent background process (no PTY; stdin disabled), fed
+// live by the writer registered under `procId` and exposed to readers under `id`.
+// Returns `hostRef` for the container div; shares the user terminal's look.
+export function useAgentTerminal({ active, id, procId }: { active: boolean; id: string; procId: string }) {
+  const { renderedMode, theme, themeName } = useTheme()
+  const hostRef = useRef<HTMLDivElement | null>(null)
+  const termRef = useRef<Terminal | null>(null)
+  const webglRef = useRef<WebglAddon | null>(null)
+  const fitRef = useRef<(() => void) | null>(null)
+
+  const surfaceTheme = () => {
+    const ansi = renderedMode === 'dark' ? (theme.darkTerminal ?? theme.terminal) : theme.terminal
+    const surface = resolveSurfaceColor('#ffffff')
+
+    return { ...terminalTheme(renderedMode, ansi), background: surface, cursorAccent: surface }
+  }
+
+  useEffect(() => {
+    const host = hostRef.current
+
+    if (!host) {
+      return
+    }
+
+    const term = new Terminal({
+      allowProposedApi: true,
+      allowTransparency: false,
+      convertEol: true,
+      cursorBlink: false,
+      disableStdin: true,
+      fontFamily: "'JetBrains Mono', 'Cascadia Code', 'SF Mono', Menlo, Consolas, monospace",
+      fontSize: 11,
+      fontWeight: 'normal',
+      fontWeightBold: 'bold',
+      letterSpacing: 0,
+      lineHeight: 1.12,
+      minimumContrastRatio: 4.5,
+      scrollback: 1000,
+      theme: surfaceTheme()
+    })
+
+    const fit = new FitAddon()
+    term.loadAddon(fit)
+    term.loadAddon(new Unicode11Addon())
+    term.loadAddon(new WebLinksAddon())
+    term.unicode.activeVersion = '11'
+    term.open(host)
+    termRef.current = term
+
+    fitRef.current = () => {
+      if (host.clientWidth > 0 && host.clientHeight > 0) {
+        try {
+          fit.fit()
+        } catch {
+          // Mid-transition layout — the next observer tick refits.
+        }
+      }
+    }
+
+    try {
+      const webgl = new WebglAddon()
+      webgl.onContextLoss(() => {
+        webgl.dispose()
+        webglRef.current = null
+      })
+      term.loadAddon(webgl)
+      webglRef.current = webgl
+    } catch {
+      // No WebGL — xterm falls back to the DOM renderer.
+    }
+
+    fitRef.current()
+    const observer = new ResizeObserver(() => fitRef.current?.())
+    observer.observe(host)
+
+    // Stream live output straight into the terminal (replays backlog on attach).
+    const unregister = registerAgentTerminalWriter(procId, chunk => term.write(chunk))
+    const unregisterReader = registerTerminalReader(id, makeTerminalReader(term))
+
+    return () => {
+      unregister()
+      unregisterReader()
+      observer.disconnect()
+      term.dispose()
+      termRef.current = null
+      webglRef.current = null
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []) // mount-only: assumes `id` / `procId` never change for a mounted tab — TODO confirm
+
+  useEffect(() => {
+    const term = termRef.current
+
+    if (!term) {
+      return
+    }
+
+    const raf = requestAnimationFrame(() => {
+      term.options.theme = surfaceTheme()
+      webglRef.current?.clearTextureAtlas()
+    })
+
+    return () => cancelAnimationFrame(raf)
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [renderedMode, themeName]) // re-theme when the painted mode or the skin changes
+
+  // A visibility:hidden xterm doesn't paint — refit + redraw on re-activation.
+  useEffect(() => {
+    if (!active) {
+      return
+    }
+
+    const frame = requestAnimationFrame(() => {
+      const term = termRef.current
+
+      fitRef.current?.()
+      webglRef.current?.clearTextureAtlas()
+      term?.refresh(0, term.rows - 1)
+      // Take focus on activation (parity with the user terminal) so the active
+      // agent tab holds focus and ⌘W's isFocusWithin('[data-terminal]') routes
+      // the close to this tab rather than to a preview.
+      term?.focus()
+    })
+
+    return () => cancelAnimationFrame(frame)
+  }, [active])
+
+  return { hostRef }
+}
--- a/apps/desktop/src/app/right-sidebar/terminal/use-terminal-session.ts
+++ b/apps/desktop/src/app/right-sidebar/terminal/use-terminal-session.ts
@@ -1,4 +1,5 @@
 import { FitAddon } from '@xterm/addon-fit'
+import { SerializeAddon } from '@xterm/addon-serialize'
 import { Unicode11Addon } from '@xterm/addon-unicode11'
 import { WebLinksAddon } from '@xterm/addon-web-links'
 import { WebglAddon } from '@xterm/addon-webgl'
@@ -12,7 +13,7 @@ import { useTheme } from '@/themes/context'

 import { $terminalInjection } from '../store'

-import { makeTerminalReader, setActiveTerminalReader } from './buffer'
+import { makeTerminalReader, registerTerminalReader } from './buffer'
 import {
  isAddSelectionShortcut,
  resolveSurfaceColor,
@@ -20,6 +21,34 @@ import {
  terminalSelectionLabel,
  terminalTheme
 } from './selection'
+import { closeTerminal, updateTerminalReviveBuffer } from './terminals'
+
+// How many scrollback lines to serialize for relaunch restore: twice VS Code's
+// terminal.integrated.persistentSessionScrollback default (100). The store caps the
+// resulting string so a long line-wrapped buffer can't blow the storage budget.
+const PERSISTENT_SESSION_SCROLLBACK = 200
+
+// Leading-edge throttle window for capturing history. The first output after an
+// idle gap persists almost immediately (so `cmd; quit` is on disk before the
+// renderer tears down), then at most once per window while output streams.
+const SNAPSHOT_THROTTLE_MS = 750
+
+// True once the page/app is tearing down (Cmd+Q, Alt+F4, window close, reload).
+// App quit kills the PTYs from the main process, which fires onExit in the
+// renderer — but React skips effect cleanups on teardown, so the per-instance
+// `disposed` flag never flips. Without this guard those teardown exits would call
+// closeTerminal() and wipe the persisted terminal list right before relaunch
+// reads it. A real `exit`/Ctrl-D still closes the tab (flag stays false).
+let appTearingDown = false
+
+if (typeof window !== 'undefined') {
+  const markTearingDown = () => {
+    appTearingDown = true
+  }
+
+  window.addEventListener('pagehide', markTearingDown)
+  window.addEventListener('beforeunload', markTearingDown)
+}

 type TerminalStatus = 'closed' | 'open' | 'starting'

@@ -65,6 +94,14 @@ function readEscapeSequence(data: string, index: number) {
    }
  }

+  // Character-set and other short ESC forms are three bytes (e.g. ESC ( B).
+  // Treating only ESC+( as a sequence leaves the final selector ("B") as
+  // printable text, which disarms the initial prompt-gap stripper before it can
+  // eat the shell's leading newline.
+  if (['(', ')', '*', '+', '-', '.', '/'].includes(kind) && index + 2 < data.length) {
+    return data.slice(index, index + 3)
+  }
+
  return data.slice(index, Math.min(index + 2, data.length))
 }

@@ -131,9 +168,49 @@ function stripInitialPromptGap(data: string) {
  return prefix
 }

+// Drop the shell's trailing idle prompt from a serialized snapshot before it is
+// persisted; otherwise the next launch replays the old prompt right above the
+// fresh shell's prompt ("double bar"). Heuristic: after trimming trailing blank
+// rows, a tail of at most three rows following the last blank row (starship's
+// add_newline gap) is taken to be the prompt and removed together with that
+// blank row. Longer tails, and snapshots with no blank row, are left in place.
+function cleanReviveSnapshot(serialized: string): string {
+  // A row is blank when nothing remains after removing escapes, whitespace, and `%`.
+  const isBlank = (row: string) => stripEscapeSequences(row).replace(/[\s%]/g, '') === ''
+  const rows = serialized.split(/\r?\n/)
+  let end = rows.length
+
+  // Trailing blank rows never survive.
+  while (end > 0 && isBlank(rows[end - 1])) {
+    end -= 1
+  }
+
+  // Walk back to the last blank row still inside the kept range (-1 if none).
+  let gap = end - 1
+
+  while (gap >= 0 && !isBlank(rows[gap])) {
+    gap -= 1
+  }
+
+  // end - 1 - gap rows follow the gap; a short tail is the prompt, so cut at the gap.
+  if (gap >= 0 && end - 1 - gap <= 3) {
+    end = gap
+  }
+
+  return rows.slice(0, end).join('\r\n')
+}
+
 interface UseTerminalSessionOptions {
+  /** Renderer-side terminal id (the tab handle): keys the agent reader, revive buffer, and close-on-exit. */
+  id: string
   cwd: string
+  /** True for the visible tab: drives the refit/redraw/focus on activation and gates injections. */
+  active: boolean
   onAddSelectionToChat: (text: string, label?: string) => void
+  /** Serialized scrollback from the previous session, replayed once on mount. */
+  reviveBuffer?: string
+  /** Reports the resolved shell name once the PTY is live (for the tab label). */
+  onShell?: (shell: string) => void
 }

 // Bind the palette to the live skin surface so the terminal blends with the app
@@ -232,7 +309,14 @@ function quotePathForShell(path: string, shellName: string): string {
  return `'${path.replace(/'/g, "'\\''")}'`
 }

-export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSessionOptions) {
+export function useTerminalSession({
+  id,
+  cwd,
+  active,
+  onAddSelectionToChat,
+  reviveBuffer,
+  onShell
+}: UseTerminalSessionOptions) {
  // Key off renderedMode (the painted surface type), not resolvedMode (the
  // clicked switch) — a skin can keep a light surface in "dark" mode, and we
  // must match the surface or the ANSI palette inverts against it. themeName
@@ -249,10 +333,17 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
  const termRef = useRef<Terminal | null>(null)
  const webglRef = useRef<WebglAddon | null>(null)
  const sessionIdRef = useRef<string | null>(null)
+  // Snapshot the revive buffer once: live snapshots feed updateTerminalReviveBuffer
+  // and would otherwise re-arm replay on every store-driven re-render.
+  const initialReviveBufferRef = useRef(reviveBuffer)
  const shellNameRef = useRef('shell')
  const selectionLabelRef = useRef('')
  const selectionRef = useRef('')
  const onAddSelectionToChatRef = useRef(onAddSelectionToChat)
+  const onShellRef = useRef(onShell)
+  // Re-fit on activation: a tab hidden via display:none has a 0×0 host, so its
+  // last fit is stale by the time it's shown again.
+  const fitRef = useRef<(() => void) | null>(null)
  const [status, setStatus] = useState<TerminalStatus>('starting')
  const [selection, setSelection] = useState('')
  const [selectionStyle, setSelectionStyle] = useState<CSSProperties | null>(null)
@@ -260,7 +351,8 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes

  useEffect(() => {
    onAddSelectionToChatRef.current = onAddSelectionToChat
-  }, [onAddSelectionToChat])
+    onShellRef.current = onShell
+  }, [onAddSelectionToChat, onShell])

  // Live selection at call time. A redraw-heavy TUI (spinners, clocks) outruns
  // onSelectionChange, so trust xterm directly — fall back to the native
@@ -361,16 +453,71 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
    })

    const fit = new FitAddon()
+    const serialize = new SerializeAddon()

    termRef.current = term
    term.loadAddon(fit)
+    term.loadAddon(serialize)
    term.loadAddon(new Unicode11Addon())
    term.loadAddon(new WebLinksAddon())
    term.unicode.activeVersion = '11'

-    // Let the GUI chat agent read this pane via the `read_terminal` tool: the
-    // gateway's terminal.read.request handler serializes the buffer through this.
-    setActiveTerminalReader(makeTerminalReader(term))
+    // Replay last session's scrollback before the fresh shell boots. The process
+    // is NOT revived — a new shell starts one line below the restored history.
+    // Stripping the boot gap still applies to the live shell output that follows,
+    // so the fresh prompt lands flush under the restored block.
+    const initialReviveBuffer = initialReviveBufferRef.current
+
+    if (initialReviveBuffer) {
+      term.write(initialReviveBuffer)
+      term.write('\r\n')
+    }
+
+    // Capture the buffer on a leading-edge throttle and persist synchronously via
+    // the store. No unload hook: by the time the user quits, a recent snapshot is
+    // already on disk (the prior beforeunload-based attempt lost the last output).
+    let snapshotTimer = 0
+    let lastSnapshotAt = 0
+
+    const persistSnapshot = () => {
+      if (disposed) {
+        return
+      }
+
+      lastSnapshotAt = Date.now()
+
+      try {
+        const snapshot = serialize.serialize({ scrollback: PERSISTENT_SESSION_SCROLLBACK })
+        updateTerminalReviveBuffer(id, cleanReviveSnapshot(snapshot))
+      } catch {
+        // Best-effort restore: never let serialization break a live terminal.
+      }
+    }
+
+    const scheduleSnapshot = () => {
+      if (snapshotTimer) {
+        return
+      }
+
+      const elapsed = Date.now() - lastSnapshotAt
+
+      if (elapsed >= SNAPSHOT_THROTTLE_MS) {
+        persistSnapshot()
+
+        return
+      }
+
+      snapshotTimer = window.setTimeout(() => {
+        snapshotTimer = 0
+        persistSnapshot()
+      }, SNAPSHOT_THROTTLE_MS - elapsed)
+    }
+
+    cleanup.push(() => {
+      if (snapshotTimer) {
+        window.clearTimeout(snapshotTimer)
+      }
+    })

    const onDragOver = (e: DragEvent) => {
      if (!e.dataTransfer || !transferHasDropCandidates(e.dataTransfer)) {
@@ -411,18 +558,9 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
      host.removeEventListener('drop', onDrop)
    })

-    // A fresh prompt should sit at the top. Every resize SIGWINCHes the shell,
-    // which reprints its prompt and can leave stale blank rows above it. While
-    // the session is pristine (nothing run yet) we ask the shell to clear +
-    // redraw via Ctrl-L (\f) after the resize settles. Ctrl-L preserves
-    // multi-line prompts (term.clear() would drop all but the cursor row) and we
-    // stop the moment real output exists, so command scrollback is never wiped.
-    let promptPristine = true
-    let gapCleanupTimer = 0
-
-    // While armed, strip leading blank rows so the prompt lands at the very top
-    // (no starship `add_newline` gap). Re-armed before each Ctrl-L redraw so the
-    // resize cleanup doesn't reintroduce the blank line.
+    // While armed, strip leading blank rows so the first prompt lands at the
+    // very top (no starship `add_newline` gap). Do this only on renderer output:
+    // never inject Ctrl-L or other cleanup keystrokes into the user's shell.
    let stripLeading = true

    const armedWrite = (data: string) => {
@@ -451,35 +589,6 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
      term.write(next)
    }

-    const scheduleGapCleanup = () => {
-      if (!promptPristine) {
-        return
-      }
-
-      if (gapCleanupTimer) {
-        window.clearTimeout(gapCleanupTimer)
-      }
-
-      gapCleanupTimer = window.setTimeout(() => {
-        gapCleanupTimer = 0
-        const id = sessionIdRef.current
-
-        if (disposed || !id || !promptPristine) {
-          return
-        }
-
-        stripLeading = true
-        void terminalApi.write(id, '\f')
-        term.clearSelection()
-      }, 120)
-    }
-
-    cleanup.push(() => {
-      if (gapCleanupTimer) {
-        window.clearTimeout(gapCleanupTimer)
-      }
-    })
-
    const fitAndResize = () => {
      if (disposed || !host.isConnected || host.clientWidth <= 0 || host.clientHeight <= 0) {
        return
@@ -496,10 +605,11 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
      if (id && (lastSentSize?.cols !== term.cols || lastSentSize?.rows !== term.rows)) {
        lastSentSize = { cols: term.cols, rows: term.rows }
        void terminalApi.resize(id, { cols: term.cols, rows: term.rows })
-        scheduleGapCleanup()
      }
    }

+    fitRef.current = fitAndResize
+
    // Coalesce ResizeObserver bursts through rAF — running fit.fit()
    // synchronously while sibling panes are mid-transition (e.g. file browser
    // collapsing to 0px) crashes the WebGL renderer mid texture-atlas rebuild.
@@ -533,12 +643,6 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
      const id = sessionIdRef.current

      if (id) {
-        // Once the user submits a line, real output may follow — stop the
-        // pristine-prompt gap cleanup so we never clear command scrollback.
-        if (promptPristine && data.includes('\r')) {
-          promptPristine = false
-        }
-
        void terminalApi.write(id, data)
      }
    })
@@ -569,6 +673,7 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
          lastSentSize = { cols: term.cols, rows: term.rows }
          shellNameRef.current = session.shell || 'shell'
          setShellName(session.shell || 'shell')
+          onShellRef.current?.(session.shell || 'shell')

          const initial = term.hasSelection() ? term.getSelection() : ''
          selectionRef.current = initial
@@ -577,10 +682,21 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
          setStatus('open')

          cleanup.push(
-            terminalApi.onData(session.id, armedWrite),
-            terminalApi.onExit(session.id, ({ code, signal }) => {
-              setStatus('closed')
-              term.write(`\r\n[terminal exited${signal ? `: ${signal}` : code !== null ? `: ${code}` : ''}]\r\n`)
+            terminalApi.onData(session.id, data => {
+              armedWrite(data)
+              scheduleSnapshot()
+            }),
+            terminalApi.onExit(session.id, () => {
+              // Shell exited (`exit` / Ctrl-D / crash) — drop the tab like a real
+              // terminal. closeTerminal hides the pane when it's the last one.
+              // Skip if we're tearing down (cleanup disposes the PTY) OR the app
+              // is quitting/reloading: on quit the main process kills every PTY,
+              // firing this exit, but React skips the cleanup so `disposed` stays
+              // false — running closeTerminal here would wipe the persisted tabs
+              // right before relaunch restores them.
+              if (!disposed && !appTearingDown) {
+                closeTerminal(id)
+              }
            })
          )

@@ -638,7 +754,7 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
    return () => {
      disposed = true
      cleanup.forEach(run => run())
-      setActiveTerminalReader(null)
+      fitRef.current = null

      const id = sessionIdRef.current
      sessionIdRef.current = null
@@ -654,7 +770,10 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
      selectionRef.current = ''
      selectionLabelRef.current = ''
    }
-  }, [addSelectionToChat, cwd])
+    // `id` is stable for the instance's life (keyed by tab id), so listing it
+    // doesn't re-create the shell — it just satisfies the deps check for the
+    // closeTerminal(id) call in onExit.
+  }, [addSelectionToChat, cwd, id])

  useEffect(() => {
    const term = termRef.current
@@ -677,27 +796,61 @@ export function useTerminalSession({ cwd, onAddSelectionToChat }: UseTerminalSes
    return () => cancelAnimationFrame(raf)
  }, [activeTheme, themeName])

-  // Flush a queued command (e.g. a provider-disconnect) into the live session.
-  // Only active while open; the subscribe fires immediately, so a command set
-  // before this pane mounted runs as soon as the session is ready. Clearing the
-  // atom after writing stops a later remount from replaying a stale command.
+  // Expose this terminal's buffer to the agent's `read_terminal` tool, keyed by
+  // id. The tab selection (setActiveTerminalId) decides which one it reads, so
+  // every live terminal stays registered regardless of visibility.
  useEffect(() => {
    if (status !== 'open') {
      return
    }

-    return $terminalInjection.subscribe(command => {
-      const id = sessionIdRef.current
+    const term = termRef.current

-      if (!command || !id) {
+    return term ? registerTerminalReader(id, makeTerminalReader(term)) : undefined
+  }, [id, status])
+
+  // On (re)activation: a WebGL terminal doesn't paint while visibility:hidden, so
+  // it reveals a stale/garbled frame. Refit, rebuild the glyph atlas, and force a
+  // full redraw against the live buffer, then focus.
+  useEffect(() => {
+    if (!active || status !== 'open') {
+      return
+    }
+
+    const frame = requestAnimationFrame(() => {
+      const term = termRef.current
+
+      fitRef.current?.()
+      webglRef.current?.clearTextureAtlas()
+      term?.refresh(0, term.rows - 1)
+      term?.focus()
+    })
+
+    return () => cancelAnimationFrame(frame)
+  }, [active, status])
+
+  // Flush a queued command (e.g. a provider-disconnect) into the live session.
+  // Only the active tab runs it (so a broadcast doesn't fan out to every shell);
+  // the subscribe fires immediately, so a command set before this pane mounted
+  // runs as soon as the session is ready. Cleared after writing so a later
+  // remount can't replay a stale command.
+  useEffect(() => {
+    if (!active || status !== 'open') {
+      return
+    }
+
+    return $terminalInjection.subscribe(command => {
+      const sessionId = sessionIdRef.current
+
+      if (!command || !sessionId) {
        return
      }

-      void window.hermesDesktop?.terminal?.write(id, `${command}\r`)
+      void window.hermesDesktop?.terminal?.write(sessionId, `${command}\r`)
      $terminalInjection.set(null)
      termRef.current?.focus()
    })
-  }, [status])
+  }, [active, status])

  return {
    addSelectionToChat,
--- a/apps/desktop/src/app/right-sidebar/terminal/workspace.tsx
+++ b/apps/desktop/src/app/right-sidebar/terminal/workspace.tsx
@@ -0,0 +1,65 @@
+import { useStore } from '@nanostores/react'
+import { useEffect } from 'react'
+
+import { $backgroundStatusBySession } from '@/store/composer-status'
+
+import { seedAgentTerminalCommand, syncAgentTerminalSnapshot } from './agent-terminal-stream'
+import { setActiveTerminalId } from './buffer'
+import { AgentTerminalInstance, TerminalInstance } from './instance'
+import { $activeTerminalId, $terminals, ensureAgentTerminal } from './terminals'
+
+interface TerminalWorkspaceProps {
+  onAddSelectionToChat: (text: string, label?: string) => void
+}
+
+/** The persistent-overlay layer: the stack of live xterm instances (only these
+ *  must stay in the fixed overlay, for the WebGL host). Mount/visibility is owned
+ *  by PersistentTerminal (latched so shells survive hiding); the tab rail and
+ *  new-terminal control live in the pane DOM — see TerminalPaneChrome. */
+export function TerminalWorkspace({ onAddSelectionToChat }: TerminalWorkspaceProps) {
+  const terminals = useStore($terminals)
+  const activeId = useStore($activeTerminalId)
+  const background = useStore($backgroundStatusBySession)
+
+  // Mirror the tab selection into the agent reader (read_terminal reads it); reset it on unmount.
+  useEffect(() => {
+    const unsubscribe = $activeTerminalId.subscribe(setActiveTerminalId)
+
+    return () => {
+      unsubscribe()
+      setActiveTerminalId(null) // unmounting: clear the reader's target
+    }
+  }, [])
+
+  // Surface each session's agent background processes as read-only tabs (once each),
+  // re-run on every background-status change. Live chunks stream via agent.terminal.output;
+  // the snapshot also seeds/falls back so a tab never stays blank if the stream races startup.
+  useEffect(() => {
+    for (const list of Object.values(background)) {
+      for (const item of list) {
+        ensureAgentTerminal(item.id, item.title)
+        seedAgentTerminalCommand(item.id, item.title)
+        syncAgentTerminalSnapshot(item.id, item.output ?? '')
+      }
+    }
+  }, [background])
+
+  return (
+    <>
+      {terminals.map(term =>
+        term.kind === 'agent' ? (
+          <AgentTerminalInstance active={term.id === activeId} id={term.id} key={term.id} procId={term.procId!} />
+        ) : (
+          <TerminalInstance
+            active={term.id === activeId}
+            cwd={term.cwd}
+            id={term.id}
+            key={term.id}
+            onAddSelectionToChat={onAddSelectionToChat}
+            reviveBuffer={term.reviveBuffer}
+          />
+        )
+      )}
+    </>
+  )
+}
--- a/apps/desktop/src/app/session/hooks/use-message-stream.ts
+++ b/apps/desktop/src/app/session/hooks/use-message-stream.ts
@@ -1,6 +1,8 @@
 import type { QueryClient } from '@tanstack/react-query'
 import { type MutableRefObject, useCallback, useEffect, useRef } from 'react'

+import { writeAgentTerminalChunk } from '@/app/right-sidebar/terminal/agent-terminal-stream'
+import { closeAgentTerminalByProc } from '@/app/right-sidebar/terminal/terminals'
 import { readActiveTerminal } from '@/app/right-sidebar/terminal/buffer'
 import { translateNow } from '@/i18n'
 import {
@@ -900,6 +902,29 @@ export function useMessageStream({
          appendReasoningDelta(sessionId, coerceThinkingText(payload?.text), true)
        }

+        if (isActiveEvent) {
+          setPetActivity({ reasoning: true })
+        }
+      } else if (event.type === 'moa.reference') {
+        // MoA reference-model output — surface as a labelled thinking chunk
+        // (tagged with the source model) before the aggregator's response, so
+        // the mixture-of-agents process is visible. Reuses the reasoning
+        // disclosure rather than introducing a parallel surface.
+        if (sessionId) {
+          const label = coerceGatewayText(payload?.label) || 'reference'
+          const idx = typeof payload?.index === 'number' ? payload.index : undefined
+          const cnt = typeof payload?.count === 'number' ? payload.count : undefined
+          const header = idx && cnt ? `◇ Reference ${idx}/${cnt} — ${label}` : `◇ Reference — ${label}`
+          const body = coerceThinkingText(payload?.text)
+          appendReasoningDelta(sessionId, `${header}\n${body}\n\n`, true)
+        }
+
+        if (isActiveEvent) {
+          setPetActivity({ reasoning: true })
+        }
+      } else if (event.type === 'moa.aggregating') {
+        // Status transition only; the aggregator's reply arrives via the normal
+        // message stream. No reasoning/transcript mutation here.
        if (isActiveEvent) {
          setPetActivity({ reasoning: true })
        }
@@ -1142,6 +1167,13 @@ export function useMessageStream({
            text: result ? JSON.stringify(result) : ''
          })
        }
+      } else if (event.type === 'agent.terminal.output') {
+        // Live chunk from a background process → its read-only agent terminal tab.
+        writeAgentTerminalChunk(payload?.process_id ?? '', payload?.chunk ?? '')
+      } else if (event.type === 'terminal.close') {
+        // Agent closed its own read-only tab via the desktop-gated close_terminal tool.
+        // The process is untouched — this only drops the view.
+        closeAgentTerminalByProc(payload?.process_id ?? '')
      } else if (event.type === 'status.update') {
        if (sessionId && payload?.kind === 'compacting') {
          setSessionCompacting(sessionId, true)
--- a/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx
+++ b/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx
@@ -3,9 +3,19 @@ import type { MutableRefObject } from 'react'
 import { useEffect } from 'react'
 import { afterEach, describe, expect, it, vi } from 'vitest'

-import { getSessionMessages } from '@/hermes'
+import { getSessionMessages, type SessionInfo } from '@/hermes'
+import { createClientSessionState } from '@/lib/chat-runtime'
 import { $activeGatewayProfile, $newChatProfile } from '@/store/profile'
-import { $currentCwd, $messages, $resumeFailedSessionId, setMessages, setResumeFailedSessionId } from '@/store/session'
+import {
+  $activeSessionId,
+  $currentCwd,
+  $messages,
+  $resumeFailedSessionId,
+  setActiveSessionId,
+  setMessages,
+  setResumeFailedSessionId,
+  setSessions
+} from '@/store/session'

 import type { ClientSessionState } from '../../types'

@@ -22,6 +32,25 @@ vi.mock('@/hermes', async importOriginal => ({

 const RUNTIME_SESSION_ID = 'rt-new-001'

+function storedSession(overrides: Partial<SessionInfo> = {}): SessionInfo {
+  return {
+    ended_at: null,
+    id: 'stored-1',
+    input_tokens: 0,
+    is_active: false,
+    last_active: 1,
+    message_count: 0, // tests pass a positive count to model a session with stored history
+    model: null,
+    output_tokens: 0,
+    preview: null,
+    source: 'desktop',
+    started_at: 1,
+    title: 'stored',
+    tool_call_count: 0,
+    ...overrides // spread last so caller-supplied fields replace the defaults above
+  }
+}
+
 function Harness({
  onReady,
  requestGateway
@@ -84,6 +113,7 @@ describe('createBackendSessionForSend profile routing', () => {
    cleanup()
    $newChatProfile.set(null)
    $activeGatewayProfile.set('default')
+    $currentCwd.set('')
    vi.restoreAllMocks()
  })

@@ -117,6 +147,14 @@ describe('createBackendSessionForSend profile routing', () => {

    expect(params).toMatchObject({ profile: 'default' })
  })
+
+  it('passes the current workspace cwd into session.create', async () => {
+    const params = await createWith(() => {
+      $currentCwd.set('/remote/worktree')
+    })
+
+    expect(params).toMatchObject({ cwd: '/remote/worktree' })
+  })
 })

 // ── Resume failure recovery (the "stuck loading session window" bug) ──────────
@@ -126,10 +164,14 @@ describe('createBackendSessionForSend profile routing', () => {
 // succeeds must NOT leave the flag armed.
 function ResumeHarness({
  onReady,
-  requestGateway
+  requestGateway,
+  runtimeIdByStoredSessionIdRef,
+  sessionStateByRuntimeIdRef
 }: {
  onReady: (resume: (storedSessionId: string, replaceRoute?: boolean) => Promise<unknown>) => void
  requestGateway: <T>(method: string, params?: Record<string, unknown>) => Promise<T>
+  runtimeIdByStoredSessionIdRef?: MutableRefObject<Map<string, string>>
+  sessionStateByRuntimeIdRef?: MutableRefObject<Map<string, ClientSessionState>>
 }) {
  const ref = <T,>(value: T): MutableRefObject<T> => ({ current: value })

@@ -142,10 +184,10 @@ function ResumeHarness({
    getRouteToken: () => 'token',
    navigate: vi.fn() as never,
    requestGateway,
-    runtimeIdByStoredSessionIdRef: ref(new Map<string, string>()),
+    runtimeIdByStoredSessionIdRef: runtimeIdByStoredSessionIdRef ?? ref(new Map<string, string>()),
    selectedStoredSessionId: null,
    selectedStoredSessionIdRef: ref<string | null>(null),
-    sessionStateByRuntimeIdRef: ref(new Map<string, ClientSessionState>()),
+    sessionStateByRuntimeIdRef: sessionStateByRuntimeIdRef ?? ref(new Map<string, ClientSessionState>()),
    syncSessionStateToView: vi.fn(),
    updateSessionState: (_sessionId, updater) => updater({} as ClientSessionState)
  })
@@ -160,16 +202,22 @@ function ResumeHarness({
 describe('resumeSession failure recovery', () => {
  afterEach(() => {
    cleanup()
+    setActiveSessionId(null)
    setResumeFailedSessionId(null)
    setMessages([])
+    setSessions([])
    vi.restoreAllMocks()
  })

  async function runResume(
-    requestGateway: <T>(method: string, params?: Record<string, unknown>) => Promise<T>
+    requestGateway: <T>(method: string, params?: Record<string, unknown>) => Promise<T>,
+    options: {
+      runtimeIdByStoredSessionIdRef?: MutableRefObject<Map<string, string>>
+      sessionStateByRuntimeIdRef?: MutableRefObject<Map<string, ClientSessionState>>
+    } = {}
  ): Promise<void> {
    let resume: ((storedSessionId: string, replaceRoute?: boolean) => Promise<unknown>) | null = null
-    render(<ResumeHarness onReady={r => (resume = r)} requestGateway={requestGateway} />)
+    render(<ResumeHarness onReady={r => (resume = r)} requestGateway={requestGateway} {...options} />)
    await waitFor(() => expect(resume).not.toBeNull())
    await resume!('stored-1', true)
  }
@@ -281,4 +329,187 @@ describe('resumeSession failure recovery', () => {
    expect(resumeParams).not.toHaveProperty('lazy')
    expect(resumeParams).not.toHaveProperty('eager_build')
  })
+
+  it('arms the failure latch when resume succeeds with an empty transcript for a non-empty stored session', async () => {
+    setSessions([storedSession({ message_count: 4 })])
+
+    const requestGateway = vi.fn(async (method: string, params?: Record<string, unknown>) => {
+      if (method === 'session.resume') {
+        return { session_id: 'runtime-1', resumed: params?.session_id, messages: [], info: {} } as never
+      }
+
+      return {} as never
+    })
+
+    vi.mocked(getSessionMessages).mockResolvedValue({ messages: [], session_id: 'stored-1' } as never)
+
+    await runResume(requestGateway)
+
+    expect($resumeFailedSessionId.get()).toBe('stored-1')
+    expect($activeSessionId.get()).toBeNull()
+    expect($messages.get()).toEqual([])
+  })
+
+  it('does not reuse an empty cached runtime view for a stored session with history', async () => {
+    const runtimeIdByStoredSessionIdRef = {
+      current: new Map([['stored-1', 'runtime-stale']])
+    } satisfies MutableRefObject<Map<string, string>>
+
+    const sessionStateByRuntimeIdRef = {
+      current: new Map([
+        [
+          'runtime-stale',
+          {
+            awaitingResponse: false,
+            branch: '',
+            busy: false,
+            cwd: '',
+            fast: false,
+            interrupted: false,
+            messages: [],
+            model: '',
+            needsInput: false,
+            pendingBranchGroup: null,
+            personality: '',
+            provider: '',
+            reasoningEffort: '',
+            sawAssistantPayload: false,
+            serviceTier: '',
+            storedSessionId: 'stored-1',
+            streamId: null,
+            turnStartedAt: null,
+            yolo: false
+          }
+        ]
+      ])
+    } satisfies MutableRefObject<Map<string, ClientSessionState>>
+
+    setSessions([storedSession({ message_count: 4 })])
+
+    const requestGateway = vi.fn(async (method: string, params?: Record<string, unknown>) => {
+      if (method === 'session.resume') {
+        return { session_id: 'runtime-1', resumed: params?.session_id, messages: [], info: {} } as never
+      }
+
+      return {} as never
+    })
+
+    vi.mocked(getSessionMessages).mockResolvedValue({
+      messages: [{ content: 'existing text', role: 'user', timestamp: 1 }],
+      session_id: 'stored-1'
+    } as never)
+
+    await runResume(requestGateway, {
+      runtimeIdByStoredSessionIdRef,
+      sessionStateByRuntimeIdRef
+    })
+
+    expect(requestGateway).not.toHaveBeenCalledWith('session.usage', { session_id: 'runtime-stale' })
+    expect(runtimeIdByStoredSessionIdRef.current.has('stored-1')).toBe(false)
+    expect(sessionStateByRuntimeIdRef.current.has('runtime-stale')).toBe(false)
+    expect($activeSessionId.get()).toBe('runtime-1')
+    expect($messages.get().length).toBe(1)
+  })
+})
+
+// ── Warm-cache mapping integrity (the "open chat A, chat B loads" bug) ─────────
+// resumeSession's warm fast-path maps storedSessionId -> runtimeId -> cached
+// state. A reaped/respawned pooled backend re-mints runtime ids, so a recycled
+// id can resolve to a live-but-DIFFERENT session's cache entry. The fast-path
+// must verify the cached state still BELONGS to the resumed session before it
+// paints, or it shows a totally different thread under the current route.
+const clientState = (storedSessionId: string | null): ClientSessionState => createClientSessionState(storedSessionId)
+
+describe('resumeSession warm-cache mapping integrity', () => {
+  afterEach(() => {
+    cleanup()
+    setActiveSessionId(null)
+    setResumeFailedSessionId(null)
+    setMessages([])
+    setSessions([])
+    vi.restoreAllMocks()
+  })
+
+  it('rejects a cross-wired runtime mapping and falls through to a full resume', async () => {
+    // A recycled runtime id ('rt-recycled') is mapped to 'stored-A', but its
+    // cached state actually belongs to a DIFFERENT session ('stored-B') — the
+    // exact "open chat A, chat B loads" corruption a reaped/respawned pooled
+    // backend can leave behind.
+    const runtimeIdByStoredSessionIdRef: MutableRefObject<Map<string, string>> = {
+      current: new Map([['stored-A', 'rt-recycled']])
+    }
+
+    const sessionStateByRuntimeIdRef: MutableRefObject<Map<string, ClientSessionState>> = {
+      current: new Map([['rt-recycled', clientState('stored-B')]])
+    }
+
+    const requestGateway = vi.fn(async (method: string, params?: Record<string, unknown>) => {
+      if (method === 'session.resume') {
+        return { session_id: 'rt-A-fresh', resumed: params?.session_id, messages: [], info: {} } as never
+      }
+
+      return {} as never
+    })
+
+    vi.mocked(getSessionMessages).mockResolvedValue({ messages: [] } as never)
+
+    let resume: ((storedSessionId: string, replaceRoute?: boolean) => Promise<unknown>) | null = null
+    render(
+      <ResumeHarness
+        onReady={r => (resume = r)}
+        requestGateway={requestGateway}
+        runtimeIdByStoredSessionIdRef={runtimeIdByStoredSessionIdRef}
+        sessionStateByRuntimeIdRef={sessionStateByRuntimeIdRef}
+      />
+    )
+    await waitFor(() => expect(resume).not.toBeNull())
+    await resume!('stored-A', true)
+
+    // The fast-path did NOT short-circuit on the cross-wired cache — the full
+    // resume RPC ran, for the session that was actually requested.
+    const resumeCalls = requestGateway.mock.calls.filter(([method]) => method === 'session.resume')
+    expect(resumeCalls.length).toBe(1)
+    expect(resumeCalls[0][1]).toMatchObject({ session_id: 'stored-A' })
+
+    // The corrupt mapping was purged so it can't mis-resolve again.
+    expect(runtimeIdByStoredSessionIdRef.current.has('stored-A')).toBe(false)
+    expect(sessionStateByRuntimeIdRef.current.has('rt-recycled')).toBe(false)
+  })
+
+  it('honours a warm cache entry whose stored id matches (no needless refetch)', async () => {
+    // Correctly-wired mapping: 'rt-A' <-> 'stored-A'. The fast-path should trust
+    // it and never reach session.resume (only the lightweight usage probe).
+    const runtimeIdByStoredSessionIdRef: MutableRefObject<Map<string, string>> = {
+      current: new Map([['stored-A', 'rt-A']])
+    }
+
+    const sessionStateByRuntimeIdRef: MutableRefObject<Map<string, ClientSessionState>> = {
+      current: new Map([['rt-A', clientState('stored-A')]])
+    }
+
+    const requestGateway = vi.fn(async (method: string) => {
+      if (method === 'session.usage') {
+        return { input: 0, output: 0, total: 0 } as never
+      }
+
+      return {} as never
+    })
+
+    let resume: ((storedSessionId: string, replaceRoute?: boolean) => Promise<unknown>) | null = null
+    render(
+      <ResumeHarness
+        onReady={r => (resume = r)}
+        requestGateway={requestGateway}
+        runtimeIdByStoredSessionIdRef={runtimeIdByStoredSessionIdRef}
+        sessionStateByRuntimeIdRef={sessionStateByRuntimeIdRef}
+      />
+    )
+    await waitFor(() => expect(resume).not.toBeNull())
+    await resume!('stored-A', true)
+
+    // Fast-path served the session from cache: no full resume RPC, mapping intact.
+    const methods = requestGateway.mock.calls.map(([method]) => method)
+    expect(methods).not.toContain('session.resume')
+    expect(runtimeIdByStoredSessionIdRef.current.get('stored-A')).toBe('rt-A')
+  })
 })
--- a/apps/desktop/src/app/session/hooks/use-session-actions.ts
+++ b/apps/desktop/src/app/session/hooks/use-session-actions.ts
@@ -252,6 +252,10 @@ function sessionMatchesStoredId(session: SessionInfo, storedSessionId: string):
  return session.id === storedSessionId || session._lineage_root_id === storedSessionId
 }

+function sessionShouldHaveTranscript(session: SessionInfo | undefined): boolean {
+  return (session?.message_count ?? 0) > 0 // a missing session or count is treated as zero
+}
+
 function upsertResolvedSession(session: SessionInfo, storedSessionId: string) {
  const lineage = session._lineage_root_id ?? session.id

@@ -627,9 +631,34 @@ export function useSessionActions({
      // chat view drops the error state and shows the loader again.
      setResumeExhaustedSessionId(current => (current === storedSessionId ? null : current))

-      const warmRuntimeId = runtimeIdByStoredSessionIdRef.current.get(storedSessionId)
+      // A warm cache entry is only trustworthy when it still BELONGS to the
+      // session being resumed. A pooled profile backend that gets idle-reaped
+      // and respawned (pruneSecondaryGateways) re-mints runtime ids, so a
+      // recycled id can resolve to a live-but-DIFFERENT session's cache entry.
+      // The session.usage 404 guard below only catches a fully-DEAD id — a
+      // recycled-live id 200s, so an unchecked hit paints the wrong transcript
+      // under the current route (the "open chat A, chat B loads" bug). On a
+      // mismatch the mapping is cross-wired: purge both sides and report a miss
+      // so the caller falls through to a full resume that rebinds a correct id.
+      const takeWarmCache = (): { runtimeId: string; state: ClientSessionState } | null => {
+        const runtimeId = runtimeIdByStoredSessionIdRef.current.get(storedSessionId)
+        const state = runtimeId ? sessionStateByRuntimeIdRef.current.get(runtimeId) : undefined

-      if (!warmRuntimeId || !sessionStateByRuntimeIdRef.current.get(warmRuntimeId)) {
+        if (!runtimeId || !state) {
+          return null
+        }
+
+        if (state.storedSessionId !== storedSessionId) {
+          runtimeIdByStoredSessionIdRef.current.delete(storedSessionId)
+          sessionStateByRuntimeIdRef.current.delete(runtimeId)
+
+          return null
+        }
+
+        return { runtimeId, state }
+      }
+
+      if (!takeWarmCache()) {
        setActiveSessionId(null)
        activeSessionIdRef.current = null
        setMessages([])
@@ -648,11 +677,15 @@ export function useSessionActions({

      await ensureGatewayProfile(sessionProfile)

-      const cachedRuntimeId = runtimeIdByStoredSessionIdRef.current.get(storedSessionId)
-      const cachedState = cachedRuntimeId && sessionStateByRuntimeIdRef.current.get(cachedRuntimeId)
+      // Re-check after the profile-resolve / gateway-swap awaits above: the
+      // cache may have changed, and takeWarmCache re-validates belongs-to and
+      // purges a cross-wired mapping before we trust the fast-path.
+      const warmHit = takeWarmCache()

-      if (cachedRuntimeId && cachedState) {
-        const stored = $sessions.get().find(session => session.id === storedSessionId)
+      if (warmHit) {
+        const cachedRuntimeId = warmHit.runtimeId
+        const cachedState = warmHit.state
+        const stored = $sessions.get().find(session => sessionMatchesStoredId(session, storedSessionId)) ?? storedForProfile

        const cachedViewState =
          !cachedState.model && stored?.model != null
@@ -666,41 +699,46 @@ export function useSessionActions({
          sessionStateByRuntimeIdRef.current.set(cachedRuntimeId, cachedViewState)
        }

-        setFreshDraftReady(false)
-        clearNotifications()
-        setSelectedStoredSessionId(storedSessionId)
-        selectedStoredSessionIdRef.current = storedSessionId
-        setActiveSessionId(cachedRuntimeId)
-        activeSessionIdRef.current = cachedRuntimeId
-        syncSessionStateToView(cachedRuntimeId, cachedViewState)
-        setCurrentCwd(cachedViewState.cwd)
-        setCurrentBranch(cachedViewState.branch)
-        setSessionStartedAt(Date.now())
-
-        try {
-          const usage = await requestGateway<UsageStats>('session.usage', { session_id: cachedRuntimeId })
-
-          if (!isCurrentResume()) {
-            return
-          }
-
-          if (usage) {
-            setCurrentUsage(current => ({ ...current, ...usage }))
-          }
-
-          return
-        } catch {
-          // The cached runtime id was minted by a prior backend instance. A
-          // pooled profile backend that gets idle-reaped (pruneSecondaryGateways)
-          // and respawned across a profile swap mints fresh ids, so this mapping
-          // now 404s ("session not found"). Drop it and fall through to a full
-          // resume that rebinds a live runtime id.
-          if (!isCurrentResume()) {
-            return
-          }
-
+        if (sessionShouldHaveTranscript(stored) && cachedViewState.messages.length === 0) {
          runtimeIdByStoredSessionIdRef.current.delete(storedSessionId)
          sessionStateByRuntimeIdRef.current.delete(cachedRuntimeId)
+        } else {
+          setFreshDraftReady(false)
+          clearNotifications()
+          setSelectedStoredSessionId(storedSessionId)
+          selectedStoredSessionIdRef.current = storedSessionId
+          setActiveSessionId(cachedRuntimeId)
+          activeSessionIdRef.current = cachedRuntimeId
+          syncSessionStateToView(cachedRuntimeId, cachedViewState)
+          setCurrentCwd(cachedViewState.cwd)
+          setCurrentBranch(cachedViewState.branch)
+          setSessionStartedAt(Date.now())
+
+          try {
+            const usage = await requestGateway<UsageStats>('session.usage', { session_id: cachedRuntimeId })
+
+            if (!isCurrentResume()) {
+              return
+            }
+
+            if (usage) {
+              setCurrentUsage(current => ({ ...current, ...usage }))
+            }
+
+            return
+          } catch {
+            // The cached runtime id was minted by a prior backend instance. A
+            // pooled profile backend that gets idle-reaped (pruneSecondaryGateways)
+            // and respawned across a profile swap mints fresh ids, so this mapping
+            // now 404s ("session not found"). Drop it and fall through to a full
+            // resume that rebinds a live runtime id.
+            if (!isCurrentResume()) {
+              return
+            }
+
+            runtimeIdByStoredSessionIdRef.current.delete(storedSessionId)
+            sessionStateByRuntimeIdRef.current.delete(cachedRuntimeId)
+          }
        }
      }

@@ -714,7 +752,7 @@ export function useSessionActions({
      setSelectedStoredSessionId(storedSessionId)
      selectedStoredSessionIdRef.current = storedSessionId
      setSessionStartedAt(Date.now())
-      const stored = $sessions.get().find(session => sessionMatchesStoredId(session, storedSessionId))
+      const stored = $sessions.get().find(session => sessionMatchesStoredId(session, storedSessionId)) ?? storedForProfile
      applyStoredSessionPreviewRuntimeInfo(stored)

      if (stored) {
@@ -804,6 +842,15 @@ export function useSessionActions({
            ? currentMessages
            : preserveLocalAssistantErrors(preferredMessages, currentMessages)

+        if (sessionShouldHaveTranscript(stored) && messagesForView.length === 0) {
+          setActiveSessionId(null)
+          activeSessionIdRef.current = null
+          setResumeFailedSessionId(storedSessionId)
+          resumedRunning = false
+
+          return
+        }
+
        setActiveSessionId(resumed.session_id)
        activeSessionIdRef.current = resumed.session_id
        const runtimeInfo = applyRuntimeInfo(resumed.info)
--- a/apps/desktop/src/app/settings/index.tsx
+++ b/apps/desktop/src/app/settings/index.tsx
@@ -213,7 +213,7 @@ export function SettingsView({ gateway, onClose, onConfigSaved, onMainModelChang
          </div>
        </OverlaySidebar>

-        <OverlayMain className="px-0 pb-0 pt-[calc(var(--titlebar-height)+1rem)]">
+        <OverlayMain className="px-0 pb-0 pt-[calc(var(--titlebar-height)/2+1rem)]">
          {activeView === 'config:appearance' ? (
            <AppearanceSettings />
          ) : activeView === 'about' ? (
--- a/apps/desktop/src/app/settings/model-settings.tsx
+++ b/apps/desktop/src/app/settings/model-settings.tsx
@@ -78,6 +78,12 @@ const AUX_TASKS: readonly AuxTaskMeta[] = [

 const NO_PROVIDERS: readonly ModelOptionProvider[] = [{ name: '—', slug: '', models: [] }]

+// Radix <Select> leaves its trigger blank when `value` has no matching <SelectItem>.
+// Return `models` as-is when `active` is empty or already listed; otherwise put
+// `active` first so a custom (config-only) model stays visible and selectable.
+export const withActive = (models: readonly string[], active: string): readonly string[] =>
+  !active || models.includes(active) ? models : [active, ...models]
+
 interface StaleAuxWarningProps {
  applying: boolean
  onReset: () => void
@@ -555,7 +561,7 @@ export function ModelSettings({ onMainModelChanged }: ModelSettingsProps) {
                  <SelectValue placeholder={m.model} />
                </SelectTrigger>
                <SelectContent>
-                  {(selectedProviderModels.length ? selectedProviderModels : []).map(model => (
+                  {withActive(selectedProviderModels, selectedModel).map(model => (
                    <SelectItem key={model} value={model}>
                      {model}
                    </SelectItem>
@@ -708,7 +714,7 @@ export function ModelSettings({ onMainModelChanged }: ModelSettingsProps) {
                          <SelectValue placeholder={m.model} />
                        </SelectTrigger>
                        <SelectContent>
-                          {(auxDraftProviderModels.length ? auxDraftProviderModels : []).map(model => (
+                          {withActive(auxDraftProviderModels, auxDraft.model).map(model => (
                            <SelectItem key={model} value={model}>
                              {model}
                            </SelectItem>
@@ -880,7 +886,7 @@ export function ModelSettings({ onMainModelChanged }: ModelSettingsProps) {
                        <SelectValue placeholder={m.model} />
                      </SelectTrigger>
                      <SelectContent>
-                        {modelsForProvider(slot.provider).map(model => (
+                        {withActive(modelsForProvider(slot.provider), slot.model).map(model => (
                          <SelectItem key={model} value={model}>
                            {model}
                          </SelectItem>
@@ -957,7 +963,10 @@ export function ModelSettings({ onMainModelChanged }: ModelSettingsProps) {
                      <SelectValue placeholder={m.model} />
                    </SelectTrigger>
                    <SelectContent>
-                      {modelsForProvider(currentMoaPreset.aggregator.provider).map(model => (
+                      {withActive(
+                        modelsForProvider(currentMoaPreset.aggregator.provider),
+                        currentMoaPreset.aggregator.model
+                      ).map(model => (
                        <SelectItem key={model} value={model}>
                          {model}
                        </SelectItem>
--- a/apps/desktop/src/app/settings/with-active.test.ts
+++ b/apps/desktop/src/app/settings/with-active.test.ts
@@ -0,0 +1,34 @@
+import { describe, expect, it } from 'vitest'
+
+import { withActive } from './model-settings'
+
+// Radix leaves a <Select> trigger blank whenever the controlled `value` has
+// no matching <SelectItem>. These cases pin the `withActive` contract: the
+// active model is always present in the option list, so a model that exists
+// only in config (or is custom) still renders.
+describe('withActive', () => {
+  const curated = ['hermes-4', 'hermes-4-mini']
+  const custom = 'anthropic/claude-opus-4.7'
+
+  it('prepends a custom model missing from the curated list', () => {
+    const options = withActive(curated, custom)
+
+    expect(options).toEqual([custom, ...curated])
+  })
+
+  it('leaves the list untouched when the active model is already curated', () => {
+    const options = withActive(curated, 'hermes-4')
+
+    expect(options).toEqual(curated)
+  })
+
+  it('does not inject an empty active value', () => {
+    const options = withActive(curated, '')
+
+    expect(options).toEqual(curated)
+  })
+
+  it('surfaces the active model even when the curated list is empty', () => {
+    const options = withActive([], custom)
+
+    expect(options).toEqual([custom])
+  })
+
+  it('keeps the active model selectable as the invariant', () => {
+    expect(withActive(curated, 'custom/model')).toContain('custom/model')
+  })
+})
--- a/apps/desktop/src/app/shell/app-shell.tsx
+++ b/apps/desktop/src/app/shell/app-shell.tsx
@@ -192,7 +192,7 @@ export function AppShell({
      {nativeOverlayWidth > 0 && (
        <div
          aria-hidden
-          className="pointer-events-none fixed inset-x-0 top-0 z-[4] h-(--titlebar-height) border-b border-(--ui-stroke-tertiary) bg-(--ui-chat-surface-background)"
+          className="pointer-events-none fixed right-0 top-0 z-[4] h-(--titlebar-height) w-(--titlebar-tools-right) border-b border-(--ui-stroke-tertiary) bg-(--ui-chat-surface-background)"
        />
      )}

--- a/apps/desktop/src/app/shell/context-usage-panel.tsx
+++ b/apps/desktop/src/app/shell/context-usage-panel.tsx
@@ -0,0 +1,147 @@
+import { useEffect, useMemo, useState } from 'react'
+
+import { useI18n } from '@/i18n'
+import { formatK } from '@/lib/statusbar'
+import { cn } from '@/lib/utils'
+import type { ContextBreakdown, ContextUsageCategory, UsageStats } from '@/types/hermes'
+
+// Props accepted by ContextUsagePanel.
+interface ContextUsagePanelProps {
+  // Fallback source for context max / used / percent when no breakdown is loaded.
+  currentUsage: UsageStats
+  // Gateway request helper; the panel calls it with 'session.context_breakdown'.
+  requestGateway: <T = unknown>(method: string, params?: Record<string, unknown>) => Promise<T>
+  // Session whose context is inspected; when null no request is made.
+  sessionId: string | null
+}
+
+/**
+ * Popover body that shows how the session's context usage splits by category.
+ *
+ * Requests `session.context_breakdown` from the gateway whenever `sessionId`
+ * or `requestGateway` changes. Until a breakdown for the *current* session is
+ * available (or when the request fails) the header totals fall back to
+ * `currentUsage` and the category list is empty.
+ *
+ * @param currentUsage - Fallback values for context max / used / percent.
+ * @param requestGateway - Gateway request helper used to fetch the breakdown.
+ * @param sessionId - Session to inspect; `null` skips the request entirely.
+ */
+export function ContextUsagePanel({ currentUsage, requestGateway, sessionId }: ContextUsagePanelProps) {
+  const { t } = useI18n()
+  const copy = t.shell.statusbar.contextUsagePanel
+
+  // Each result is tagged with the session it was fetched for, so a breakdown
+  // belonging to a previously selected session is never rendered (and never
+  // overrides `currentUsage`) while the next session's request is in flight.
+  const [result, setResult] = useState<{ data: ContextBreakdown; sessionId: string } | null>(null)
+  const [loading, setLoading] = useState(false)
+
+  useEffect(() => {
+    if (!sessionId) {
+      setResult(null)
+      setLoading(false)
+
+      return
+    }
+
+    // Captured after the guard so the async callbacks see a plain `string`.
+    const requestedSessionId: string = sessionId
+    // Flipped by the cleanup so a superseded request cannot write state.
+    let cancelled = false
+    setLoading(true)
+
+    void requestGateway<ContextBreakdown>('session.context_breakdown', { session_id: requestedSessionId })
+      .then(data => {
+        if (!cancelled) {
+          setResult({ data, sessionId: requestedSessionId })
+        }
+      })
+      .catch(() => {
+        // Best-effort: on failure drop the breakdown and fall back to `currentUsage`.
+        if (!cancelled) {
+          setResult(null)
+        }
+      })
+      .finally(() => {
+        if (!cancelled) {
+          setLoading(false)
+        }
+      })
+
+    return () => {
+      cancelled = true
+    }
+  }, [requestGateway, sessionId])
+
+  // Only trust the stored breakdown when it was fetched for the session shown now.
+  const breakdown = result !== null && result.sessionId === sessionId ? result.data : null
+
+  const contextMax = breakdown?.context_max ?? currentUsage.context_max ?? 0
+  const contextUsed = breakdown?.context_used ?? currentUsage.context_used ?? 0
+  // Rounded and clamped to the 0–100 range before display.
+  const contextPercent = Math.max(
+    0,
+    Math.min(100, Math.round(breakdown?.context_percent ?? currentUsage.context_percent ?? 0))
+  )
+
+  // Prefer the localized label for a known category id; otherwise keep the
+  // label supplied with the breakdown.
+  const categories = useMemo(
+    () =>
+      (breakdown?.categories ?? []).map(category => ({
+        ...category,
+        label: copy.categories[category.id as keyof typeof copy.categories] ?? category.label
+      })),
+    [breakdown?.categories, copy.categories]
+  )
+
+  // Denominator for the bar segments; the `|| 1` tail keeps it non-zero.
+  const segmentTotal = categories.reduce((sum, category) => sum + category.tokens, 0) || contextUsed || 1
+
+  return (
+    <div className="flex w-72 flex-col gap-3 p-3 text-[0.75rem]" data-slot="context-usage-panel">
+      <div className="flex items-baseline justify-between gap-2">
+        <p className="font-medium text-foreground">{copy.title}</p>
+
+        <span className="text-[0.6875rem] text-muted-foreground">
+          {copy.tokenSummary(`~${formatK(contextUsed)}`, formatK(contextMax))}
+        </span>
+      </div>
+
+      <p className="text-[0.6875rem] text-foreground">{copy.percentFull(contextPercent)}</p>
+
+      <ContextUsageBar categories={categories} segmentTotal={segmentTotal} />
+
+      <ul className="flex flex-col gap-1.5">
+        {categories.map(category => (
+          <li className="flex items-center justify-between gap-2" key={category.id}>
+            <span className="flex min-w-0 items-center gap-2">
+              <span
+                className="size-2 shrink-0 rounded-[2px]"
+                style={{ background: category.color }}
+              />
+
+              <span className="truncate text-muted-foreground">{category.label}</span>
+            </span>
+
+            <span className="shrink-0 tabular-nums text-foreground">{formatCategoryTokens(category.tokens)}</span>
+          </li>
+        ))}
+      </ul>
+
+      {loading && <p className="text-[0.6875rem] text-muted-foreground">{copy.loading}</p>}
+
+      {!loading && !categories.length && <p className="text-[0.6875rem] text-muted-foreground">{copy.empty}</p>}
+    </div>
+  )
+}
+
+/**
+ * Thin stacked bar with one colored segment per usage category.
+ *
+ * Each segment's width is its token count expressed as a percentage of
+ * `segmentTotal`. With no categories the track switches to the `dither`
+ * placeholder classes and no segments are rendered.
+ */
+function ContextUsageBar(props: { categories: readonly ContextUsageCategory[]; segmentTotal: number }) {
+  const { categories, segmentTotal } = props
+  const trackTone = categories.length ? 'bg-(--ui-stroke-tertiary)' : 'dither bg-(--ui-bg-elevated)'
+
+  const segments = categories.map(({ color, id, tokens }) => {
+    const share = (tokens / segmentTotal) * 100
+
+    return <span className="h-full min-w-px" key={id} style={{ background: color, width: `${share}%` }} />
+  })
+
+  return (
+    <div className={cn('flex h-1.5 overflow-hidden rounded-full', trackTone)} data-slot="context-usage-bar">
+      {segments}
+    </div>
+  )
+}
+
+/**
+ * Formats one category's token count for the legend.
+ *
+ * Non-finite and non-positive values render as `'0'`, values of 1,000 or
+ * more go through `formatK`, and anything smaller uses the runtime's
+ * default locale number formatting.
+ */
+function formatCategoryTokens(value: number): string {
+  const hasTokens = Number.isFinite(value) && value > 0
+
+  if (!hasTokens) {
+    return '0'
+  }
+
+  return value >= 1_000 ? `${formatK(value)}` : value.toLocaleString()
+}
--- a/Show More
+++ b/Show More