refactor(desktop): extract filesystem IPC handlers from main.cjs into fs-ipc.cjs

Second main.cjs cluster peel (after git-ipc). The six hermes:fs:* handlers (readDir, gitRoot, reveal, rename, writeText, trash) move verbatim into electron/fs-ipc.cjs behind a registerFsIpc({ ipcMain, directoryExists, expandUserPath }) registrar — same injection pattern as registerGitIpc. Path hardening / read-dir / git-root come from their sibling modules directly; the two main-process path helpers are injected so the module stays side-effect free. Channel names are unchanged, so preload + renderer are untouched. main.cjs drops ~85 lines; the now-dead fs-read-dir / git-root requires in main.cjs are removed. Adds electron/fs-ipc.test.cjs asserting the hermes:fs:* surface by invariant.
refactor(desktop): assert git-ipc surface by invariant, drop channel snapshot
2026-07-05 01:27:52 +08:00 · 2026-06-30 13:26:53 -05:00 · 2026-06-30 02:05:07 -05:00 · 2026-06-30 01:42:33 -05:00 · 2026-06-30 01:10:08 -05:00 · 2026-06-30 00:59:59 -05:00
1456 changed files with 134638 additions and 16623 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -66,8 +66,12 @@ runtime/

 # ---------- Not needed inside the Docker image ----------

-# Desktop app source (Tauri/Electron); never installed in the container
+# Desktop app source (Tauri/Electron); never installed in the container.
+# apps/shared is the dashboard↔desktop websocket helper and is linked from
+# web/package.json as a file: workspace dep — keep it in the build context.
 apps/
+!apps/shared/
+!apps/shared/**

 # Test suite — not shipped in production images
 tests/
--- a/.envrc
+++ b/.envrc
@@ -1,5 +1,5 @@
 watch_file pyproject.toml uv.lock
 watch_file package-lock.json package.json web/package.json ui-tui/package.json website/package.json apps/shared/package.json apps/desktop/package.json ui-tui/packages/hermes-ink/package.json
-watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix
+watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix nix/hermes-agent.nix nix/desktop.nix

 use flake
--- a/.github/actions/hermes-smoke-test/action.yml
+++ b/.github/actions/hermes-smoke-test/action.yml
@@ -1,50 +0,0 @@
-name: Hermes smoke test
-description: >
-  Run the image's built-in entrypoint against `--help` and `dashboard --help`
-  to catch basic runtime regressions before publishing.  Requires the image
-  to already be loaded into the local Docker daemon under `image`.
-
-  Works identically on amd64 and arm64 runners.
-
-inputs:
-  image:
-    description: Fully-qualified image tag (e.g. nousresearch/hermes-agent:test)
-    required: true
-
-runs:
-  using: composite
-  steps:
-    - name: Ensure /tmp/hermes-test is hermes-writable
-      shell: bash
-      run: |
-        # The image runs as the hermes user (UID 10000).  GitHub Actions
-        # creates /tmp/hermes-test root-owned by default, which hermes
-        # can't write to — chown it to match the in-container UID before
-        # bind-mounting.  Real users doing `docker run -v ~/.hermes:...`
-        # with their own UID hit the same issue and have their own
-        # remediations (HERMES_UID env var, or chown locally).
-        mkdir -p /tmp/hermes-test
-        sudo chown -R 10000:10000 /tmp/hermes-test
-
-    - name: hermes --help
-      shell: bash
-      run: |
-        # Use the image's real ENTRYPOINT (/init + main-wrapper.sh) so
-        # this exercises the actual production startup path. PR #30136
-        # review caught that an --entrypoint override here had been
-        # silently neutered by the s6-overlay migration — stage2-hook
-        # ignores its CMD args, so the smoke test was a no-op.
-        docker run --rm \
-          -v /tmp/hermes-test:/opt/data \
-          "${{ inputs.image }}" --help
-
-    - name: hermes dashboard --help
-      shell: bash
-      run: |
-        # Regression guard for #9153: dashboard was present in source but
-        # missing from the published image.  If this fails, something in
-        # the Dockerfile is excluding the dashboard subcommand from the
-        # installed package.
-        docker run --rm \
-          -v /tmp/hermes-test:/opt/data \
-          "${{ inputs.image }}" dashboard --help
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,7 +12,6 @@ name: CI

 on:
  pull_request:
-    branches: [main]
  push:
    branches: [main]

@@ -21,6 +20,7 @@ permissions:
  pull-requests: write # needed by lint (PR comment) + supply-chain (PR comment)
  actions: read # needed by osv-scanner (SARIF upload)
  security-events: write # needed by osv-scanner (SARIF upload)
+  packages: write # needed by docker build

 concurrency:
  group: ci-${{ github.ref }}
@@ -33,6 +33,7 @@ jobs:
  # (all lanes true) so post-merge validation is never weakened.
  # ─────────────────────────────────────────────────────────────────────
  detect:
+    name: Detect affected areas
    runs-on: ubuntu-latest
    outputs:
      python: ${{ steps.classify.outputs.python }}
@@ -54,11 +55,15 @@ jobs:
  # Skipped workflows (if condition is false) don't spin up runners.
  # ─────────────────────────────────────────────────────────────────────
  tests:
+    name: Python tests
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/tests.yml
+    with:
+      slice_count: 8

  lint:
+    name: Python lints
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/lint.yml
@@ -66,35 +71,49 @@ jobs:
      event_name: ${{ needs.detect.outputs.event_name }}

  typecheck:
+    name: TypeScript
    needs: detect
    if: needs.detect.outputs.frontend == 'true'
    uses: ./.github/workflows/typecheck.yml

  docs-site:
+    name: Docs Site
    needs: detect
    if: needs.detect.outputs.site == 'true'
    uses: ./.github/workflows/docs-site-checks.yml

  history-check:
+    name: Deny unrelated histories
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request'
    uses: ./.github/workflows/history-check.yml

  contributor-check:
+    name: Check contributors
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/contributor-check.yml

  uv-lockfile:
+    name: Check uv.lock
    needs: detect
    uses: ./.github/workflows/uv-lockfile-check.yml

  docker-lint:
+    name: Lint Docker scripts
    needs: detect
    if: needs.detect.outputs.docker_meta == 'true'
    uses: ./.github/workflows/docker-lint.yml

+  docker:
+    name: Build&Test Docker image
+    needs: detect
+    if: needs.detect.outputs.python == 'true' || needs.detect.outputs.frontend == 'true' || needs.detect.outputs.docker_meta == 'true'
+    uses: ./.github/workflows/docker.yml
+    secrets: inherit
+
  supply-chain:
+    name: Supply-chain scan
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request' && (needs.detect.outputs.scan == 'true' || needs.detect.outputs.deps == 'true' || needs.detect.outputs.mcp_catalog == 'true')
    uses: ./.github/workflows/supply-chain-audit.yml
@@ -105,7 +124,7 @@ jobs:
      mcp_catalog: ${{ needs.detect.outputs.mcp_catalog == 'true' }}

  osv-scanner:
-    needs: detect
+    name: OSV scan
    uses: ./.github/workflows/osv-scanner.yml

  # ─────────────────────────────────────────────────────────────────────
@@ -128,6 +147,8 @@ jobs:
      - docker-lint
      - supply-chain
      - osv-scanner
+      # docker is deliberately not a required check for now: it is too slow to gate merges on.
+      # - docker
    if: always()
    runs-on: ubuntu-latest
    steps:
@@ -144,3 +165,67 @@ jobs:
              sys.exit(1)
          print('All checks passed (or were skipped)')
          "
+
+  # ─────────────────────────────────────────────────────────────────────
+  # CI timing report: collect per-job/step durations from the GitHub API,
+  # cache them on main (as a baseline), and on PRs generate an HTML diff
+  # report with a gantt chart + per-step breakdown. The report is uploaded
+  # as an artifact and a markdown summary is written to $GITHUB_STEP_SUMMARY.
+  # ─────────────────────────────────────────────────────────────────────
+  ci-timings:
+    name: CI timing report
+    needs: [all-checks-pass, docker]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Restore baseline cache (PR only)
+        if: github.event_name == 'pull_request'
+        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        with:
+          path: ci-timings-baseline.json
+          # The exact key below is a placeholder that never hits (saved keys end in
+          # the run id), so restore-keys prefix-matches the most recent main baseline.
+          key: ci-timings-baseline-never-exact
+          restore-keys: |
+            ci-timings-baseline-
+
+      - name: Collect timings and generate report
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          python3 scripts/ci/timings_report.py \
+            --baseline ci-timings-baseline.json \
+            --output ci-timings-report.html \
+            --json-out ci-timings.json \
+            --summary-out ci-timings-summary.md
+
+      - name: Upload HTML report
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        id: ci-timings-artifact
+        with:
+          name: ci-timings-report
+          path: ci-timings-report.html
+          retention-days: 14
+          archive: false
+
+      - name: Output summary
+        env:
+          REPORT_URL: ${{ steps.ci-timings-artifact.outputs.artifact-url}}
+        run: |
+          echo "# CI Timing report" >> "$GITHUB_STEP_SUMMARY"
+          echo "[View the full interactive report]($REPORT_URL)" >> "$GITHUB_STEP_SUMMARY"
+          cat ci-timings-summary.md >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Save baseline cache (main only)
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        run: cp ci-timings.json ci-timings-baseline.json
+
+      - name: Upload baseline to cache (main only)
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        with:
+          path: ci-timings-baseline.json
+          key: ci-timings-baseline-${{ github.run_id }}
--- a/.github/workflows/docker-lint.yml
+++ b/.github/workflows/docker-lint.yml
@@ -2,7 +2,7 @@ name: Docker / shell lint

 # Lints the container build inputs: Dockerfile (via hadolint) and any shell
 # scripts under docker/ (via shellcheck). These catch the class of regression
-# the behavioral docker-publish smoke test can't — unquoted variable
+# the behavioral docker smoke test can't — unquoted variable
 # expansions, silently-failing RUN commands, etc.
 #
 # Rules and ignores are documented in .hadolint.yaml at the repo root.
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,353 +0,0 @@
-name: Docker Build and Publish
-
-on:
-  push:
-    branches: [main]
-    paths:
-      - '**/*.py'
-      - 'pyproject.toml'
-      - 'uv.lock'
-      - 'Dockerfile'
-      - 'docker/**'
-      - '.github/workflows/docker-publish.yml'
-      - '.github/actions/hermes-smoke-test/**'
-
-  # No paths filter — the job must always run so the required check
-  # reports a status (path-gated workflows leave checks "pending" forever
-  # when no matching files change, which blocks merge).
-  pull_request:
-    branches: [main]
-
-  release:
-    types: [published]
-
-permissions:
-  contents: read
-  # Needed so the arm64 job can push/pull its registry-backed build cache
-  # to ghcr.io (cache-to/cache-from type=registry).  See the build-arm64
-  # job for why registry cache replaced the gha cache on that arch.
-  packages: write
-
-# Concurrency: push/release runs are NEVER cancelled so every merge gets
-# its own image.  PR runs reuse a PR-scoped group with
-# cancel-in-progress: true so rapid pushes to the same PR collapse to the
-# latest commit.
-concurrency:
-  group: docker-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-env:
-  IMAGE_NAME: nousresearch/hermes-agent
-
-jobs:
-  # ---------------------------------------------------------------------------
-  # Build amd64 natively.  This job also runs the smoke tests (basic --help
-  # and the dashboard subcommand regression guard from #9153), because amd64
-  # is the only arch we can `load` into the local daemon on an amd64 runner.
-  # ---------------------------------------------------------------------------
-  build-amd64:
-    # Only run on the upstream repository, not on forks
-    if: github.repository == 'NousResearch/hermes-agent'
-    runs-on: ubuntu-latest
-    timeout-minutes: 45
-    outputs:
-      digest: ${{ steps.push.outputs.digest }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-
-      # The image build + smoke test + integration tests run ONLY on
-      # push-to-main and release — never on PRs. They are the heaviest jobs
-      # in CI (~15-45 min) and a broken build surfaces on the main push (and
-      # is gated pre-merge by docker-lint + uv-lockfile-check). Every step
-      # below is skipped on PRs, so the job still reports green and the
-      # required check never hangs.
-      - name: Set up Docker Buildx
-        if: github.event_name != 'pull_request'
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
-
-      # Build once, load into the local daemon for smoke testing.  Cached
-      # to gha with a per-arch scope; the push step below reuses every
-      # layer from this build.
-      - name: Build image (amd64, smoke test)
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
-        with:
-          context: .
-          file: Dockerfile
-          load: true
-          platforms: linux/amd64
-          tags: ${{ env.IMAGE_NAME }}:test
-          build-args: |
-            HERMES_GIT_SHA=${{ github.sha }}
-          cache-from: type=gha,scope=docker-amd64
-          cache-to: type=gha,mode=max,scope=docker-amd64
-
-      - name: Smoke test image
-        if: github.event_name != 'pull_request'
-        uses: ./.github/actions/hermes-smoke-test
-        with:
-          image: ${{ env.IMAGE_NAME }}:test
-
-      # ---------------------------------------------------------------------
-      # Run the docker-integration test suite against the freshly-built
-      # image already loaded into the local daemon (`:test`).  These tests
-      # are excluded from the sharded `tests.yml :: test` matrix on purpose
-      # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each
-      # shard would otherwise reach the session-scoped ``built_image``
-      # fixture in ``tests/docker/conftest.py`` and start a 3-7min
-      # ``docker build`` — guaranteed to
-      # die in fixture setup.
-      #
-      # Piggybacking here avoids a second image build: the smoke test
-      # already proved the image loads + runs, so the daemon has it under
-      # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at
-      # that.  The fixture's ``HERMES_TEST_IMAGE`` branch (see
-      # tests/docker/conftest.py:62-63) short-circuits the rebuild.
-      #
-      # Why this job and not a standalone one: the image is 5GB+; passing
-      # it between jobs via ``docker save``/``upload-artifact`` is slower
-      # than the build itself.  Reusing the existing daemon state is the
-      # cheapest path to coverage on every PR that touches docker code.
-      # ---------------------------------------------------------------------
-      - name: Install uv (for docker tests)
-        if: github.event_name != 'pull_request'
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5
-
-      - name: Set up Python 3.11 (for docker tests)
-        if: github.event_name != 'pull_request'
-        run: uv python install 3.11
-
-      - name: Install Python dependencies (for docker tests)
-        if: github.event_name != 'pull_request'
-        run: |
-          uv venv .venv --python 3.11
-          source .venv/bin/activate
-          # ``dev`` extra pulls in pytest, pytest-asyncio —
-          # everything tests/docker/ needs.  We deliberately avoid ``all``
-          # here because the docker tests only drive the container via
-          # subprocess and don't import hermes_agent's optional deps.
-          uv pip install -e ".[dev]"
-
-      - name: Run docker integration tests
-        if: github.event_name != 'pull_request'
-        env:
-          # Skip rebuild; use the image already loaded by the build step.
-          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
-          # Match the policy in tests.yml :: test job — no accidental
-          # real-API calls from inside the harness.
-          OPENROUTER_API_KEY: ""
-          OPENAI_API_KEY: ""
-          NOUS_API_KEY: ""
-        run: |
-          source .venv/bin/activate
-          python -m pytest tests/docker/ -v --tb=short
-
-      - name: Log in to Docker Hub
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-      # Push amd64 by digest only (no tag).  The merge job assembles the
-      # tagged manifest list.  `push-by-digest=true` is docker's recommended
-      # pattern for multi-runner multi-platform builds.
-      - name: Push amd64 by digest
-        id: push
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
-        with:
-          context: .
-          file: Dockerfile
-          platforms: linux/amd64
-          labels: |
-            org.opencontainers.image.revision=${{ github.sha }}
-          build-args: |
-            HERMES_GIT_SHA=${{ github.sha }}
-          outputs: type=image,name=${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true
-          cache-from: type=gha,scope=docker-amd64
-          cache-to: type=gha,mode=max,scope=docker-amd64
-
-      # Write the digest to a file and upload it as an artifact so the
-      # merge job can stitch both per-arch digests into a manifest list.
-      - name: Export digest
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        run: |
-          mkdir -p /tmp/digests
-          digest="${{ steps.push.outputs.digest }}"
-          touch "/tmp/digests/${digest#sha256:}"
-
-      - name: Upload digest artifact
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
-        with:
-          name: digest-amd64
-          path: /tmp/digests/*
-          if-no-files-found: error
-          retention-days: 1
-
-  # ---------------------------------------------------------------------------
-  # Build arm64 natively on GitHub's free arm64 runner.  This replaces the
-  # previous QEMU-emulated arm64 build, which was ~5-10x slower and shared
-  # a cache scope with amd64.  Matches the amd64 job's shape: build+load,
-  # smoke test, then on push/release push by digest.
-  # ---------------------------------------------------------------------------
-  build-arm64:
-    if: github.repository == 'NousResearch/hermes-agent'
-    runs-on: ubuntu-24.04-arm
-    timeout-minutes: 45
-    outputs:
-      digest: ${{ steps.push.outputs.digest }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-
-      # arm64 build runs only on push-to-main and release (see build-amd64).
-      - name: Set up Docker Buildx
-        if: github.event_name != 'pull_request'
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
-
-      # Log in to ghcr.io so the registry-backed build cache below can be
-      # read (cache-from) on every event and written (cache-to) on
-      # push/release.  Uses the workflow's GITHUB_TOKEN, which is valid for
-      # the whole job — unlike the gha cache backend's short-lived Azure SAS
-      # token, which expired mid-build on slow cold-cache arm64 runs and
-      # crashed the build before the smoke test (the reason the gha cache
-      # was removed from arm64 PRs in the first place).
-      - name: Log in to ghcr.io (build cache)
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      # Build once, load into the local daemon for smoke testing, then push
-      # by digest below. Reads AND writes the registry-backed cache so the
-      # push reuses layers from this build and the next build starts warm.
-      #
-      # Registry cache (type=registry on ghcr.io) is used instead of the gha
-      # cache that previously broke here: its credential is the job-lifetime
-      # GITHUB_TOKEN, not a short-lived SAS token, so the cold-build-outlives-
-      # token failure mode cannot recur.
-      - name: Build image (arm64, smoke test, cached publish)
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
-        with:
-          context: .
-          file: Dockerfile
-          load: true
-          platforms: linux/arm64
-          tags: ${{ env.IMAGE_NAME }}:test
-          build-args: |
-            HERMES_GIT_SHA=${{ github.sha }}
-          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
-          cache-to: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64,mode=max
-
-      - name: Smoke test image
-        if: github.event_name != 'pull_request'
-        uses: ./.github/actions/hermes-smoke-test
-        with:
-          image: ${{ env.IMAGE_NAME }}:test
-
-      - name: Log in to Docker Hub
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-      - name: Push arm64 by digest
-        id: push
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
-        with:
-          context: .
-          file: Dockerfile
-          platforms: linux/arm64
-          labels: |
-            org.opencontainers.image.revision=${{ github.sha }}
-          build-args: |
-            HERMES_GIT_SHA=${{ github.sha }}
-          outputs: type=image,name=${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true
-          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
-          cache-to: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64,mode=max
-
-      - name: Export digest
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        run: |
-          mkdir -p /tmp/digests
-          digest="${{ steps.push.outputs.digest }}"
-          touch "/tmp/digests/${digest#sha256:}"
-
-      - name: Upload digest artifact
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
-        with:
-          name: digest-arm64
-          path: /tmp/digests/*
-          if-no-files-found: error
-          retention-days: 1
-
-  # ---------------------------------------------------------------------------
-  # Stitch both per-arch digests into a single tagged multi-arch manifest.
-  # This is a registry-side operation — no building, no layer re-push —
-  # so it runs in ~30 seconds.
-  #
-  # On main pushes: tags both :main and :latest.
-  # On releases: tags :<release_tag_name>.
-  # ---------------------------------------------------------------------------
-  merge:
-    if: github.repository == 'NousResearch/hermes-agent' && (github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release')
-    runs-on: ubuntu-latest
-    needs: [build-amd64, build-arm64]
-    timeout-minutes: 10
-    steps:
-      - name: Download digests
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
-        with:
-          path: /tmp/digests
-          pattern: digest-*
-          merge-multiple: true
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-      - name: Create manifest list and push
-        working-directory: /tmp/digests
-        run: |
-          set -euo pipefail
-          args=()
-          for digest_file in *; do
-            args+=("${IMAGE_NAME}@sha256:${digest_file}")
-          done
-          if [ "${{ github.event_name }}" = "release" ]; then
-            TAG="${{ github.event.release.tag_name }}"
-            docker buildx imagetools create \
-              -t "${IMAGE_NAME}:${TAG}" \
-              "${args[@]}"
-          else
-            docker buildx imagetools create \
-              -t "${IMAGE_NAME}:main" \
-              -t "${IMAGE_NAME}:latest" \
-              "${args[@]}"
-          fi
-        env:
-          IMAGE_NAME: ${{ env.IMAGE_NAME }}
-
-      - name: Inspect image
-        run: |
-          if [ "${{ github.event_name }}" = "release" ]; then
-            docker buildx imagetools inspect "${IMAGE_NAME}:${{ github.event.release.tag_name }}"
-          else
-            docker buildx imagetools inspect "${IMAGE_NAME}:main"
-          fi
-        env:
-          IMAGE_NAME: ${{ env.IMAGE_NAME }}
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,209 @@
+name: Docker Build, Test, and Publish
+
+on:
+  release:
+    types: [published]
+  workflow_call:
+
+permissions:
+  contents: read
+
+# Concurrency: push/release runs are NEVER cancelled so every merge gets
+# its own image.  PR runs reuse a PR-scoped group with
+# cancel-in-progress: true so rapid pushes to the same PR collapse to
+# the latest commit.
+concurrency:
+  group: docker-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+env:
+  IMAGE_NAME: nousresearch/hermes-agent
+
+jobs:
+  # Build, test, and optionally push the image for each architecture.
+  build:
+    if: github.repository == 'NousResearch/hermes-agent'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: amd64
+            runner: ubuntu-latest
+            platform: linux/amd64
+            cache-from: type=gha,scope=docker-amd64
+            cache-to: type=gha,mode=max,scope=docker-amd64
+          - arch: arm64
+            runner: ubuntu-24.04-arm
+            platform: linux/arm64
+            cache-from: type=gha,scope=docker-arm64
+            cache-to: type=gha,mode=max,scope=docker-arm64
+
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 45
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+
+      # Build once, load into the local daemon for testing.  Cached
+      # per-arch; the push step below reuses every layer from this build.
+      - name: Build image (${{ matrix.arch }})
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        with:
+          context: .
+          file: Dockerfile
+          load: true
+          platforms: ${{ matrix.platform }}
+          tags: ${{ env.IMAGE_NAME }}:test
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
+          cache-from: ${{ matrix.cache-from }}
+          cache-to: ${{ (github.event_name != 'pull_request') && matrix.cache-to || '' }}
+
+      - name: Log in to Docker Hub
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      # Push by digest only (no tag).  The merge job assembles the
+      # tagged manifest list.  `push-by-digest=true` is docker's recommended
+      # pattern for multi-runner multi-platform builds.
+      - name: Push ${{ matrix.arch }} by digest
+        id: push
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        with:
+          context: .
+          file: Dockerfile
+          platforms: ${{ matrix.platform }}
+          labels: |
+            org.opencontainers.image.revision=${{ github.sha }}
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
+          outputs: type=image,name=${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: ${{ matrix.cache-from }}
+          cache-to: ${{ matrix.cache-to }}
+
+      # Write the digest to a file and upload it as an artifact so the
+      # merge job can stitch both per-arch digests into a manifest list.
+      - name: Export digest
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
+        run: |
+          mkdir -p /tmp/digests
+          digest="${{ steps.push.outputs.digest }}"
+          touch "/tmp/digests/${digest#sha256:}"
+
+      - name: Upload digest artifact
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        with:
+          name: digest-${{ matrix.arch }}
+          path: /tmp/digests/*
+          if-no-files-found: error
+          retention-days: 1
+
+      # Run the docker-integration test suite against the freshly-built
+      # image already loaded into the local daemon (`:test`).
+      #
+      # Piggybacking here avoids a second image build: the build step
+      # already loaded the image into the daemon under
+      # `${IMAGE_NAME}:test`, so we just point ``HERMES_TEST_IMAGE`` at
+      # that.  The fixture's ``HERMES_TEST_IMAGE`` branch (see
+      # tests/docker/conftest.py:62-63) short-circuits the rebuild.
+      #
+      # Why this job and not a standalone one: the image is 5GB+; passing
+      # it between jobs via ``docker save``/``upload-artifact`` is slower
+      # than the build itself.  Reusing the existing daemon state is the
+      # cheapest path to coverage on every PR that touches docker code.
+      # ---------------------------------------------------------------------
+      - name: Install uv (for docker tests)
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+
+      - name: Set up Python 3.11 (for docker tests)
+        run: uv python install 3.11
+
+      - name: Install Python dependencies (for docker tests)
+        run: |
+          # ``dev`` extra pulls in pytest, pytest-asyncio —
+          # everything tests/docker/ needs.  We deliberately avoid ``all``
+          # here because the docker tests only drive the container via
+          # subprocess and don't import hermes_agent's optional deps.
+          uv sync --locked --python 3.11 --extra dev
+
+      - name: Run docker integration tests
+        env:
+          # Skip rebuild; use the image already loaded by the build step.
+          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
+          # Match the policy in tests.yml :: test job — no accidental
+          # real-API calls from inside the harness.
+          OPENROUTER_API_KEY: ""
+          OPENAI_API_KEY: ""
+          NOUS_API_KEY: ""
+        run: |
+          scripts/run_tests.sh tests/docker/ --file-timeout 600
+
+  # ---------------------------------------------------------------------------
+  # Stitch both per-arch digests into a single tagged multi-arch manifest.
+  # This is a registry-side operation — no building, no layer re-push —
+  # so it runs in ~30 seconds.
+  #
+  # On main pushes: tags both :main and :latest.
+  # On releases: tags :<release_tag_name>.
+  # ---------------------------------------------------------------------------
+  merge:
+    if: github.repository == 'NousResearch/hermes-agent' && (github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release')
+    runs-on: ubuntu-latest
+    needs: [build]
+    timeout-minutes: 10
+    steps:
+      - name: Download digests
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        with:
+          path: /tmp/digests
+          pattern: digest-*
+          merge-multiple: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Create manifest list and push
+        working-directory: /tmp/digests
+        run: |
+          set -euo pipefail
+          args=()
+          for digest_file in *; do
+            args+=("${IMAGE_NAME}@sha256:${digest_file}")
+          done
+          if [ "${{ github.event_name }}" = "release" ]; then
+            TAG="${{ github.event.release.tag_name }}"
+            docker buildx imagetools create \
+              -t "${IMAGE_NAME}:${TAG}" \
+              "${args[@]}"
+          else
+            docker buildx imagetools create \
+              -t "${IMAGE_NAME}:main" \
+              -t "${IMAGE_NAME}:latest" \
+              "${args[@]}"
+          fi
+        env:
+          IMAGE_NAME: ${{ env.IMAGE_NAME }}
+
+      - name: Inspect image
+        run: |
+          if [ "${{ github.event_name }}" = "release" ]; then
+            docker buildx imagetools inspect "${IMAGE_NAME}:${{ github.event.release.tag_name }}"
+          else
+            docker buildx imagetools inspect "${IMAGE_NAME}:main"
+          fi
+        env:
+          IMAGE_NAME: ${{ env.IMAGE_NAME }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -37,7 +37,7 @@ jobs:
          fetch-depth: 0 # need full history for merge-base + worktree

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Install ruff + ty
        uses: ./.github/actions/retry
@@ -109,46 +109,6 @@ jobs:
            --output    .lint-reports/summary.md
          cat .lint-reports/summary.md >> "$GITHUB_STEP_SUMMARY"

-      - name: Upload reports as artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
-        with:
-          name: lint-reports
-          path: .lint-reports/
-          retention-days: 14
-
-      - name: Post / update PR comment
-        if: inputs.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
-        continue-on-error: true
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7
-        with:
-          script: |
-            const fs = require('fs');
-            const body = fs.readFileSync('.lint-reports/summary.md', 'utf8');
-            const marker = '<!-- lint-diff-summary -->';
-            const fullBody = marker + '\n' + body;
-
-            const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo:  context.repo.repo,
-              issue_number: context.issue.number,
-            });
-            const existing = comments.find(c => c.body && c.body.includes(marker));
-            if (existing) {
-              await github.rest.issues.updateComment({
-                owner: context.repo.owner,
-                repo:  context.repo.repo,
-                comment_id: existing.id,
-                body: fullBody,
-              });
-            } else {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo:  context.repo.repo,
-                issue_number: context.issue.number,
-                body: fullBody,
-              });
-            }
-
  ruff-blocking:
    # Enforce the rules in pyproject.toml [tool.ruff.lint.select]. Currently
    # PLW1514 (unspecified-encoding) — catches bare ``open()`` /
@@ -164,7 +124,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Install ruff
        uses: ./.github/actions/retry
--- a/.github/workflows/skills-index.yml
+++ b/.github/workflows/skills-index.yml
@@ -3,17 +3,17 @@ name: Build Skills Index
 on:
  schedule:
    # Run twice daily: 6 AM and 6 PM UTC
-    - cron: '0 6,18 * * *'
-  workflow_dispatch:  # Manual trigger
+    - cron: "0 6,18 * * *"
+  workflow_dispatch: # Manual trigger
  push:
    branches: [main]
    paths:
-      - 'scripts/build_skills_index.py'
-      - '.github/workflows/skills-index.yml'
+      - "scripts/build_skills_index.py"
+      - ".github/workflows/skills-index.yml"

 permissions:
  contents: read
-  actions: write   # to trigger deploy-site.yml on schedule
+  actions: write # to trigger deploy-site.yml on schedule

 jobs:
  build-index:
@@ -21,11 +21,11 @@ jobs:
    if: github.repository == 'NousResearch/hermes-agent'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
-          python-version: '3.11'
+          python-version: "3.11"

      - name: Install dependencies
        run: pip install httpx==0.28.1 pyyaml==6.0.2
@@ -36,7 +36,7 @@ jobs:
        run: python scripts/build_skills_index.py

      - name: Upload index artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: skills-index
          path: website/static/api/skills-index.json
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,6 +2,11 @@ name: Tests

 on:
  workflow_call:
+    inputs:
+      slice_count:
+        description: Number of parallel test slices
+        type: number
+        default: 8

 permissions:
  contents: read
@@ -12,13 +17,11 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  test:
+  generate:
+    name: "Generate slices"
    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        slice: [1, 2, 3, 4, 5, 6]
+    outputs:
+      matrix: ${{ steps.matrix.outputs.matrix }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -27,13 +30,26 @@ jobs:
        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: test_durations.json
-          # main always writes a new suffix, but jobs pick the latest one with the same prefix
-          # quote from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses
-          # If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys.
-          # If there are no exact matches, the action searches for partial matches of the restore keys.
-          # When the action finds a partial match, the most recent cache is restored to the path directory.
          key: test-durations

+      - name: Generate test slices
+        id: matrix
+        run: |
+          MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }})
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  test:
+    name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}
+    needs: generate
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.generate.outputs.matrix) }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
      - name: Install ripgrep (prebuilt binary)
        run: |
          set -euo pipefail
@@ -49,7 +65,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
@@ -78,33 +94,19 @@ jobs:
        # re-download, keeping the persisted cache small and fast to restore.
        run: uv cache prune --ci

-      - name: Run tests (slice ${{ matrix.slice }}/6)
-        # Per-file isolation via scripts/run_tests_parallel.py: discovers
-        # every test_*.py file under tests/ (excluding integration/ + e2e/),
-        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
+      - name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }})
+        # Per-file isolation via scripts/run_tests.sh: each test file runs
+        # in its own freshly-spawned `python -m pytest <file>` subprocess
        # with bounded parallelism. No xdist, no shared workers, no
        # module-level state leakage between files.
        #
-        # Why per-file (not per-test): per-test spawn cost (~250ms × 17k
-        # tests = 70min CPU minimum) blew the wall-clock budget. Per-file
-        # spawn (~250ms × ~850 files = ~3.5min) fits while still giving
-        # every file a fresh interpreter — the only isolation boundary
-        # that matters in practice (cross-file leakage was the original
-        # flake source; intra-file is the test author's responsibility).
-        #
-        # Why drop xdist entirely: xdist's persistent workers accumulate
-        # state across files, which is exactly the leakage we wanted to
-        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
-        # the job with cleaner semantics.
-        #
-        # Matrix slicing (--slice I/N): files are distributed across 6
-        # jobs by cached duration (LPT algorithm) so each job gets
-        # roughly equal wall time. Without a cache, files default to 2s
-        # estimate and get split roughly evenly by count — still correct,
-        # just not perfectly balanced.
+        # File list is pre-computed by the generate job (--generate-slices)
+        # which runs LPT distribution once and passes the file list to each
+        # matrix job via --files. Previously each job re-discovered files and
+        # re-ran LPT independently — redundant N times.
        run: |
          source .venv/bin/activate
-          python scripts/run_tests_parallel.py --slice ${{ matrix.slice }}/6
+          scripts/run_tests.sh --files '${{ matrix.slice.files }}'
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@@ -114,7 +116,7 @@ jobs:
      - name: Upload per-slice durations
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: test-durations-slice-${{ matrix.slice }}
+          name: test-durations-slice-${{ matrix.slice.index }}
          path: test_durations.json
          retention-days: 1

@@ -173,7 +175,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
--- a/.github/workflows/typecheck.yml
+++ b/.github/workflows/typecheck.yml
@@ -6,6 +6,7 @@ on:

 jobs:
  typecheck:
+    name: Check TypeScript
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -22,8 +23,7 @@ jobs:
      # native builds. Skipping install scripts drops node-pty's node-gyp
      # header fetch — the transient flake that killed this job pre-`tsc` — and
      # is faster. retry covers the remaining registry blips.
-      - 
-        uses: ./.github/actions/retry
+      - uses: ./.github/actions/retry
        with:
          command: npm ci --ignore-scripts
      - run: npm run --prefix ${{ matrix.package }} typecheck
@@ -35,6 +35,7 @@ jobs:
  # users build apps/desktop from source on install/update. Run the real
  # `vite build` here so that class of break fails in CI instead.
  desktop-build:
+    name: Build desktop app
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -44,8 +45,7 @@ jobs:
          cache: npm
      # Keep install scripts here: the production build may need node-pty's
      # native binary. retry handles the transient install-time fetch flakes.
-      - 
-        uses: ./.github/actions/retry
+      - uses: ./.github/actions/retry
        with:
          command: npm ci
      - run: npm run --prefix apps/desktop build
--- a/.github/workflows/upload_to_pypi.yml
+++ b/.github/workflows/upload_to_pypi.yml
@@ -5,11 +5,11 @@ name: Publish to PyPI
 on:
  push:
    tags:
-      - 'v20*'  # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
+      - "v20*" # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
  workflow_dispatch:
    inputs:
      confirm_tag:
-        description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
+        description: "Tag to publish (e.g. v2026.5.15). Must already exist."
        required: true
        type: string

@@ -27,7 +27,7 @@ jobs:
    name: Build distribution 📦
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
          # On workflow_dispatch, check out the confirmed tag.
@@ -43,17 +43,17 @@ jobs:
          fi

      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
-          python-version: '3.13'
+          python-version: "3.13"

      - name: Install uv
-        uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e  # v6
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
        with:
-          node-version: '22'
+          node-version: "22"

      - name: Build web dashboard
        run: cd web && npm ci && npm run build
@@ -81,7 +81,7 @@ jobs:
        run: uv build --sdist --wheel

      - name: Upload distribution artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: python-package-distributions
          path: dist/
@@ -94,17 +94,17 @@ jobs:
      name: pypi
      url: https://pypi.org/p/hermes-agent
    permissions:
-      id-token: write  # OIDC trusted publishing
+      id-token: write # OIDC trusted publishing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b  # v1.14.0
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
        with:
          skip-existing: true

@@ -116,12 +116,12 @@ jobs:
    needs: publish
    runs-on: ubuntu-latest
    permissions:
-      contents: write   # attach assets to the existing release
-      id-token: write   # sigstore signing
+      contents: write # attach assets to the existing release
+      id-token: write # sigstore signing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -145,7 +145,7 @@ jobs:

      - name: Sign with Sigstore
        if: env.skip_sign != 'true'
-        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc  # v3.3.0
+        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc # v3.3.0
        with:
          inputs: >-
            ./dist/*.tar.gz
--- a/.github/workflows/uv-lockfile-check.yml
+++ b/.github/workflows/uv-lockfile-check.yml
@@ -4,7 +4,7 @@ name: uv.lock check
 # that modify pyproject.toml without regenerating uv.lock (or vice versa)
 # must not merge, because the Docker build's `uv sync --frozen` step will
 # fail on a stale lockfile and we'd rather catch it here than in the
-# docker-publish workflow on main.
+# docker workflow on main.
 #
 # ─────────────────────────────────────────────────────────────────────────
 # IMPORTANT: this check runs against the MERGED state, not just your branch
@@ -63,7 +63,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      # `uv lock --check` re-resolves the project from pyproject.toml and
      # compares the result to uv.lock, exiting non-zero if they disagree.
@@ -100,7 +100,7 @@ jobs:

          This check is blocking because the Docker image build uses
          `uv sync --frozen --extra all`, which rejects stale lockfiles
-          — catching it here avoids a ~15 min failed docker-publish run
+          — catching it here avoids a ~15 min failed docker run
          on `main` post-merge.
          EOF
            echo "::error title=uv.lock out of sync::Run \`uv lock\` locally and commit the result. If on a PR, sync with main first."
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,9 @@ RELEASE_v*.md
 # Desktop demo-run scratch output (hermes writes demo/*.txt during recorded
 # walkthroughs). Throwaway artifacts, never part of the app.
 apps/desktop/demo/
+
+# PR infographics are rendered locally and embedded in PR descriptions via the
+# image-provider (fal.media) URL — they are NEVER committed to the repo. The
+# PR body is the archive. See the hermes-agent-dev skill's
+# pr-infographic-workflow reference (storage rule + lapse #8 / #COMMIT-1).
+infographic/
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -123,6 +123,17 @@ conservative at the waist.
  without E2E proof, and plugins that touch core files.** Plugins live in their
  own directory and work within the ABCs/hooks we provide; if a plugin needs
  more, widen the generic plugin surface, don't special-case it in core.
+- **Third-party products / other people's projects integrated into the core
+  tree.** Observability backends, vendor SaaS integrations, analytics dashboards,
+  and similar "someone else's product" plugins do NOT land under `plugins/` in
+  this repo. They place an ongoing maintenance burden on us to keep them working
+  against a fast-moving core, for a backend we don't own. Ship them as a
+  **standalone plugin repo** users install into `~/.hermes/plugins/` (or via a
+  pip entry point), and promote them in the Nous Research Discord
+  (`#plugins-skills-and-skins`). This is a coupling-and-maintenance decision, not
+  a quality bar — the plugin can be excellent and still be closed. PRs that add
+  such a directory to the tree are closed with a pointer to publish it as its own
+  repo.

 ### Before you call it a bug — verify the premise (and when NOT to close)

@@ -480,7 +491,7 @@ The dashboard embeds the real `hermes --tui` — **not** a rewrite.  See `hermes

 ### Electron Desktop Chat App (`apps/desktop/`)

-A **separate** chat surface from both the classic CLI and the dashboard's embedded TUI. It is an Electron + React + nanostore renderer (`@assistant-ui/react`) that talks to a `tui_gateway` backend over JSON-RPC (`requestGateway(method, params)`). It does NOT embed `hermes --tui` — it has its own composer, transcript, and slash-command pipeline. Route desktop bugs to the `hermes-desktop-app-work` skill, not `hermes-dashboard-work`.
+A **separate** chat surface from both the classic CLI and the dashboard's embedded TUI. It is an Electron + React + nanostore renderer (`@assistant-ui/react`) that talks to a `tui_gateway` backend over JSON-RPC (`requestGateway(method, params)`). The WebSocket/JSON-RPC transport lives in the framework-agnostic `apps/shared` package (`@hermes/shared` — `JsonRpcGatewayClient` + WS URL helpers), which the web dashboard (`web/`) also consumes; **desktop has no build/runtime dependency on the dashboard frontend** — it spawns a headless `hermes serve` backend server (the same gateway `dashboard` serves, minus the browser UI). `dashboard` and `serve` share `cmd_dashboard`/`start_server` but are independent surfaces — neither launches the other. The one exception is a backward-compat *fallback*: `serve` is newer, so the desktop spawn (`electron/backend-command.cjs` + `backendSupportsServe()` in `main.cjs`) detects whether the resolved runtime registers `serve` and, only when it does not (an older managed install / PATH `hermes` the app hasn't updated yet), rewrites the argv to the legacy `dashboard --no-open`. Without that, a new app against an un-upgraded runtime would crash on an unknown subcommand and brick every mid-upgrade user. It does NOT embed `hermes --tui` — it has its own composer, transcript, and slash-command pipeline. Route desktop bugs to the `hermes-desktop-app-work` skill, not `hermes-dashboard-work`.

 **Slash commands in the desktop app are curated client-side, then dispatched to the backend.** The pipeline:

@@ -783,6 +794,24 @@ landing in this tree. PRs that add a new directory under
 provider as its own repo. Existing in-tree providers stay; bug fixes
 to them are welcome.

+**No new third-party-product plugins in-tree (policy, June 2026):** the
+same rule applies beyond memory providers. Plugins that integrate
+someone else's product or project — observability/metrics backends,
+vendor SaaS connectors, analytics dashboards, paid-service tie-ins —
+must ship as **standalone plugin repos** that users install into
+`~/.hermes/plugins/` (or via pip entry points). They register through
+the existing plugin discovery path and use the ABCs/hooks/ctx surface
+we expose; nothing special is needed in core. The reason is
+maintenance load: every product we absorb into the tree becomes our
+burden to keep working against a fast-moving core, for a backend we
+don't own. Promote standalone plugins in the Nous Research Discord
+(`#plugins-skills-and-skins`). PRs that add such a directory under
+`plugins/` are closed with a pointer to publish it as its own repo —
+this is a coupling decision, not a quality judgment. (The
+`observability/`, `kanban/`, `disk-cleanup/`, etc. directories already
+in the tree are existing precedent, not an invitation to add more
+third-party-product plugins alongside them.)
+
 ### Model-provider plugins (`plugins/model-providers/<name>/`)

 Every inference backend (openrouter, anthropic, gmi, deepseek, nvidia, …)
@@ -1260,65 +1289,22 @@ scripts/run_tests.sh                                  # full suite, CI-parity
 scripts/run_tests.sh tests/gateway/                   # one directory
 scripts/run_tests.sh tests/agent/test_foo.py::test_x  # one test
 scripts/run_tests.sh -v --tb=long                     # pass-through pytest flags
-scripts/run_tests.sh --no-isolate tests/foo/          # disable subprocess isolation (faster, for debugging)
 ```

-### Subprocess-per-test isolation
+### Subprocess-per-test-file isolation

-Every test runs in a freshly-spawned Python subprocess via the in-tree plugin
-at `tests/_isolate_plugin.py`. This means module-level dicts/sets and
-ContextVars from one test cannot leak into the next — the historic
-`_reset_module_state` autouse fixture is gone.
+Every test file runs in a freshly-spawned Python subprocess via `run_tests_parallel.py`. This means module-level dicts/sets and
+ContextVars from one test file cannot leak into the next.

-Implementation notes:
+### Why the wrapper

- The plugin uses `multiprocessing.get_context("spawn")`, which works on
-  Linux, macOS, and Windows alike (POSIX `fork` is not used).
- Per-test overhead is ~0.5–1.0s (Python startup + pytest collection). xdist
-  parallelism amortizes this across cores; on a 20-core box the full suite
-  finishes in roughly the same wall time as before, but flake-free.
- `isolate_timeout` (configured in `pyproject.toml`) caps each test at 30s.
-  Hangs are killed and surfaced as a failure report.
- Pass `--no-isolate` to disable isolation — useful when debugging a single
-  test interactively, or when you specifically want to verify state leakage.
- The plugin disables itself in child processes (sentinel envvar
-  `HERMES_ISOLATE_CHILD=1`), so there's no fork-bomb risk.
+|                     | Without wrapper                             | With wrapper                              |
+| ------------------- | ------------------------------------------- | ----------------------------------------- |
+| Provider API keys   | Whatever is in your env (auto-detects pool) | All env vars unset except a specific few. |
+| HOME / `~/.hermes/` | Your real config+auth.json                  | Temp dir per test                         |
+| Timezone            | Local TZ (PDT etc.)                         | UTC                                       |
+| Locale              | Whatever is set                             | C.UTF-8                                   |

-### Why the wrapper (and why the old "just call pytest" doesn't work)
-
-Five real sources of local-vs-CI drift the script closes:
-
-| | Without wrapper | With wrapper |
-|---|---|---|
-| Provider API keys | Whatever is in your env (auto-detects pool) | All `*_API_KEY`/`*_TOKEN`/etc. unset |
-| HOME / `~/.hermes/` | Your real config+auth.json | Temp dir per test |
-| Timezone | Local TZ (PDT etc.) | UTC |
-| Locale | Whatever is set | C.UTF-8 |
-| xdist workers | `-n auto` = all cores | `-n auto` (safe — subprocess isolation prevents cross-worker flakes) |
-
-`tests/conftest.py` also enforces points 1-4 as an autouse fixture so ANY pytest
-invocation (including IDE integrations) gets hermetic behavior — but the wrapper
-is belt-and-suspenders.
-
-### Running without the wrapper (only if you must)
-
-If you can't use the wrapper (e.g. inside an IDE that shells pytest directly),
-at minimum activate the venv. The isolation plugin loads automatically from
-`addopts` in `pyproject.toml`, so you get the same per-test process isolation
-either way.
-
-```bash
-source .venv/bin/activate   # or: source venv/bin/activate
-python -m pytest tests/ -q
-```
-
-If you need to bypass isolation for fast feedback while debugging:
-
-```bash
-python -m pytest tests/agent/test_foo.py -q --no-isolate
-```
-
-Always run the full suite before pushing changes.

 ### Don't write change-detector tests

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -85,6 +85,23 @@ This isn't a quality bar — it's a coupling-and-maintenance decision. Memory pr

 ---

+## Third-Party Product Integrations: Ship as a Standalone Plugin
+
+The same rule extends to **any plugin that integrates someone else's product or project** — observability/metrics backends, vendor SaaS connectors, analytics dashboards, paid-service tie-ins, and similar third-party integrations. **These do not land in this repo.**
+
+The reason is maintenance load, not quality. Every external product absorbed into the core tree becomes ours to keep working against a fast-moving codebase, for a backend we don't own and can't control. Hermes ships a lot and the core moves quickly; coupling third-party products into it creates an open-ended burden on the maintainers.
+
+Publish these as a **standalone plugin repo** instead:
+
+- Implement the relevant ABC and use the existing plugin discovery path (`~/.hermes/plugins/`, project `.hermes/plugins/`, or a pip entry point) — see [Build a Hermes Plugin](https://hermes-agent.nousresearch.com/docs/guides/build-a-hermes-plugin)
+- Register lifecycle hooks (`pre_tool_call`, `post_tool_call`, `pre_llm_call`, `post_llm_call`, `on_session_start`, `on_session_end`), tools (`ctx.register_tool`), and CLI subcommands (`ctx.register_cli_command`) through the surface we already expose — no core changes needed
+- If your plugin needs a capability the framework doesn't expose, that's a feature request to **widen the generic plugin surface** (a new hook or `ctx` method) — never special-case your plugin in core
+- Promote it in the [Nous Research Discord](https://discord.gg/NousResearch) `#plugins-skills-and-skins` channel so users can find and install it
+
+A well-built third-party-product plugin can clear automated review and still be closed for this reason — it's a placement decision, not a verdict on the code. PRs that add such a directory under `plugins/` will be closed with a pointer to publish it as its own repo.
+
+---
+
 ## Development Setup

 ### Prerequisites
@@ -132,13 +149,20 @@ this way, make sure you run the `hermes` entrypoint from this venv; running the
 system `python3 -m hermes_cli.main` can pick up unrelated system Python
 packages.

+Create the venv **outside** the cloned source tree. A venv that lives inside
+the directory the agent operates from can be wiped by a relative-path command
+the agent runs against its own checkout (`rm -rf venv`, `uv venv venv`, etc.),
+which silently destroys the running runtime mid-session. Keeping it outside the
+tree means no relative path from the workspace resolves to it.
+
 ```bash
 git clone https://github.com/NousResearch/hermes-agent.git
 cd hermes-agent

-# Create venv with Python 3.11
-uv venv venv --python 3.11
-export VIRTUAL_ENV="$(pwd)/venv"
+# Create venv with Python 3.11, OUTSIDE the source tree
+uv venv ~/.hermes/venvs/hermes-dev --python 3.11
+export VIRTUAL_ENV="$HOME/.hermes/venvs/hermes-dev"
+export PATH="$VIRTUAL_ENV/bin:$PATH"

 # Install with all extras (messaging, cron, CLI menus, dev tools)
 uv pip install -e ".[all,dev]"
--- a/Dockerfile
+++ b/Dockerfile
@@ -119,6 +119,9 @@ COPY package.json package-lock.json ./
 COPY web/package.json web/
 COPY ui-tui/package.json ui-tui/
 COPY ui-tui/packages/hermes-ink/ ui-tui/packages/hermes-ink/
+# apps/shared/ is copied IN FULL because web/package.json references it as a
+# `file:` workspace dependency (same pattern as hermes-ink above).
+COPY apps/shared/ apps/shared/

 # `npm_config_install_links=false` forces npm to install `file:` deps as
 # symlinks instead of copies.  This is the default since npm 10+, which is
@@ -184,12 +187,19 @@ RUN uv sync --frozen --no-install-project --extra all --extra messaging --extra
 # invalidate the (relatively slow) web + ui-tui build layer.
 COPY web/ web/
 COPY ui-tui/ ui-tui/
+COPY apps/shared/ apps/shared/
 RUN cd web && npm run build && \
    cd ../ui-tui && npm run build

 # ---------- Source code ----------
 # .dockerignore excludes node_modules, so the installs above survive.
-COPY . .
+# --link decouples this layer from parents for cache purposes; --chmod bakes
+# the final read-only permissions at copy time so we skip the separate
+# `chmod -R` pass that previously walked ~30k files across the venv +
+# node_modules + source (21s amd64 / 222s arm64 — #49113).  `a+rX,go-w`
+# gives the non-root hermes user read + traverse but no write; root retains
+# write so the build steps below don't need chmod u+w dances.
+COPY --link --chmod=a+rX,go-w . .

 # ---------- Permissions ----------
 # Link hermes-agent itself (editable). Deps are already installed in the
@@ -197,19 +207,15 @@ COPY . .
 # resolution or downloads.
 RUN uv pip install --no-cache-dir --no-deps -e "."

-# Keep /opt/hermes immutable for the runtime hermes user. Hosted/container
-# instances must not be able to self-edit the installed source or venv; user
-# data, skills, plugins, config, logs, and dashboard uploads live under
-# /opt/data instead. Root can still repair the image during build/boot, but
-# supervised Hermes processes drop to the non-root hermes user.
+# Wire the exec shim and install-method stamp.  Files under /opt/hermes are
+# already root-owned (COPY, uv sync, npm install all run as root) and
+# read-only for the hermes user (go-w from the --chmod above).
+
 USER root
 RUN mkdir -p /opt/hermes/bin && \
    cp /opt/hermes/docker/hermes-exec-shim.sh /opt/hermes/bin/hermes && \
    chmod 0755 /opt/hermes/bin/hermes && \
-    printf 'docker\n' > /opt/hermes/.install_method && \
-    chown -R root:root /opt/hermes && \
-    chmod -R a+rX /opt/hermes && \
-    chmod -R a-w /opt/hermes
+    printf 'docker\n' > /opt/hermes/.install_method
 # The ``.install_method`` stamp is baked next to the running code (the install
 # tree), NOT into $HERMES_HOME. $HERMES_HOME (/opt/data) is a shared data
 # volume that is commonly bind-mounted from the host and even shared with a
@@ -236,13 +242,11 @@ RUN mkdir -p /opt/hermes/bin && \
 #
 # The arg is optional — local `docker build` without --build-arg simply
 # omits the file, and the runtime falls back to live-git lookup.  CI
-# (.github/workflows/docker-publish.yml) passes ${{ github.sha }} so
+# (.github/workflows/docker.yml) passes ${{ github.sha }} so
 # every published image has it.
 ARG HERMES_GIT_SHA=
 RUN if [ -n "${HERMES_GIT_SHA}" ]; then \
-        chmod u+w /opt/hermes && \
-        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha && \
-        chmod a-w /opt/hermes /opt/hermes/.hermes_build_sha; \
+        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha; \
    fi

 # ---------- s6-overlay service wiring ----------
@@ -290,6 +294,19 @@ ENV HERMES_TUI_DIR=/opt/hermes/ui-tui
 ENV HERMES_HOME=/opt/data
 ENV HERMES_WRITE_SAFE_ROOT=/opt/data
 ENV HERMES_DISABLE_LAZY_INSTALLS=1
+# The published image seals /opt/hermes (root-owned, read-only) so a runtime
+# lazy install can't mutate the agent's own venv and brick it. But opt-in
+# backends (Firecrawl web search, Exa, Feishu, …) keep their SDKs in
+# tools/lazy_deps.py — deliberately NOT baked into [all] (see pyproject.toml
+# policy 2026-05-12: one quarantined release must not break every install).
+# Redirect those lazy installs to a writable dir on the durable data volume.
+# lazy_deps appends this dir to the END of sys.path, so a package installed
+# here can only ADD modules — it can never shadow or downgrade a core module,
+# so the sealed-venv guarantee holds even with installs re-enabled. The dir
+# is seeded + chowned to the hermes user by docker/stage2-hook.sh and lives
+# on the /opt/data volume, so it persists across container recreates / image
+# updates (an ABI stamp invalidates it if a rebuild bumps the interpreter).
+ENV HERMES_LAZY_INSTALL_TARGET=/opt/data/lazy-packages

 # `docker exec` privilege-drop shim. When operators run
 # `docker exec <c> hermes ...` they default to root, and any file the
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@

 **The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.

-Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [NovitaAI](https://novita.ai) (AI-native cloud for Model API, Agent Sandbox, and GPU Cloud), [NVIDIA NIM](https://build.nvidia.com) (Nemotron), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
+Use any model you want — [Nous Portal](https://portal.nousresearch.com), OpenRouter, OpenAI, your own endpoint, and [many others](https://hermes-agent.nousresearch.com/docs/integrations/providers). Switch with `hermes model` — no code changes, no lock-in.

 <table>
 <tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
@@ -232,10 +232,14 @@ scripts/run_tests.sh
 Manual clone fallback (for throwaway clones/CI where you intentionally do not
 want the managed install layout):

+Create the venv outside the cloned source tree — a venv inside the directory
+the agent operates from can be wiped by a relative-path command the agent runs
+against its own checkout, destroying the active runtime mid-session.
+
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
-uv venv .venv --python 3.11
-source .venv/bin/activate
+uv venv ~/.hermes/venvs/hermes-dev --python 3.11
+source ~/.hermes/venvs/hermes-dev/bin/activate
 uv pip install -e ".[all,dev]"
 scripts/run_tests.sh
 ```
--- a/acp_adapter/entry.py
+++ b/acp_adapter/entry.py
@@ -23,6 +23,11 @@ except ModuleNotFoundError:
    # new code but ``uv pip install -e .`` didn't finish.  Missing bootstrap
    # means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
    pass
+else:
+    # Stop a ``utils/``/``proxy/``/``ui/`` package in the launch directory from
+    # shadowing Hermes's own modules — ``hermes acp`` can be started from any
+    # cwd, including a project that has same-named packages on its path.
+    hermes_bootstrap.harden_import_path()

 import argparse
 import asyncio
--- a/acp_adapter/tools.py
+++ b/acp_adapter/tools.py
@@ -74,7 +74,7 @@ _POLISHED_TOOLS = {
    "kanban_create", "kanban_show", "kanban_comment", "kanban_complete",
    "kanban_block", "kanban_link", "kanban_heartbeat",
    "yb_query_group_info", "yb_query_group_members", "yb_search_sticker",
-    "yb_send_dm", "yb_send_sticker", "mixture_of_agents",
+    "yb_send_dm", "yb_send_sticker",
 }


--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -106,7 +106,12 @@ def _custom_provider_extra_body_for_agent(
    base_url: str,
    custom_providers: List[Dict[str, Any]],
 ) -> Optional[Dict[str, Any]]:
-    if (provider or "").strip().lower() != "custom":
+    provider_norm = (provider or "").strip().lower()
+    if provider_norm == "custom":
+        provider_key_filter = ""
+    elif provider_norm.startswith("custom:"):
+        provider_key_filter = provider_norm.split(":", 1)[1].strip()
+    else:
        return None

    target_url = _normalized_custom_base_url(base_url)
@@ -117,6 +122,13 @@ def _custom_provider_extra_body_for_agent(
    for entry in custom_providers or []:
        if not isinstance(entry, dict):
            continue
+        if provider_key_filter:
+            entry_keys = {
+                str(entry.get("provider_key", "") or "").strip().lower(),
+                str(entry.get("name", "") or "").strip().lower(),
+            }
+            if provider_key_filter not in entry_keys:
+                continue
        if _normalized_custom_base_url(entry.get("base_url")) != target_url:
            continue
        extra_body = entry.get("extra_body")
@@ -707,6 +719,55 @@ def init_agent(
                    print("🔑 Using credentials: Microsoft Entra ID")
                elif isinstance(effective_key, str) and len(effective_key) > 12:
                    print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}")
+    elif agent.provider == "moa":
+        from agent.moa_loop import MoAClient
+        agent.api_mode = "chat_completions"
+
+        # Route reference-model outputs to the agent's tool_progress_callback so
+        # every surface that already consumes it (CLI spinner/scrollback, TUI,
+        # desktop, gateway) can show each reference's answer as a labelled block
+        # before the aggregator acts. The facade emits "moa.reference" and
+        # "moa.aggregating" events; we forward them through the same callback
+        # the tool lifecycle uses. Best-effort and cache-safe — these are
+        # display-only events, they never touch the message history.
+        def _moa_reference_relay(event: str, **kwargs: Any) -> None:
+            cb = getattr(agent, "tool_progress_callback", None)
+            if cb is None:
+                return
+            try:
+                if event == "moa.reference":
+                    label = str(kwargs.get("label") or "")
+                    text = str(kwargs.get("text") or "")
+                    idx = kwargs.get("index")
+                    count = kwargs.get("count")
+                    cb(
+                        "moa.reference",
+                        label,
+                        text,
+                        None,
+                        moa_index=idx,
+                        moa_count=count,
+                    )
+                elif event == "moa.aggregating":
+                    cb(
+                        "moa.aggregating",
+                        str(kwargs.get("aggregator") or ""),
+                        None,
+                        None,
+                        moa_ref_count=kwargs.get("ref_count"),
+                    )
+            except Exception:
+                pass
+
+        agent.client = MoAClient(
+            agent.model or "default",
+            reference_callback=_moa_reference_relay,
+        )
+        agent._client_kwargs = {}
+        agent.api_key = api_key or "moa-virtual-provider"
+        agent.base_url = "moa://local"
+        if not agent.quiet_mode:
+            print(f"🤖 AI Agent initialized with MoA preset: {agent.model}")
    elif agent.api_mode == "bedrock_converse":
        # AWS Bedrock — uses boto3 directly, no OpenAI client needed.
        # Region is extracted from the base_url or defaults to us-east-1.
@@ -1246,6 +1307,12 @@ def init_agent(
        _agent_section = {}
    agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")

+    # Intent-ack continuation config: "auto" (default — codex_responses only,
+    # the historical gate), true (all api_modes), false (never), or a list of
+    # model-name substrings.  Resolved against the active api_mode/model in the
+    # conversation loop's intent-ack block.
+    agent._intent_ack_continuation = _agent_section.get("intent_ack_continuation", "auto")
+
    # Universal task-completion guidance toggle.  Default True.  Surfaced
    # as a separate flag from tool_use_enforcement because the guidance
    # applies to ALL models, not just the model families enforcement
@@ -1506,6 +1573,7 @@ def init_agent(
    # 3. Check general plugin system (user-installed plugins)
    # 4. Fall back to built-in ContextCompressor
    _selected_engine = None
+    _copy_failed = False
    _engine_name = "compressor"  # default
    try:
        _ctx_cfg = _agent_cfg.get("context", {}) if isinstance(_agent_cfg, dict) else {}
@@ -1523,15 +1591,35 @@ def init_agent(

        # Try general plugin system as fallback
        if _selected_engine is None:
+            _candidate = None
            try:
                from hermes_cli.plugins import get_plugin_context_engine
                _candidate = get_plugin_context_engine()
-                if _candidate and _candidate.name == _engine_name:
-                    _selected_engine = _candidate
            except Exception:
-                pass
+                _candidate = None
+            if _candidate is not None and _candidate.name == _engine_name:
+                # Deep-copy the shared plugin singleton so a child agent's
+                # update_model() can't mutate the parent's compressor (#42449).
+                # Copy can fail for engines holding uncopyable state (locks, DB
+                # connections, clients); in that case fall back to the built-in
+                # compressor with an ACCURATE message rather than silently
+                # mislabelling it "not found".
+                import copy
+                try:
+                    _selected_engine = copy.deepcopy(_candidate)
+                except Exception as _copy_err:
+                    _copy_failed = True
+                    _ra().logger.warning(
+                        "Context engine '%s' could not be safely copied for this "
+                        "agent (%s) — falling back to built-in compressor. Plugin "
+                        "engines that hold uncopyable state (locks, DB connections) "
+                        "should implement __deepcopy__ to copy only mutable budget "
+                        "state.",
+                        _engine_name, _copy_err,
+                    )
+                    _selected_engine = None

-        if _selected_engine is None:
+        if _selected_engine is None and not _copy_failed:
            _ra().logger.warning(
                "Context engine '%s' not found — falling back to built-in compressor",
                _engine_name,
@@ -1588,8 +1676,10 @@ def init_agent(
            f"Model {agent.model} has a context window of {_ctx:,} tokens, "
            f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
            f"by Hermes Agent.  Choose a model with at least "
-            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
-            f"model.context_length in config.yaml to override."
+            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context.  If your server "
+            f"reports a window smaller than the model's true window, set "
+            f"model.context_length in config.yaml to the real value "
+            f"(this must be at least {MINIMUM_CONTEXT_LENGTH // 1000}K)."
        )

    # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
@@ -1621,16 +1711,27 @@ def init_agent(
            for t in agent.tools
            if isinstance(t, dict)
        }
-        for _schema in agent.context_compressor.get_tool_schemas():
-            _tname = _schema.get("name", "")
-            if _tname and _tname in _existing_tool_names:
+        from agent.memory_manager import normalize_tool_schema as _normalize_tool_schema
+        for _raw_schema in agent.context_compressor.get_tool_schemas():
+            _schema = _normalize_tool_schema(_raw_schema)
+            if _schema is None:
+                # A schema with no resolvable name (e.g. an already-wrapped
+                # entry) would append a nameless tool that strict providers
+                # 400 on, disabling the whole toolset (#47707). Skip it.
+                _ra().logger.warning(
+                    "Context engine returned a tool schema with no resolvable "
+                    "name; skipping to avoid poisoning the request (%r)",
+                    _raw_schema,
+                )
+                continue
+            _tname = _schema["name"]
+            if _tname in _existing_tool_names:
                continue  # already registered via plugin/cache path
            _wrapped = {"type": "function", "function": _schema}
            agent.tools.append(_wrapped)
-            if _tname:
-                agent.valid_tool_names.add(_tname)
-                agent._context_engine_tool_names.add(_tname)
-                _existing_tool_names.add(_tname)
+            agent.valid_tool_names.add(_tname)
+            agent._context_engine_tool_names.add(_tname)
+            _existing_tool_names.add(_tname)

    # Notify context engine of session start
    if hasattr(agent, "context_compressor") and agent.context_compressor:
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@@ -42,6 +42,14 @@ from utils import base_url_host_matches, base_url_hostname, env_var_enabled, ato
 logger = logging.getLogger(__name__)


+# Max consecutive successful credential-pool token refreshes of the SAME entry
+# on a persistent auth failure before we give up and let the fallback chain
+# activate. A single-entry OAuth pool can re-mint a fresh token indefinitely
+# even when the upstream keeps rejecting it, so without this cap the retry loop
+# spins forever and never reaches ``_try_activate_fallback``. See #26080.
+_MAX_AUTH_REFRESH_ATTEMPTS = 2
+
+
 def _ra():
    """Lazy ``run_agent`` reference for test-patch routing."""
    import run_agent
@@ -775,6 +783,30 @@ def recover_with_credential_pool(
            return False, has_retried_429
        refreshed = pool.try_refresh_current()
        if refreshed is not None:
+            # ``try_refresh_current()`` re-mints a fresh OAuth token and reports
+            # success even when the upstream keeps rejecting it — a single-entry
+            # pool (common for OAuth/Max subscribers) has nothing to rotate to,
+            # so a bare "refreshed → retry" loop spins forever on the same dead
+            # token and the configured fallback never activates. Cap consecutive
+            # same-entry refreshes and fall through to fallback once exceeded.
+            # See #26080.
+            refreshed_id = getattr(refreshed, "id", None)
+            if refreshed_id is not None:
+                refresh_counts = getattr(agent, "_auth_pool_refresh_counts", None)
+                if refresh_counts is None:
+                    refresh_counts = {}
+                    agent._auth_pool_refresh_counts = refresh_counts
+                refresh_key = (agent.provider, refreshed_id)
+                refresh_counts[refresh_key] = refresh_counts.get(refresh_key, 0) + 1
+                if refresh_counts[refresh_key] > _MAX_AUTH_REFRESH_ATTEMPTS:
+                    _ra().logger.warning(
+                        "Credential auth failure persists after %s refreshes for "
+                        "pool entry %s — treating as unrecoverable and allowing "
+                        "fallback to activate.",
+                        refresh_counts[refresh_key] - 1,
+                        refreshed_id,
+                    )
+                    return False, has_retried_429
            _ra().logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}")
            agent._swap_credential(refreshed)
            return True, has_retried_429
@@ -1046,6 +1078,34 @@ def restore_primary_runtime(agent) -> bool:
            api_mode=rt.get("compressor_api_mode", ""),
        )

+        # ── Re-select from the credential pool if one is available ──
+        # The snapshot's api_key was captured at construction time.  Across
+        # turns the pool may have rotated (token revocation, billing/rate-limit
+        # exhaustion, cooldown), leaving the snapshot key stale.  Restoring it
+        # blindly re-fails on the first request and burns through the remaining
+        # pool entries before cross-provider fallback even gets a chance.  Ask
+        # the pool for its current best entry and swap the live credential in.
+        # When the pool is absent, empty, or the entry has no usable key, we
+        # keep the snapshot key (the existing behavior).  Fixes #25205.
+        pool = getattr(agent, "_credential_pool", None)
+        if pool is not None and pool.has_available():
+            entry = pool.select()
+            if entry is not None:
+                entry_key = (
+                    getattr(entry, "runtime_api_key", None)
+                    or getattr(entry, "access_token", "")
+                )
+                if entry_key:
+                    # ``_swap_credential`` rebuilds the OpenAI/Anthropic client,
+                    # reapplies base-url-scoped headers, and carries the
+                    # accumulated base_url / OAuth-detection fixes (#33163).
+                    agent._swap_credential(entry)
+                    logger.info(
+                        "Restore re-selected pool entry %s (%s)",
+                        getattr(entry, "id", "?"),
+                        getattr(entry, "label", "?"),
+                    )
+
        # ── Reset fallback chain for the new turn ──
        agent._fallback_activated = False
        agent._fallback_index = 0
@@ -1221,7 +1281,11 @@ def dump_api_request_debug(
            dump_payload["error"] = error_info

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-        dump_file = agent.logs_dir / f"request_dump_{agent.session_id}_{timestamp}.json"
+        # Sanitize the session ID into a traversal-free path segment — it can
+        # originate from untrusted input (X-Hermes-Session-Id header), and an
+        # unsanitized "../"-shaped ID would write the dump outside logs_dir.
+        safe_sid = _ra()._safe_session_filename_component(agent.session_id)
+        dump_file = agent.logs_dir / f"request_dump_{safe_sid}_{timestamp}.json"

        # Redact secrets before persisting/printing. This dump captures the
        # full request body (system prompt, tool defs, context-embedded
@@ -1420,6 +1484,15 @@ def create_openai_client(agent, client_kwargs: dict, *, reason: str, shared: boo
        keepalive_http = agent._build_keepalive_http_client(client_kwargs.get("base_url", ""))
        if keepalive_http is not None:
            client_kwargs["http_client"] = keepalive_http
+    # Delegate all rate-limit / 5xx retry to hermes's outer conversation loop,
+    # which honors Retry-After and applies adaptive/jittered backoff. The OpenAI
+    # SDK default (max_retries=2) uses its own 1-2s backoff that ignores
+    # Retry-After and double-retries inside our loop — the same deadlock the
+    # Anthropic clients hit (#26293). This is the single chokepoint every primary
+    # OpenAI/aggregator client passes through (init, switch_model, recovery,
+    # restore, request-scoped); auxiliary_client builds its own clients and keeps
+    # SDK retries because it is NOT wrapped by the conversation loop.
+    client_kwargs.setdefault("max_retries", 0)
    # Uses the module-level `OpenAI` name, resolved lazily on first
    # access via __getattr__ below. Tests patch via `run_agent.OpenAI`.
    client = _ra().OpenAI(**client_kwargs)
@@ -1499,6 +1572,10 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
    # _client_kwargs is a dict — snapshot a shallow copy so mutating the
    # live dict doesn't poison the rollback target.
    _snapshot["_client_kwargs"] = dict(getattr(agent, "_client_kwargs", {}) or {})
+    # Snapshot the credential pool reference so a failed client rebuild can
+    # restore the original pool (issue #52727: pool reload is part of this
+    # switch and must be reversible on rollback).
+    _snapshot["_credential_pool"] = getattr(agent, "_credential_pool", _MISSING)

    try:
        # Clear the per-config context_length override so the new model's
@@ -1523,8 +1600,36 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
        if api_key:
            agent.api_key = api_key

+        # ── Reload credential pool for the new provider (issue #52727) ──
+        # Without this, ``recover_with_credential_pool`` sees a
+        # ``pool.provider != agent.provider`` mismatch and short-circuits,
+        # leaving the new provider with no rotation/recovery on 401/429 and
+        # burning the original pool's entries. Only reload when the provider
+        # actually changed (or the pool was missing) — re-selecting the same
+        # provider must not churn the pool reference. A reload failure is
+        # logged + swallowed: the switch itself must still complete.
+        old_norm = (old_provider or "").strip().lower()
+        new_norm = (new_provider or "").strip().lower()
+        if old_norm != new_norm or getattr(agent, "_credential_pool", None) is None:
+            try:
+                from agent.credential_pool import load_pool
+                agent._credential_pool = load_pool(new_provider)
+            except Exception as _pool_exc:  # noqa: BLE001
+                logger.warning(
+                    "switch_model: credential pool reload failed for %s (%s); "
+                    "continuing without pool rotation this turn",
+                    new_provider, _pool_exc,
+                )
+
        # ── Build new client ──
-        if api_mode == "anthropic_messages":
+        if (new_provider or "").strip().lower() == "moa":
+            from agent.moa_loop import MoAClient
+
+            agent.api_key = api_key or "moa-virtual-provider"
+            agent.base_url = "moa://local"
+            agent._client_kwargs = {}
+            agent.client = MoAClient(agent.model or "default")
+        elif api_mode == "anthropic_messages":
            from agent.anthropic_adapter import (
                build_anthropic_client,
                resolve_anthropic_token,
@@ -1697,6 +1802,27 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
        old_model, old_provider, new_model, new_provider,
    )

+    # ── Persist billing route to session DB ──
+    # Persisting the route makes the dashboard Model cards show the actual
+    # provider after a mid-session /model switch instead of the stale
+    # session-creation provider.  The agent's _session_db / session_id may
+    # not be set in all contexts (tests, bare agents without a session DB,
+    # etc.), so the update is skipped when either is missing.  See #48248.
+    _session_db = getattr(agent, "_session_db", None)
+    _session_id = getattr(agent, "session_id", None)
+    if _session_db is not None and _session_id:
+        try:
+            _session_db.update_session_billing_route(
+                _session_id,
+                provider=agent.provider,
+                base_url=agent.base_url,
+                billing_mode=getattr(agent, "api_mode", None),
+            )
+        except Exception:
+            logger.warning(
+                "Failed to persist billing route after model switch",
+                exc_info=True,
+            )


 def invoke_tool(agent, function_name: str, function_args: dict, effective_task_id: str,
@@ -2083,8 +2209,21 @@ def looks_like_codex_intermediate_ack(
    user_message: str,
    assistant_content: str,
    messages: List[Dict[str, Any]],
+    require_workspace: bool = True,
 ) -> bool:
-    """Detect a planning/ack message that should continue instead of ending the turn."""
+    """Detect a planning/ack message that should continue instead of ending the turn.
+
+    ``require_workspace`` (default True) keeps the original codex-coding scope:
+    the ack must reference a filesystem/repo workspace. The conversation loop
+    passes ``require_workspace=False`` when the user has explicitly opted into
+    intent-ack continuation for all api_modes (``agent.intent_ack_continuation``
+    is ``true`` or a model-list), so general autonomous workflows ("I'll run a
+    health check on the server", "I'll start the deployment") — which carry a
+    future-ack and an action verb but no filesystem reference — are caught too.
+    The future-ack + short-content + no-prior-tools + action-verb requirements
+    always apply, which is what keeps conversational "I'll help you brainstorm"
+    replies from tripping it.
+    """
    if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
        return False

@@ -2137,17 +2276,67 @@ def looks_like_codex_intermediate_ack(
        "path",
    )

+    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
+    if not assistant_mentions_action:
+        return False
+
+    # Opted-in (all-api_mode) path: a future-ack + action verb + no prior tool
+    # call is enough — the user asked us to keep going when the model only
+    # announces intent, regardless of whether a filesystem is involved.
+    if not require_workspace:
+        return True
+
    user_text = (user_message or "").strip().lower()
    user_targets_workspace = (
        any(marker in user_text for marker in workspace_markers)
        or "~/" in user_text
        or "/" in user_text
    )
-    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
    assistant_targets_workspace = any(
        marker in assistant_text for marker in workspace_markers
    )
-    return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action
+    return user_targets_workspace or assistant_targets_workspace
+
+
+def intent_ack_continuation_mode(agent) -> str:
+    """Classify the resolved intent-ack continuation mode for this turn.
+
+    Returns one of:
+      * ``"off"``        — never continue.
+      * ``"codex_only"`` — historical scope: continue only on the
+        ``codex_responses`` api_mode, and only for codebase/workspace acks
+        (``require_workspace=True``).
+      * ``"all"``        — user opted in for every api_mode; continue on any
+        future-ack + action verb (``require_workspace=False``).
+
+    Mirrors the four-mode shape of ``agent.tool_use_enforcement``: ``"auto"``
+    (default) → codex_only on ``codex_responses``, else off; ``True``/"true"/
+    "always"/"yes"/"on" → all; ``False``/"false"/"never"/"no"/"off" → off;
+    ``list`` → all when a substring matches the active model name, else off.
+    """
+    mode = getattr(agent, "_intent_ack_continuation", "auto")
+
+    if mode is True or (isinstance(mode, str) and mode.lower() in {"true", "always", "yes", "on"}):
+        return "all"
+    if mode is False or (isinstance(mode, str) and mode.lower() in {"false", "never", "no", "off"}):
+        return "off"
+    if isinstance(mode, list):
+        model_lower = (agent.model or "").lower()
+        return "all" if any(p.lower() in model_lower for p in mode if isinstance(p, str)) else "off"
+    # "auto" or any unrecognised value — historical codex-only behavior.
+    return "codex_only" if agent.api_mode == "codex_responses" else "off"
+
+
+def intent_ack_continuation_enabled(agent) -> bool:
+    """Whether intent-ack continuation should fire at all for this turn.
+
+    The ``codex_ack_continuations < 2`` per-turn cap and the
+    ``looks_like_codex_intermediate_ack`` detector are applied by the caller;
+    this only decides the on/off gate. Callers that also need to know whether
+    the workspace requirement applies should use ``intent_ack_continuation_mode``
+    directly (``"codex_only"`` ⇒ require_workspace=True, ``"all"`` ⇒ False).
+    """
+    return intent_ack_continuation_mode(agent) != "off"



--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook(
    kwargs = {
        "timeout": timeout_obj,
        "http_client": http_client,
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        "max_retries": 0,
        # The SDK requires *something* for api_key/auth_token. Our
        # event hook overrides Authorization per request so this value
        # is never sent. The sentinel string makes accidental leaks
@@ -757,6 +760,12 @@ def build_anthropic_client(
    _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
    kwargs = {
        "timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
+        # Delegate all rate-limit / 5xx retry to hermes's outer conversation
+        # loop, which honors Retry-After. The SDK default (max_retries=2) uses
+        # its own 1-2s backoff that ignores Retry-After and double-retries
+        # inside our loop — burning request slots against a bucket that won't
+        # refill for minutes. (#26293)
+        "max_retries": 0,
    }
    if normalized_base_url:
        # Azure Anthropic endpoints require an ``api-version`` query parameter.
@@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str):
    return _anthropic_sdk.AnthropicBedrock(
        aws_region=region,
        timeout=Timeout(timeout=900.0, connect=10.0),
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        max_retries=0,
        default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
    )

@@ -914,44 +926,72 @@ def _read_claude_code_credentials_from_keychain() -> Optional[Dict[str, Any]]:
    return None


+def _read_claude_code_credentials_from_file() -> Optional[Dict[str, Any]]:
+    """Read Claude Code OAuth credentials from ~/.claude/.credentials.json.
+
+    Best-effort: a missing file, a read error, invalid UTF-8/JSON, or a
+    non-object JSON document yields None.
+
+    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
+    """
+    cred_path = Path.home() / ".claude" / ".credentials.json"
+    if not cred_path.exists():
+        return None
+    try:
+        data = json.loads(cred_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as e:  # ValueError: bad JSON or bad UTF-8
+        logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)
+        return None
+    # Valid JSON need not be an object (e.g. a list); guard before ``.get``.
+    oauth_data = data.get("claudeAiOauth") if isinstance(data, dict) else None
+    if not isinstance(oauth_data, dict) or not oauth_data.get("accessToken"):
+        return None
+    return {
+        "accessToken": oauth_data["accessToken"],
+        "refreshToken": oauth_data.get("refreshToken", ""),
+        "expiresAt": oauth_data.get("expiresAt", 0),
+        "source": "claude_code_credentials_file",
+    }
+
+
 def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    """Read refreshable Claude Code OAuth credentials.

-    Checks two sources in order:
+    Reads from two possible sources and reconciles them:
      1. macOS Keychain (Darwin only) — "Claude Code-credentials" entry
      2. ~/.claude/.credentials.json file

+    Selection rules when both are present:
+      - If exactly one is non-expired, prefer that one. (Handles the case
+        where Claude Code refreshes one source but not the other — observed
+        in the wild on Claude Code 2.1.x.)
+      - Otherwise, prefer the source with the later ``expiresAt`` so that
+        any subsequent refresh uses the most recent ``refreshToken``.
+
    This intentionally excludes ~/.claude.json primaryApiKey. Opencode's
    subscription flow is OAuth/setup-token based with refreshable credentials,
    and native direct Anthropic provider usage should follow that path rather
    than auto-detecting Claude's first-party managed key.

-    Returns dict with {accessToken, refreshToken?, expiresAt?} or None.
+    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
    """
-    # Try macOS Keychain first (covers Claude Code >=2.1.114)
    kc_creds = _read_claude_code_credentials_from_keychain()
-    if kc_creds:
-        return kc_creds
+    file_creds = _read_claude_code_credentials_from_file()

-    # Fall back to JSON file
-    cred_path = Path.home() / ".claude" / ".credentials.json"
-    if cred_path.exists():
-        try:
-            data = json.loads(cred_path.read_text(encoding="utf-8"))
-            oauth_data = data.get("claudeAiOauth")
-            if oauth_data and isinstance(oauth_data, dict):
-                access_token = oauth_data.get("accessToken", "")
-                if access_token:
-                    return {
-                        "accessToken": access_token,
-                        "refreshToken": oauth_data.get("refreshToken", ""),
-                        "expiresAt": oauth_data.get("expiresAt", 0),
-                        "source": "claude_code_credentials_file",
-                    }
-        except (json.JSONDecodeError, OSError, IOError) as e:
-            logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)
+    if kc_creds and file_creds:
+        kc_valid = is_claude_code_token_valid(kc_creds)
+        file_valid = is_claude_code_token_valid(file_creds)
+        if kc_valid and not file_valid:
+            return kc_creds
+        if file_valid and not kc_valid:
+            return file_creds
+        # Both valid or both expired: prefer the later expiresAt so the
+        # downstream refresh path uses the freshest refresh_token.
+        kc_exp = kc_creds.get("expiresAt", 0) or 0
+        file_exp = file_creds.get("expiresAt", 0) or 0
+        return kc_creds if kc_exp >= file_exp else file_creds

-    return None
+    return kc_creds or file_creds


 def is_claude_code_token_valid(creds: Dict[str, Any]) -> bool:
@@ -1034,8 +1074,40 @@ def refresh_anthropic_oauth_pure(refresh_token: str, *, use_json: bool = False)


 def _refresh_oauth_token(creds: Dict[str, Any]) -> Optional[str]:
-    """Attempt to refresh an expired Claude Code OAuth token."""
-    refresh_token = creds.get("refreshToken", "")
+    """Attempt to refresh an expired Claude Code OAuth token.
+
+    Claude Code's OAuth refresh tokens are single-use: a successful refresh
+    rotates the pair and invalidates the old refresh token. Claude Code itself
+    also refreshes on its own schedule (IDE/CLI activity), so by the time
+    Hermes notices an expired token, Claude Code may have already rotated it.
+    POSTing our now-stale refresh token in that window races Claude Code and
+    fails with ``invalid_grant``.
+
+    So before refreshing, re-read the live credential sources. If Claude Code
+    has already produced a valid token, adopt it and skip the POST entirely.
+    Only fall back to refreshing ourselves when no fresh credential is found.
+    """
+    # Claude Code may have already refreshed — adopt its token rather than
+    # racing it with our (possibly already-rotated) refresh token. Only adopt
+    # when the live re-read produced a DIFFERENT token with a real future
+    # expiry: re-adopting the same credential we were just handed would be a
+    # no-op, and a 0/absent ``expiresAt`` means "managed key / unknown expiry"
+    # (see is_claude_code_token_valid) which must NOT be treated as a fresh
+    # refresh here.
+    current = read_claude_code_credentials()
+    if current:
+        current_token = current.get("accessToken", "")
+        current_exp = current.get("expiresAt", 0) or 0
+        if (
+            current_token
+            and current_token != creds.get("accessToken", "")
+            and current_exp > 0
+            and is_claude_code_token_valid(current)
+        ):
+            logger.debug("Adopted Claude Code's already-refreshed OAuth token")
+            return current_token
+
+    refresh_token = (current or {}).get("refreshToken", "") or creds.get("refreshToken", "")
    if not refresh_token:
        logger.debug("No refresh token available — cannot refresh")
        return None
@@ -1297,7 +1369,15 @@ def run_oauth_setup_token() -> Optional[str]:
 # Stores credentials in ~/.hermes/.anthropic_oauth.json (our own file).

 _OAUTH_CLIENT_ID = "9d1c250a-e61b-44d9-88ed-5944d1962f5e"
-_OAUTH_TOKEN_URL = "https://console.anthropic.com/v1/oauth/token"
+# Anthropic migrated the OAuth token endpoint to platform.claude.com;
+# console.anthropic.com now 404s. Callers should iterate _OAUTH_TOKEN_URLS
+# (new host first, console fallback). _OAUTH_TOKEN_URL is kept as the primary
+# for backward compatibility with existing imports and now points at the live host.
+_OAUTH_TOKEN_URLS = [
+    "https://platform.claude.com/v1/oauth/token",
+    "https://console.anthropic.com/v1/oauth/token",
+]
+_OAUTH_TOKEN_URL = _OAUTH_TOKEN_URLS[0]
 _OAUTH_REDIRECT_URI = "https://console.anthropic.com/oauth/code/callback"
 _OAUTH_SCOPES = "org:create_api_key user:profile user:inference"
 _HERMES_OAUTH_FILE = get_hermes_home() / ".anthropic_oauth.json"
@@ -1395,18 +1475,34 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
            "code_verifier": verifier,
        }).encode()

-        req = urllib.request.Request(
-            _OAUTH_TOKEN_URL,
-            data=exchange_data,
-            headers={
-                "Content-Type": "application/json",
-                "User-Agent": f"claude-cli/{_get_claude_code_version()} (external, cli)",
-            },
-            method="POST",
-        )
+        # Anthropic migrated the OAuth token endpoint to platform.claude.com;
+        # console.anthropic.com now 404s. Try the new host first, then fall
+        # back to console for older deployments (mirrors the refresh path).
+        result = None
+        last_error = None
+        for endpoint in _OAUTH_TOKEN_URLS:
+            req = urllib.request.Request(
+                endpoint,
+                data=exchange_data,
+                headers={
+                    "Content-Type": "application/json",
+                    "User-Agent": f"claude-cli/{_get_claude_code_version()} (external, cli)",
+                },
+                method="POST",
+            )
+            try:
+                with urllib.request.urlopen(req, timeout=15) as resp:
+                    result = json.loads(resp.read().decode())
+                break
+            except Exception as exc:
+                last_error = exc
+                logger.debug("Anthropic token exchange failed at %s: %s", endpoint, exc)
+                continue

-        with urllib.request.urlopen(req, timeout=15) as resp:
-            result = json.loads(resp.read().decode())
+        if result is None:
+            raise last_error if last_error is not None else ValueError(
+                "Anthropic token exchange failed"
+            )
    except Exception as e:
        print(f"Token exchange failed: {e}")
        return None
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -101,6 +101,8 @@ class _OpenAIProxy:
 OpenAI = _OpenAIProxy()  # module-level name, resolves lazily on call/isinstance

 from agent.credential_pool import load_pool
+from agent.model_metadata import MINIMUM_CONTEXT_LENGTH, get_model_context_length
+from agent.process_bootstrap import build_keepalive_http_client
 from hermes_cli.config import get_hermes_home
 from hermes_constants import OPENROUTER_BASE_URL
 from utils import base_url_host_matches, base_url_hostname, env_float, model_forces_max_completion_tokens, normalize_proxy_env_vars
@@ -108,6 +110,23 @@ from utils import base_url_host_matches, base_url_hostname, env_float, model_for
 logger = logging.getLogger(__name__)


+def _openai_http_client_kwargs(
+    base_url: Optional[str], *, async_mode: bool = False,
+) -> Dict[str, Any]:
+    """Inject keepalive httpx client with env-only proxy (not macOS system proxy)."""
+    client = build_keepalive_http_client(str(base_url or ""), async_mode=async_mode)
+    return {} if client is None else {"http_client": client}
+
+
+def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
+    """Build an ``OpenAI`` client, defaulting ``http_client`` to the keepalive one."""
+    # A caller-supplied http_client wins; skip building the keepalive client
+    # then, so no unused client object is created and left unclosed.
+    if "http_client" not in kwargs:
+        kwargs.update(_openai_http_client_kwargs(base_url))
+    return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
+
+
 # ── Interrupt protection for atomic auxiliary tasks ──────────────────────
 # Some auxiliary tasks must NOT be aborted mid-flight by a gateway interrupt
 # (e.g. an incoming user message while the agent is busy). Context
@@ -665,6 +684,28 @@ def _pool_runtime_base_url(entry: Any, fallback: str = "") -> str:
    return str(url or "").strip().rstrip("/")


+# Hostnames (lowercase, exact) that the auxiliary Anthropic path is allowed to
+# be pointed at via config.yaml model.base_url. Anything else falls back to the
+# Anthropic default — operators routing main-session traffic through a
+# non-Anthropic host (e.g. OpenRouter, OpenAI) with provider=anthropic in config
+# must NOT have that foreign host leak into the auxiliary client. See #52608.
+_ANTHROPIC_COMPATIBLE_HOSTS = frozenset({
+    "api.anthropic.com",
+})
+
+
+def _is_anthropic_compatible_host(url: str) -> bool:
+    """Report whether ``url`` targets a host on the auxiliary Anthropic allow-list."""
+    # Empty or unparseable input is never trusted; a trailing root dot is ignored.
+    try:
+        from urllib.parse import urlparse
+        raw_host = urlparse(url).hostname if url else None
+        normalized = (raw_host or "").strip().lower().rstrip(".")
+        return normalized in _ANTHROPIC_COMPATIBLE_HOSTS
+    except Exception:
+        return False
+
+
 def _nous_min_key_ttl_seconds() -> int:
    try:
        return max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800")))
@@ -1591,7 +1632,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
            _merged_aux = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_aux:
                extra["default_headers"] = _merged_aux
-            _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
+            _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
            _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
            return _client, model

@@ -1631,7 +1672,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
        _merged_aux2 = _apply_user_default_headers(extra.get("default_headers"))
        if _merged_aux2:
            extra["default_headers"] = _merged_aux2
-        _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
+        _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
        _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
        return _client, model

@@ -1646,20 +1687,21 @@ def _try_openrouter(explicit_api_key: str = None, model: str = None) -> Tuple[Op
    pool_present, entry = _select_pool_entry("openrouter")
    if pool_present:
        or_key = explicit_api_key or _pool_runtime_api_key(entry)
-        if not or_key:
-            _mark_provider_unhealthy("openrouter", ttl=60)
-            return None, None
-        base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
-        logger.debug("Auxiliary client: OpenRouter via pool")
-        return OpenAI(api_key=or_key, base_url=base_url,
-                       default_headers=build_or_headers()), model or _OPENROUTER_MODEL
+        if or_key:
+            base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
+            logger.debug("Auxiliary client: OpenRouter via pool")
+            return _create_openai_client(api_key=or_key, base_url=base_url,
+                           default_headers=build_or_headers()), model or _OPENROUTER_MODEL
+        # Pool exists but is exhausted (no usable runtime key) — fall through to
+        # the OPENROUTER_API_KEY env-var path rather than failing outright.
+        logger.debug("Auxiliary client: OpenRouter pool exhausted, trying OPENROUTER_API_KEY")

    or_key = explicit_api_key or os.getenv("OPENROUTER_API_KEY")
    if not or_key:
        _mark_provider_unhealthy("openrouter", ttl=60)
        return None, None
    logger.debug("Auxiliary client: OpenRouter")
-    return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
+    return _create_openai_client(api_key=or_key, base_url=OPENROUTER_BASE_URL,
                   default_headers=build_or_headers()), model or _OPENROUTER_MODEL


@@ -1752,7 +1794,7 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
            return None, None
        base_url = str((nous or {}).get("inference_base_url") or _nous_base_url()).rstrip("/")
    return (
-        OpenAI(
+        _create_openai_client(
            api_key=api_key,
            base_url=base_url,
        ),
@@ -2029,7 +2071,7 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
    if _custom_headers:
        _extra["default_headers"] = _custom_headers
    if custom_mode == "codex_responses":
-        real_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
+        real_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
        return CodexAuxiliaryClient(real_client, model), model
    if custom_mode == "anthropic_messages":
        # Third-party Anthropic-compatible gateway (MiniMax, Zhipu GLM,
@@ -2043,14 +2085,14 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
                "Custom endpoint declares api_mode=anthropic_messages but the "
                "anthropic SDK is not installed — falling back to OpenAI-wire."
            )
-            return OpenAI(api_key=custom_key, base_url=_clean_base, **_extra), model
+            return _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra), model
        return (
            AnthropicAuxiliaryClient(real_client, model, custom_key, custom_base, is_oauth=False),
            model,
        )
    # URL-based anthropic detection for custom endpoints that didn't set
    # api_mode explicitly (e.g. kimi.com/coding reached via custom config).
-    _fallback_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
+    _fallback_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
    _fallback_client = _maybe_wrap_anthropic(
        _fallback_client, model, custom_key, custom_base, custom_mode,
    )
@@ -2079,7 +2121,7 @@ def _build_xai_oauth_aux_client(model: str) -> Tuple[Optional[Any], Optional[str
        return None, None
    api_key, base_url = resolved
    logger.debug("Auxiliary client: xAI OAuth (%s via Responses API)", model)
-    real_client = OpenAI(api_key=api_key, base_url=base_url)
+    real_client = _create_openai_client(api_key=api_key, base_url=base_url)
    return CodexAuxiliaryClient(real_client, model), model


@@ -2116,7 +2158,7 @@ def _build_codex_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
            return None, None
        base_url = _CODEX_AUX_BASE_URL
    logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", model)
-    real_client = OpenAI(
+    real_client = _create_openai_client(
        api_key=codex_token,
        base_url=base_url,
        default_headers=_codex_cloudflare_headers(codex_token),
@@ -2216,7 +2258,7 @@ def _try_azure_foundry(
    if _dq:
        extra["default_query"] = _dq

-    client = OpenAI(api_key=api_key, base_url=_clean_base, **extra)
+    client = _create_openai_client(api_key=api_key, base_url=_clean_base, **extra)

    if runtime_api_mode == "codex_responses":
        # GPT-5.x / o-series / codex models on Azure Foundry are
@@ -2255,9 +2297,16 @@ def _try_anthropic(explicit_api_key: str = None) -> Tuple[Optional[Any], Optiona
    if not token:
        return None, None

-    # Allow base URL override from config.yaml model.base_url, but only
-    # when the configured provider is anthropic — otherwise a non-Anthropic
-    # base_url (e.g. Codex endpoint) would leak into Anthropic requests.
+    # Allow base URL override from config.yaml model.base_url, but only when:
+    #   1. the configured provider is anthropic (otherwise a non-Anthropic
+    #      base_url, e.g. Codex endpoint, would leak into Anthropic requests), AND
+    #   2. the override URL actually points at an Anthropic-compatible endpoint.
+    # Without gate (2), operators who route main-session traffic through a
+    # non-Anthropic provider that accepts Anthropic-format requests (e.g.
+    # OpenRouter at openrouter.ai/api/v1, with provider=anthropic in config.yaml)
+    # would have every auxiliary side-channel call (memory extractors,
+    # reflection, vision, title generation) 401 from the foreign host —
+    # see issue #52608.
    base_url = _pool_runtime_base_url(entry, _ANTHROPIC_DEFAULT_BASE_URL) if pool_present else _ANTHROPIC_DEFAULT_BASE_URL
    try:
        from hermes_cli.config import load_config
@@ -2267,7 +2316,7 @@ def _try_anthropic(explicit_api_key: str = None) -> Tuple[Optional[Any], Optiona
            cfg_provider = str(model_cfg.get("provider") or "").strip().lower()
            if cfg_provider == "anthropic":
                cfg_base_url = (model_cfg.get("base_url") or "").strip().rstrip("/")
-                if cfg_base_url:
+                if cfg_base_url and _is_anthropic_compatible_host(cfg_base_url):
                    base_url = cfg_base_url
    except Exception:
        pass
@@ -2470,7 +2519,7 @@ def _is_payment_error(exc: Exception) -> bool:
    # but sometimes wrap them in 429 or other codes.
    # Daily quota exhaustion from Bedrock, Vertex AI, and similar providers
    # uses different language but is semantically identical to credit exhaustion.
-    if status in {402, 404, 429, None}:
+    if status in {402, 403, 404, 429, None}:
        if any(kw in err_lower for kw in (
            "credits", "insufficient funds",
            "can only afford", "billing",
@@ -2479,6 +2528,8 @@ def _is_payment_error(exc: Exception) -> bool:
            "balance_depleted", "no usable credits",
            "model_not_supported_on_free_tier",
            "not available on the free tier",
+            "requires a subscription", "upgrade for access",
+            "upgrade for higher limits", "reached your session usage limit",
            # Daily / monthly / weekly quota exhaustion keywords
            "quota exceeded", "quota_exceeded",
            "too many tokens per day", "daily limit",
@@ -2697,6 +2748,79 @@ def _is_model_not_found_error(exc: Exception) -> bool:
    ))


+def _is_model_incompatible_error(exc: Exception) -> bool:
+    """Recognise 400s meaning "this provider/account cannot run this model".
+
+    The model name is valid, but the route that was asked is structurally
+    unable to serve it — unlike :func:`_is_model_not_found_error`, where the
+    model does not exist at all. Canonical example: an ``openai-codex`` /
+    ChatGPT-account fallback asked to compress a ``glm-5.2`` conversation::
+
+        Error code: 400 - {'detail': "The 'glm-5.2' model is not supported
+        when using Codex with a ChatGPT account."}
+
+    Such a candidate authenticates and builds a client, so neither the auth
+    nor the payment predicate fires; left unclassified, the error aborts the
+    whole auxiliary task (commonly compression, which then drops middle
+    turns and destroys the prompt cache). Classifying it as fallback-worthy
+    lets the chain skip the incapable route and try the next candidate,
+    mirroring the context-window feasibility screen (#52392).
+
+    Only status 400 (or an exception without ``status_code``) qualifies.
+    Not-found 400s stay with :func:`_is_model_not_found_error` and
+    billing/quota bodies stay with the payment path, so the three predicates
+    never overlap.
+
+    Returns True for a capability mismatch, False for anything else.
+    """
+    if getattr(exc, "status_code", None) not in {400, None}:
+        return False
+    message = str(exc).lower()
+    if _is_model_not_found_error(exc):
+        return False
+    # Billing bodies are matched by keyword here instead of through
+    # _is_payment_error(): that predicate is status-gated
+    # ({402,403,404,429,None}) and would miss a 400-coded billing body,
+    # letting it leak into this capability bucket.
+    billing_markers = (
+        "credits", "insufficient funds", "billing", "out of funds",
+        "balance_depleted", "no usable credits", "payment required",
+        "free tier", "free-tier", "not available on the free tier",
+        "model_not_supported_on_free_tier", "quota",
+    )
+    if any(marker in message for marker in billing_markers):
+        return False
+    capability_markers = (
+        "is not supported when using",   # codex/ChatGPT-account model gating
+        "model is not supported",
+        "not supported with this",
+        "not supported for this account",
+        "model_not_supported",
+        "does not support this model",
+        "unsupported model",
+    )
+    return any(marker in message for marker in capability_markers)
+
+
+def _is_invalid_aux_response_error(exc: Exception) -> bool:
+    """Spot the RuntimeError raised for an unusable auxiliary completion.
+
+    Some OpenAI-compatible routes answer HTTP 200 with an empty or malformed
+    ChatCompletion rather than a provider error. Auxiliary callers need
+    ``choices[0].message``, so treat this as a capability failure that takes
+    the same fallback path as explicit model-incompatibility errors.
+    """
+    if isinstance(exc, RuntimeError):
+        text = str(exc).lower()
+        required_fragments = (
+            "auxiliary ",
+            "llm returned invalid response",
+            "choices[0].message",
+        )
+        return all(fragment in text for fragment in required_fragments)
+    return False
+
+
 def _evict_cached_clients(provider: str) -> None:
    """Drop cached auxiliary clients for a provider so fresh creds are used."""
    normalized = _normalize_aux_provider(provider)
@@ -3147,6 +3271,88 @@ def _try_main_agent_model_fallback(
    return client, resolved_model or main_model, label


+# ── Context-window screening for runtime fallback chains (issue #52392) ──
+#
+# When the runtime auxiliary fallback chain selects a candidate that is
+# reachable but has a context window smaller than the compression task
+# requires, the call errors out instead of continuing to the next, viable
+# candidate. The startup feasibility check in
+# ``agent.conversation_compression.check_compression_model_feasibility``
+# already filters too-small auxiliary models at startup, but the runtime
+# fallback chain (``_try_configured_fallback_chain`` and
+# ``_try_main_fallback_chain``) does not apply the same filter, so
+# compression can stop at the first alive door even if the room behind it
+# is too small.
+#
+# The helpers below screen each candidate by its effective context window
+# before it is returned. ``None`` results from ``get_model_context_length``
+# are passed through (we cannot prove a model is too small, so we do not
+# block it). This preserves the existing fallback surface for
+# unrecognised/custom models while closing the gap on the well-known ones.
+
+def _task_minimum_context_length(task: Optional[str]) -> Optional[int]:
+    """Look up the context-window floor an auxiliary task imposes, if any.
+
+    ``compression`` is the only task with a floor today: it maps to
+    ``MINIMUM_CONTEXT_LENGTH`` (64K), the same limit that
+    ``check_compression_model_feasibility`` enforces at startup.
+
+    Every other task name (``vision``, ``title_generation``,
+    ``web_extract``, ``skills_hub``, ``mcp``, ``session_search``, …) has no
+    per-task floor and yields ``None`` so the runtime chain stays permissive.
+
+    An empty/``None`` task also yields ``None``, making the helper a safe
+    no-op at generic call sites.
+    """
+    # Falsy task names short-circuit before the comparison.
+    requires_floor = bool(task) and task == "compression"
+    if requires_floor:
+        return MINIMUM_CONTEXT_LENGTH
+    return None
+
+
+def _candidate_context_window(
+    provider: str,
+    model: str,
+    base_url: str = "",
+    api_key: str = "",
+) -> Optional[int]:
+    """Best-effort lookup of a fallback candidate's effective context window.
+
+    Delegates to :func:`agent.model_metadata.get_model_context_length` and
+    converts every failure into ``None``: an empty ``model``, an exception
+    from the resolver (logged at debug level), or a result that is not a
+    positive ``int``.
+
+    Callers read ``None`` as "unknown — pass through", which keeps the
+    existing fallback surface intact when the window cannot be determined
+    (custom endpoints, models missing from the registry, offline
+    endpoints). Resolver errors are swallowed so the runtime fallback chain
+    keeps moving.
+    """
+    if not model:
+        return None
+    try:
+        window = get_model_context_length(
+            model,
+            base_url=base_url,
+            api_key=api_key,
+            provider=provider,
+        )
+    except Exception as exc:
+        logger.debug(
+            "Auxiliary fallback: could not resolve context window for %s/%s: %s",
+            provider, model, exc,
+        )
+        return None
+    # ``get_model_context_length`` currently hands back an int (falling back
+    # to a 256K default when nothing else matches). Checking the shape here
+    # anyway means a future ``Optional[int]`` return is propagated as
+    # ``None`` rather than being compared against a minimum.
+    usable = isinstance(window, int) and window > 0
+    return window if usable else None
+
+
 def _try_configured_fallback_chain(
    task: str,
    failed_provider: str,
@@ -3171,6 +3377,7 @@ def _try_configured_fallback_chain(

    skip = failed_provider.lower().strip()
    tried = []
+    min_ctx = _task_minimum_context_length(task)

    for i, entry in enumerate(chain):
        if not isinstance(entry, dict):
@@ -3188,6 +3395,20 @@ def _try_configured_fallback_chain(
            fb_client, resolved_model = None, None

        if fb_client is not None:
+            if min_ctx is not None and resolved_model:
+                fb_ctx = _candidate_context_window(
+                    fb_provider,
+                    resolved_model,
+                    base_url=str(entry.get("base_url") or ""),
+                    api_key=_fallback_entry_api_key(entry) or "",
+                )
+                if fb_ctx is not None and fb_ctx < min_ctx:
+                    logger.info(
+                        "Auxiliary %s: skipping %s (%s context=%d < min=%d), continuing chain",
+                        task, label, resolved_model, fb_ctx, min_ctx,
+                    )
+                    tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})")
+                    continue
            logger.info(
                "Auxiliary %s: %s on %s — configured fallback to %s (%s)",
                task, reason, failed_provider, label, resolved_model or fb_model or "default",
@@ -3203,6 +3424,28 @@ def _try_configured_fallback_chain(
    return None, None, ""


+def _try_configured_fallback_for_unavailable_client(
+    task: Optional[str],
+    failed_provider: str,
+) -> Tuple[Optional[Any], Optional[str], str]:
+    """Walk the task's fallback_chain when an explicit aux provider yields no client.
+
+    Handles the pre-request "no client" situation: a missing raw env key,
+    unavailable OAuth/pool credentials, or a provider resolver that returned
+    ``(None, None)``. Only the configured per-task chain is consulted; the
+    main-agent model stays the last-resort runtime fallback for
+    request-time capacity errors. Without a task, or when the provider is
+    empty or ``auto``, nothing is attempted and ``(None, None, "")`` is returned.
+    """
+    provider_key = (failed_provider or "").strip().lower()
+    has_explicit_provider = bool(provider_key) and provider_key != "auto"
+    if task and has_explicit_provider:
+        return _try_configured_fallback_chain(
+            task, provider_key, reason="provider unavailable",
+        )
+    return None, None, ""
+
+
 def _fallback_entry_api_key(entry: Dict[str, Any]) -> Optional[str]:
    """Resolve inline or env-backed API key from a fallback-chain entry."""
    explicit = str(entry.get("api_key") or "").strip()
@@ -3261,6 +3504,7 @@ def _try_main_fallback_chain(
    main_norm = (_read_main_provider() or "").strip().lower()
    skip = {p for p in (failed_norm, main_norm, "auto") if p}
    tried: List[str] = []
+    min_ctx = _task_minimum_context_length(task)

    for i, entry in enumerate(chain):
        if not isinstance(entry, dict):
@@ -3284,6 +3528,20 @@ def _try_main_fallback_chain(
            logger.debug("Auxiliary %s: main fallback %s failed to resolve: %s", task or "call", label, exc)
            fb_client, resolved_model = None, None
        if fb_client is not None:
+            if min_ctx is not None:
+                fb_ctx = _candidate_context_window(
+                    fb_provider,
+                    resolved_model or fb_model,
+                    base_url=str(entry.get("base_url") or ""),
+                    api_key=_fallback_entry_api_key(entry) or "",
+                )
+                if fb_ctx is not None and fb_ctx < min_ctx:
+                    logger.info(
+                        "Auxiliary %s: skipping %s (context=%d < min=%d), continuing chain",
+                        task or "call", label, fb_ctx, min_ctx,
+                    )
+                    tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})")
+                    continue
            logger.info(
                "Auxiliary %s: %s on %s — main fallback chain to %s (%s)",
                task or "call", reason, failed_provider or "auto", label,
@@ -3385,6 +3643,37 @@ def _resolve_auto(
    # config.yaml (auxiliary.<task>.provider) still win over this.
    main_provider = str(runtime_provider or _read_main_provider() or "")
    main_model = str(runtime_model or _read_main_model() or "")
+
+    # MoA virtual provider: the "model" is a preset name (e.g. "opus-gpt") and
+    # there is no real "moa" HTTP endpoint, so resolving an aux client against
+    # provider="moa"/model=<preset> sends the preset name as the model id and
+    # the provider 400s ("opus-gpt is not a valid model ID"). Auxiliary tasks
+    # (title generation, compression, vision, …) don't need the reference
+    # fan-out — they should run on the aggregator, which is the preset's acting
+    # model. Resolve the MoA preset to its aggregator slot and continue Step 1
+    # with that real provider+model. Mirrors the MoA context-length resolution.
+    if main_provider == "moa":
+        try:
+            from hermes_cli.config import load_config
+            from hermes_cli.moa_config import resolve_moa_preset
+
+            _preset = resolve_moa_preset(load_config().get("moa") or {}, main_model)
+            _agg = _preset.get("aggregator") or {}
+            _agg_provider = str(_agg.get("provider") or "").strip()
+            _agg_model = str(_agg.get("model") or "").strip()
+            if _agg_provider and _agg_model and _agg_provider.lower() != "moa":
+                main_provider = _agg_provider
+                main_model = _agg_model
+                # The MoA virtual runtime carries a non-HTTP base_url
+                # ("moa://local") and a placeholder api_key; they belong to the
+                # facade, not the aggregator's real provider. Drop them so the
+                # aggregator resolves through its own provider credentials.
+                runtime_base_url = ""
+                runtime_api_key = ""
+                runtime_api_mode = ""
+        except Exception:
+            logger.debug("MoA aux resolution to aggregator failed", exc_info=True)
+
    if (main_provider and main_model
            and main_provider not in {"auto", ""}):
        resolved_provider = main_provider
@@ -3531,6 +3820,10 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
    _merged_async = _apply_user_default_headers(async_kwargs.get("default_headers"))
    if _merged_async:
        async_kwargs["default_headers"] = _merged_async
+    async_kwargs = {
+        **_openai_http_client_kwargs(sync_base_url, async_mode=True),
+        **async_kwargs,
+    }
    return AsyncOpenAI(**async_kwargs), model


@@ -3741,7 +4034,7 @@ def resolve_provider_client(
                               "but no Codex OAuth token found (run: hermes model)")
                return None, None
            final_model = _normalize_resolved_model(model, provider)
-            raw_client = OpenAI(
+            raw_client = _create_openai_client(
                api_key=codex_token,
                base_url=_CODEX_AUX_BASE_URL,
                default_headers=_codex_cloudflare_headers(codex_token),
@@ -3822,7 +4115,7 @@ def resolve_provider_client(
            _merged_custom = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_custom:
                extra["default_headers"] = _merged_custom
-            client = OpenAI(api_key=custom_key, base_url=_clean_base, **extra)
+            client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **extra)
            client = _wrap_if_needed(client, final_model, custom_base, custom_key)
            return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                    else (client, final_model))
@@ -3926,7 +4219,7 @@ def resolve_provider_client(
                        _fb_headers = _apply_user_default_headers(_fb_extra.get("default_headers"))
                        if _fb_headers:
                            _fb_extra["default_headers"] = _fb_headers
-                        client = OpenAI(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
+                        client = _create_openai_client(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
                        return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                                else (client, final_model))
                    sync_anthropic = AnthropicAuxiliaryClient(
@@ -3935,7 +4228,7 @@ def resolve_provider_client(
                    if async_mode:
                        return AsyncAnthropicAuxiliaryClient(sync_anthropic), final_model
                    return sync_anthropic, final_model
-                client = OpenAI(api_key=custom_key, base_url=_clean_base2, **_extra2)
+                client = _create_openai_client(api_key=custom_key, base_url=_clean_base2, **_extra2)
                # codex_responses or inherited auto-detect (via _wrap_if_needed).
                # _wrap_if_needed reads the closed-over `api_mode` (the task-level
                # override). Named-provider entry api_mode=codex_responses also
@@ -4077,7 +4370,7 @@ def resolve_provider_client(
        _merged_main = _apply_user_default_headers(headers)
        if _merged_main:
            headers = _merged_main
-        client = OpenAI(api_key=api_key, base_url=base_url,
+        client = _create_openai_client(api_key=api_key, base_url=base_url,
                        **({"default_headers": headers} if headers else {}))

        # Copilot GPT-5+ models (except gpt-5-mini) require the Responses
@@ -4613,7 +4906,7 @@ def _refresh_nous_auxiliary_client(
        return None, model

    fresh_key, fresh_base_url = runtime
-    sync_client = OpenAI(api_key=fresh_key, base_url=fresh_base_url)
+    sync_client = _create_openai_client(api_key=fresh_key, base_url=fresh_base_url)
    final_model = model

    current_loop = None
@@ -5196,10 +5489,24 @@ def _build_call_kwargs(
        # ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where
        # max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only
        # there.
+        #
+        # NVIDIA NIM (integrate.api.nvidia.com and local NIM endpoints) is a
+        # second exception: some models—notably minimaxai/minimax-m3—return HTTP
+        # 200 with an empty choices[] payload when max_tokens is omitted. The main
+        # NVIDIA chat path already sends an output cap via the provider profile;
+        # preserve it on the auxiliary path too.
        _effective_base = base_url or (
            _current_custom_base_url() if provider == "custom" else ""
        )
-        if _is_anthropic_compat_endpoint(provider, _effective_base):
+        _provider_norm = str(provider or "").strip().lower()
+        _is_nvidia_nim = (
+            _provider_norm in {"nvidia", "nvidia-nim", "nim", "build-nvidia", "nemotron"}
+            or base_url_host_matches(_effective_base, "integrate.api.nvidia.com")
+        )
+        if (
+            _is_anthropic_compat_endpoint(provider, _effective_base)
+            or _is_nvidia_nim
+        ):
            kwargs["max_tokens"] = max_tokens

    if tools:
@@ -5254,6 +5561,9 @@ def _validate_llm_response(response: Any, task: str = None) -> Any:
        if not choices or not hasattr(choices[0], "message"):
            raise AttributeError("missing choices[0].message")
    except (AttributeError, TypeError, IndexError) as exc:
+        recovered = _recover_aux_response_message(response)
+        if recovered is not None:
+            return recovered
        response_type = type(response).__name__
        response_preview = str(response)[:120]
        raise RuntimeError(
@@ -5265,6 +5575,64 @@ def _validate_llm_response(response: Any, task: str = None) -> Any:
    return response


+def _recover_aux_response_message(response: Any) -> Optional[Any]:
+    """Synthesize chat-completions shape from Responses-style text fields.
+
+    Auxiliary callers consume ``choices[0].message``.  Some compatible
+    endpoints return text outside ``choices`` (``output_text`` / ``output``
+    items).  Returns None when no text is found.  Fields are read with
+    ``_obj_get`` so dict payloads keep their id/model/usage in the stand-in.
+    """
+    text = _extract_aux_response_text(response)
+    if not text:
+        return None
+    choice = SimpleNamespace(
+        message=SimpleNamespace(content=text),
+        finish_reason=_obj_get(response, "finish_reason") or "stop",
+    )
+    try:
+        response.choices = [choice]
+        return response
+    except Exception:
+        return SimpleNamespace(
+            id=_obj_get(response, "id", ""),
+            model=_obj_get(response, "model", ""),
+            object=_obj_get(response, "object", "chat.completion"),
+            choices=[choice],
+            usage=_obj_get(response, "usage"),
+        )
+
+
+def _extract_aux_response_text(response: Any) -> str:
+    """Pull stripped text out of Responses-style ``output_text`` / ``output`` ("" if none)."""
+    direct = _obj_get(response, "output_text")
+    if isinstance(direct, str) and direct.strip():
+        return direct.strip()
+    items = _obj_get(response, "output")
+    if not isinstance(items, list):
+        return ""
+
+    collected: List[str] = []
+    for item in items:
+        kind = _obj_get(item, "type")
+        if kind and kind != "message":
+            continue
+        for part in (_obj_get(item, "content") or []):
+            if _obj_get(part, "type") not in {"output_text", "text", None}:
+                continue
+            text = _obj_get(part, "text")
+            if isinstance(text, str) and text.strip():
+                collected.append(text.strip())
+    return "\n".join(collected).strip()
+
+
+def _obj_get(obj: Any, key: str, default: Any = None) -> Any:
+    """Read ``key`` from a dict item or, failing that, an attribute (else ``default``)."""
+    if isinstance(obj, dict) and key in obj:  # keys win over dict methods such as "items"
+        return obj[key]
+    return getattr(obj, key, default)
+
+
 def call_llm(
    task: str = None,
    *,
@@ -5344,21 +5712,30 @@ def call_llm(
        )
        if client is None:
            # When the user explicitly chose a non-OpenRouter provider but no
-            # credentials were found, fail fast instead of silently routing
-            # through OpenRouter (which causes confusing 404s).
+            # credentials were found, honor the task fallback_chain before
+            # raising.  Missing raw env keys are recoverable for auxiliary
+            # tasks because fallback entries may use OAuth / credential-pool
+            # auth (for example openai-codex).
            _explicit = (resolved_provider or "").strip().lower()
            if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
-                raise RuntimeError(
-                    f"Provider '{_explicit}' is set in config.yaml but no API key "
-                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
-                    f"variable, or switch to a different provider with `hermes model`."
+                fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
+                    task, _explicit,
                )
+                if fb_client is not None:
+                    client, final_model = fb_client, fb_model
+                    resolved_provider = fb_label or resolved_provider
+                else:
+                    raise RuntimeError(
+                        f"Provider '{_explicit}' is set in config.yaml but no API key "
+                        f"was found. Set the {_explicit.upper()}_API_KEY environment "
+                        f"variable, or switch to a different provider with `hermes model`."
+                    )
            # For auto/custom with no credentials, try the full auto chain
            # rather than hardcoding OpenRouter (which may be depleted).
            # Pass model=None so each provider uses its own default —
            # resolved_model may be an OpenRouter-format slug that doesn't
            # work on other providers.
-            if not resolved_base_url:
+            if client is None and not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
                client, final_model = _get_cached_client("auto", main_runtime=main_runtime, task=task)
@@ -5653,10 +6030,21 @@ def call_llm(
        # When the provider returns a 429 rate-limit (not billing), fall
        # back to an alternative provider instead of exhausting retries
        # against the same rate-limited endpoint.
+        #
+        # ── Auth error fallback (#21165) ─────────────────────────────
+        # When the resolved provider returns 401 and neither the Nous
+        # refresh path nor explicit provider credential refresh applies,
+        # fall back to an alternative provider instead of dropping the
+        # auxiliary task on the floor (silent compression failure /
+        # message loss). Auth is NOT a capacity error: it falls back only
+        # in auto mode; an explicitly chosen provider is still respected.
        should_fallback = (
-            _is_payment_error(first_err)
+            _is_auth_error(first_err)
+            or _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
+            or _is_model_incompatible_error(first_err)
+            or _is_invalid_aux_response_error(first_err)
        )
        # Respect explicit provider choice for transient errors (auth, request
        # validation, etc.) but allow fallback when the provider clearly cannot
@@ -5667,9 +6055,24 @@ def call_llm(
        is_auto = resolved_provider in {"auto", "", None}
        # Capacity errors bypass the explicit-provider gate: the provider
        # literally cannot serve this request regardless of user intent.
-        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
+        # Rate limits are included: after retries are exhausted, a 429 means
+        # the provider cannot serve this request — fall back. See #52228.
+        # Model-incompatibility 400s are also a hard capability mismatch (the
+        # route cannot run this model at all — e.g. a codex/ChatGPT-account
+        # fallback asked to compress a glm-5.2 conversation), so they bypass
+        # the explicit-provider gate and continue to the next candidate
+        # instead of aborting the auxiliary task and churning the session.
+        is_capacity_error = (
+            _is_payment_error(first_err)
+            or _is_connection_error(first_err)
+            or _is_rate_limit_error(first_err)
+            or _is_model_incompatible_error(first_err)
+            or _is_invalid_aux_response_error(first_err)
+        )
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_payment_error(first_err):
+            if _is_auth_error(first_err):
+                reason = "auth error"
+            elif _is_payment_error(first_err):
                reason = "payment error"
                # Resolve the actual provider label (resolved_provider may be
                # "auto"; the client's base_url tells us which backend got the
@@ -5680,6 +6083,10 @@ def call_llm(
                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
+            elif _is_model_incompatible_error(first_err):
+                reason = "model incompatible with route"
+            elif _is_invalid_aux_response_error(first_err):
+                reason = "invalid provider response"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
@@ -5854,12 +6261,21 @@ async def async_call_llm(
        if client is None:
            _explicit = (resolved_provider or "").strip().lower()
            if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
-                raise RuntimeError(
-                    f"Provider '{_explicit}' is set in config.yaml but no API key "
-                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
-                    f"variable, or switch to a different provider with `hermes model`."
+                fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
+                    task, _explicit,
                )
-            if not resolved_base_url:
+                if fb_client is not None:
+                    client, final_model = _to_async_client(
+                        fb_client, fb_model or "", is_vision=(task == "vision")
+                    )
+                    resolved_provider = fb_label or resolved_provider
+                else:
+                    raise RuntimeError(
+                        f"Provider '{_explicit}' is set in config.yaml but no API key "
+                        f"was found. Set the {_explicit.upper()}_API_KEY environment "
+                        f"variable, or switch to a different provider with `hermes model`."
+                    )
+            if client is None and not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
                client, final_model = _get_cached_client("auto", async_mode=True, main_runtime=main_runtime, task=task)
@@ -6105,24 +6521,47 @@ async def async_call_llm(
                        raise

        # ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ──
+        # Auth error fallback (#21165): a 401 that survived the refresh path
+        # falls back in auto mode just like the sync call_llm() path. Auth is
+        # NOT a capacity error, so on an explicit provider it still respects
+        # the user's choice (handled by the is_auto/is_capacity_error gate).
        should_fallback = (
+            _is_auth_error(first_err)
+            or _is_payment_error(first_err)
+            or _is_connection_error(first_err)
+            or _is_rate_limit_error(first_err)
+            or _is_model_incompatible_error(first_err)
+            or _is_invalid_aux_response_error(first_err)
+        )
+        # Capacity errors (payment/quota/connection/rate-limit) bypass the
+        # explicit-provider gate — the provider cannot serve the request
+        # regardless of user intent. Rate limits are included: after retries
+        # are exhausted, a 429 means the provider is at capacity. See #52228.
+        # See #26803: daily token quota must fall back like a 402 credit error.
+        # Model-incompatibility 400s (route cannot run this model at all)
+        # bypass the gate too — see the sync call_llm() path for rationale.
+        is_auto = resolved_provider in {"auto", "", None}
+        is_capacity_error = (
            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
+            or _is_model_incompatible_error(first_err)
+            or _is_invalid_aux_response_error(first_err)
        )
-        # Capacity errors (payment/quota/connection) bypass the explicit-provider
-        # gate — the provider cannot serve the request regardless of user intent.
-        # See #26803: daily token quota must fall back like a 402 credit error.
-        is_auto = resolved_provider in {"auto", "", None}
-        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_payment_error(first_err):
+            if _is_auth_error(first_err):
+                reason = "auth error"
+            elif _is_payment_error(first_err):
                reason = "payment error"
                _mark_provider_unhealthy(
                    _recoverable_pool_provider(resolved_provider, client) or resolved_provider
                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
+            elif _is_model_incompatible_error(first_err):
+                reason = "model incompatible with route"
+            elif _is_invalid_aux_response_error(first_err):
+                reason = "invalid provider response"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -28,6 +28,7 @@ from typing import Any, Dict, Optional
 from hermes_cli.timeouts import get_provider_request_timeout, get_provider_stale_timeout
 from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
 from agent.error_classifier import FailoverReason
+from agent.gemini_native_adapter import is_native_gemini_base_url
 from agent.model_metadata import is_local_endpoint
 from agent.message_sanitization import (
    _sanitize_surrogates,
@@ -37,6 +38,18 @@ from tools.terminal_tool import is_persistent_env
 from utils import base_url_host_matches, base_url_hostname, env_float, env_int

 logger = logging.getLogger(__name__)
+_OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}
+
+# When the fallback chain is fully exhausted on a non-rate-limit failure
+# (e.g. every provider returns a non-retryable client error like HTTP 400),
+# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
+# and does not reset _fallback_index=0 to replay the entire chain again.
+# Without this, a client/gateway that re-submits immediately would re-marshal
+# the full (potentially 80k-token) context once per provider every turn and
+# can drive a constrained host into memory/swap exhaustion.  Rate-limit /
+# billing reasons keep their own 60s cooldown (armed separately in
+# try_activate_fallback); this is the narrower non-rate-limit case.  See issue #24996.
+_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0


 def _ra():
@@ -115,6 +128,23 @@ def _is_openai_codex_backend(agent) -> bool:
    )


+def _validated_openrouter_provider_sort(raw_sort: Any) -> Optional[str]:
+    """Return the trimmed, lowercased OpenRouter provider.sort value, or None if unusable."""
+    # Non-string input collapses to "" and is treated like a blank value.
+    candidate = raw_sort.strip().lower() if isinstance(raw_sort, str) else ""
+    if candidate in _OPENROUTER_PROVIDER_SORT_VALUES:
+        return candidate
+    if candidate:
+        # Only a non-blank, unsupported value is worth a warning; blank and
+        # non-string input is dropped silently.
+        logger.warning(
+            "Ignoring invalid OpenRouter provider.sort value %r (allowed: %s)",
+            raw_sort,
+            ", ".join(sorted(_OPENROUTER_PROVIDER_SORT_VALUES)),
+        )
+    return None
+
+
 def _env_float(name: str, default: float) -> float:
    try:
        return float(os.getenv(name, str(default)))
@@ -229,6 +259,11 @@ def interruptible_api_call(agent, api_kwargs: dict):
                        invalidate_runtime_client(region)
                    raise
                result["response"] = normalize_converse_response(raw_response)
+            elif agent.provider == "moa":
+                # MoA is a virtual chat-completions provider backed by the
+                # in-process MoAClient facade. Do not rebuild a request-local
+                # OpenAI client from the virtual runtime metadata.
+                result["response"] = agent.client.chat.completions.create(**api_kwargs)
            else:
                request_client = _set_request_client(
                    agent._create_request_openai_client(
@@ -698,8 +733,9 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
        _prefs["ignore"] = agent.providers_ignored
    if agent.providers_order:
        _prefs["order"] = agent.providers_order
-    if agent.provider_sort:
-        _prefs["sort"] = agent.provider_sort
+    _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
+    if _provider_sort:
+        _prefs["sort"] = _provider_sort
    if agent.provider_require_parameters:
        _prefs["require_parameters"] = True
    if agent.provider_data_collection:
@@ -1015,18 +1051,23 @@ def build_assistant_message(agent, assistant_message, finish_reason: str) -> dic
                    "arguments": tool_call.function.arguments
                },
            }
-            # Defence-in-depth: redact credentials from tool call arguments
-            # before they enter conversation history. Tool execution uses the
-            # raw API response object, not this dict, so redacting the
-            # persisted shape is safe and only affects storage. Catches the
-            # case where a model accidentally inlines a secret into a tool
-            # call (e.g. `terminal(command="curl -H 'Authorization: Bearer
-            # sk-...'")`). (#19798)
-            if isinstance(tc_dict["function"]["arguments"], str):
-                from agent.redact import redact_sensitive_text
-                tc_dict["function"]["arguments"] = redact_sensitive_text(
-                    tc_dict["function"]["arguments"]
-                )
+            # Tool-call arguments are intentionally NOT redacted here. This
+            # dict enters the in-memory conversation history that is replayed
+            # to the model on every subsequent turn AND persisted to state.db,
+            # which is itself replayed verbatim on session resume
+            # (get_messages_as_conversation). Masking a credential to `***`
+            # here poisons that replay: the model reads back its own
+            # `PGPASSWORD='***' psql ...` call and copies the placeholder into
+            # the next tool call, breaking every credential-dependent command
+            # on the second turn (#43083). The masking also provided no real
+            # protection — the same secret still leaks verbatim through tool
+            # OUTPUT (file contents, command output, diffs, the compaction
+            # block), none of which this pass ever touched. Keeping secrets
+            # out of the replayable store is a separate tokenization/vault
+            # concern, not something arg-redaction can deliver without
+            # breaking replay. Storage-time redaction remains governed by the
+            # `security.redact_secrets` toggle. (#19798 introduced the
+            # argument redaction; #43083 removed it.)
            # Preserve extra_content (e.g. Gemini thought_signature) so it
            # is sent back on subsequent API calls.  Without this, Gemini 3
            # thinking models reject the request with a 400 error.
@@ -1093,8 +1134,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
        if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
            agent._rate_limited_until = time.monotonic() + 60
    if agent._fallback_index >= len(agent._fallback_chain):
+        # Chain exhausted.  If we actually walked a non-empty chain and the
+        # failure was NOT a rate-limit/billing event (those already armed
+        # their own 60s cooldown above), arm a short cooldown so the next
+        # turn's restore_primary_runtime stays gated instead of resetting
+        # _fallback_index=0 and re-marshaling the whole context across every
+        # provider again.  Guards the cross-turn replay storm in #24996.
+        if (
+            len(agent._fallback_chain) > 0
+            and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
+        ):
+            _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
+            agent._rate_limited_until = max(
+                _existing_cooldown,
+                time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
+            )
        return False
-
    fb = agent._fallback_chain[agent._fallback_index]
    agent._fallback_index += 1
    fb_provider = (fb.get("provider") or "").strip().lower()
@@ -1210,14 +1265,16 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
            agent._transport_cache.clear()
        agent._fallback_activated = True

-        # Clear the credential pool when the fallback provider doesn't match
-        # the pool's provider.  The pool was seeded for the primary provider;
-        # leaving it attached means downstream recovery (rate_limit / billing /
-        # auth) calls ``_swap_credential`` with a primary entry which overwrites
-        # the agent's ``base_url`` back to the primary's endpoint — every
-        # fallback request then 404s against the wrong host.  See #33163.
+        # Rebind the credential pool to the fallback provider when the provider
+        # changes.  Keeping the primary pool attached would make downstream
+        # recovery (rate_limit / billing / auth) mutate the wrong credential
+        # set and can overwrite the fallback's base_url back to the primary
+        # endpoint.  See #33163.
+        #
        # When the fallback shares the pool's provider (e.g. both openrouter
-        # entries with different routing) the pool is preserved.
+        # entries with different routing) the pool is preserved.  When the
+        # providers differ, load the fallback provider's own pool if one exists
+        # so provider-specific rotation continues to work after the switch.
        _existing_pool = getattr(agent, "_credential_pool", None)
        if _existing_pool is not None:
            _pool_provider = (getattr(_existing_pool, "provider", "") or "").strip().lower()
@@ -1228,6 +1285,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
                    fb_provider, fb_model, _pool_provider,
                )
                agent._credential_pool = None
+        if getattr(agent, "_credential_pool", None) is None:
+            try:
+                from agent.credential_pool import load_pool
+
+                fallback_pool = load_pool(fb_provider)
+                if fallback_pool and fallback_pool.has_credentials():
+                    agent._credential_pool = fallback_pool
+                    logger.info(
+                        "Fallback to %s/%s: attached fallback credential pool",
+                        fb_provider, fb_model,
+                    )
+            except Exception as exc:
+                logger.debug(
+                    "Fallback to %s/%s: could not attach credential pool: %s",
+                    fb_provider, fb_model, exc,
+                )

        # Honor per-provider / per-model request_timeout_seconds for the
        # fallback target (same knob the primary client uses).  None = use
@@ -1458,8 +1531,9 @@ def handle_max_iterations(agent, messages: list, api_call_count: int) -> str:
                provider_preferences["ignore"] = agent.providers_ignored
            if agent.providers_order:
                provider_preferences["order"] = agent.providers_order
-            if agent.provider_sort:
-                provider_preferences["sort"] = agent.provider_sort
+            _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
+            if _provider_sort:
+                provider_preferences["sort"] = _provider_sort
            if provider_preferences and (
                (agent.provider or "").strip().lower() == "openrouter"
                or agent._is_openrouter_url()
@@ -1838,7 +1912,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
        stream_kwargs = {
            **api_kwargs,
            "stream": True,
-            "stream_options": {"include_usage": True},
            "timeout": _httpx.Timeout(
                connect=_conn_cap,
                read=_stream_read_timeout,
@@ -1846,6 +1919,14 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pool=_conn_cap,
            ),
        }
+        # OpenAI's `stream_options={"include_usage": True}` drives usage
+        # accounting on OpenAI-compatible endpoints (incl. the Gemini OpenAI
+        # compat shim and aggregators like OpenRouter).  Google's *native*
+        # Gemini REST endpoint rejects the keyword outright
+        # (`Completions.create() got an unexpected keyword argument
+        # 'stream_options'`), so omit it only for that endpoint.
+        if not is_native_gemini_base_url(agent.base_url):
+            stream_kwargs["stream_options"] = {"include_usage": True}
        request_client = _set_request_client(
            agent._create_request_openai_client(
                reason="chat_completion_stream_request",
@@ -2246,7 +2327,15 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                                _fire_first_delta()
                                agent._fire_reasoning_delta(thinking_text)

-            # Return the native Anthropic Message for downstream processing
+            # Return the native Anthropic Message for downstream processing.
+            # If the stream was interrupted (the event loop broke out above on
+            # agent._interrupt_requested), do NOT call get_final_message() — on
+            # a partially-consumed stream the SDK may hang draining remaining
+            # events or return a Message with incomplete tool_use blocks (partial
+            # JSON in `input`). The outer poll loop raises InterruptedError, so
+            # this return value is discarded anyway.
+            if agent._interrupt_requested:
+                return None
            return stream.get_final_message()

    def _call():
@@ -2391,12 +2480,19 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            diag=request_client_holder.get("diag"),
                        )
                        _close_request_client_once("stream_mid_tool_retry_cleanup")
-                        try:
-                            agent._replace_primary_openai_client(
-                                reason="stream_mid_tool_retry_pool_cleanup"
-                            )
-                        except Exception:
-                            pass
+                        if agent.api_mode == "anthropic_messages":
+                            try:
+                                agent._anthropic_client.close()
+                                agent._rebuild_anthropic_client()
+                            except Exception:
+                                pass
+                        else:
+                            try:
+                                agent._replace_primary_openai_client(
+                                    reason="stream_mid_tool_retry_pool_cleanup"
+                                )
+                            except Exception:
+                                pass
                        continue

                    # SSE error events from proxies (e.g. OpenRouter sends
@@ -2444,12 +2540,19 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            _close_request_client_once("stream_retry_cleanup")
                            # Also rebuild the primary client to purge
                            # any dead connections from the pool.
-                            try:
-                                agent._replace_primary_openai_client(
-                                    reason="stream_retry_pool_cleanup"
-                                )
-                            except Exception:
-                                pass
+                            if agent.api_mode == "anthropic_messages":
+                                try:
+                                    agent._anthropic_client.close()
+                                    agent._rebuild_anthropic_client()
+                                except Exception:
+                                    pass
+                            else:
+                                try:
+                                    agent._replace_primary_openai_client(
+                                        reason="stream_retry_pool_cleanup"
+                                    )
+                                except Exception:
+                                    pass
                            continue
                        # Retries exhausted. Log the final failure with
                        # full diagnostic detail (chain, headers,
@@ -2561,6 +2664,17 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
            _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0)
        else:
            _stream_stale_timeout = _stream_stale_timeout_base
+        # Reasoning-model floor: known reasoning models (Nemotron 3 Ultra,
+        # OpenAI o1/o3, Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ,
+        # xAI Grok reasoning, etc.) routinely exceed the default 180s chat-
+        # model threshold during their thinking phase.  The cloud gateway
+        # upstream kills the socket first, surfacing as BrokenPipeError.
+        # Raises the floor only — never overrides explicit user config
+        # (handled by get_provider_stale_timeout above).
+        from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
+        _reasoning_floor = get_reasoning_stale_timeout_floor(api_kwargs.get("model"))
+        if _reasoning_floor is not None:
+            _stream_stale_timeout = max(_stream_stale_timeout, _reasoning_floor)

    t = threading.Thread(target=_call, daemon=True)
    t.start()
@@ -2609,10 +2723,17 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pass
            # Rebuild the primary client too — its connection pool
            # may hold dead sockets from the same provider outage.
-            try:
-                agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
-            except Exception:
-                pass
+            if agent.api_mode == "anthropic_messages":
+                try:
+                    agent._anthropic_client.close()
+                    agent._rebuild_anthropic_client()
+                except Exception:
+                    pass
+            else:
+                try:
+                    agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
+                except Exception:
+                    pass
            # Reset the timer so we don't kill repeatedly while
            # the inner thread processes the closure.
            last_chunk_time["t"] = time.time()
@@ -2688,7 +2809,30 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                role="assistant", content=_partial_text, tool_calls=None,
                reasoning_content=None,
            )
-            return SimpleNamespace(
+            # Detect provider output-layer content filtering (e.g. MiniMax
+            # "output new_sensitive (1027)", Azure/OpenAI content_filter,
+            # Anthropic safety refusal).  The raw error is about to be
+            # swallowed into a finish_reason=length stub, so classify it HERE
+            # while we still have it and stamp the stub.  Retrying such a
+            # content-deterministic filter on the same primary just re-hits
+            # the filter — the conversation loop reads this tag and activates
+            # the fallback chain instead of burning continuation retries.
+            # error_classifier is the single source of truth for "what counts
+            # as a content filter" (#32421).
+            _content_filter_terminated = False
+            try:
+                from agent.error_classifier import classify_api_error, FailoverReason
+                _cls = classify_api_error(
+                    result["error"],
+                    provider=str(getattr(agent, "provider", "") or ""),
+                    model=str(getattr(agent, "model", "") or ""),
+                )
+                _content_filter_terminated = (
+                    _cls.reason == FailoverReason.content_policy_blocked
+                )
+            except Exception:
+                _content_filter_terminated = False
+            _stub = SimpleNamespace(
                id=PARTIAL_STREAM_STUB_ID,
                model=getattr(agent, "model", "unknown"),
                choices=[SimpleNamespace(
@@ -2697,6 +2841,9 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                usage=None,
                _dropped_tool_names=_partial_names or None,
            )
+            if _content_filter_terminated:
+                _stub._content_filter_terminated = True
+            return _stub
        raise result["error"]
    return result["response"]

--- a/agent/coding_context.py
+++ b/agent/coding_context.py
@@ -60,6 +60,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional

+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
+
 logger = logging.getLogger("hermes.coding_context")

 CODING_TOOLSET = "coding"
@@ -83,6 +85,59 @@ _PROJECT_MARKERS = (
 # Agent-instruction files surfaced separately from manifests in the snapshot.
 _CONTEXT_FILES = ("AGENTS.md", "CLAUDE.md", ".cursorrules")

+# Source-file extensions (lowercase; suffixes are lower-cased before lookup) that
+# make a git repo a *code* workspace even with no manifest. Without this, `git init`
+# on a notes/writing/research folder would flip the whole session into the coding
+# posture just for having a `.git`. A manifest still wins (see `_PROJECT_MARKERS`).
+_CODE_EXTENSIONS = frozenset({
+    ".py", ".pyi", ".ipynb", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
+    ".go", ".rs", ".java", ".kt", ".kts", ".scala", ".rb", ".php", ".c", ".h",
+    ".cc", ".cpp", ".hpp", ".cs", ".swift", ".m", ".mm", ".dart", ".ex", ".exs",
+    ".lua", ".sh", ".bash", ".zsh", ".sql", ".vue", ".svelte", ".r", ".jl",
+    ".hs", ".clj", ".erl", ".pl",
+})
+
+# Dirs the code check never descends into (deps/build/vcs/venv noise); dot-dirs are skipped too.
+_CODE_SCAN_SKIP_DIRS = frozenset({
+    ".git", "node_modules", "venv", ".venv", "__pycache__", "dist", "build",
+    "target", ".next", ".turbo", "vendor",
+})
+
+# Sweep budget: code shows up within the first entries; past this many, report "no code".
+_CODE_SCAN_MAX_ENTRIES = 500
+
+
+def _has_code_files(root: Path) -> bool:
+    """Report whether ``root`` or one of its direct subdirectories holds source code.
+
+    Lets a manifest-less repo of loose scripts count as a code workspace while
+    a notes/writing repo does not. Lists two directory levels at most and gives
+    up (``False``) once more than ``_CODE_SCAN_MAX_ENTRIES`` entries were seen.
+    """
+    examined = 0
+    pending = [(root, True)]  # (directory, descend into its subdirectories?)
+    while pending:
+        current, descend = pending.pop()
+        try:
+            with os.scandir(current) as listing:
+                for item in listing:
+                    examined += 1
+                    if examined > _CODE_SCAN_MAX_ENTRIES:
+                        return False
+                    try:
+                        if item.is_file():
+                            suffix = os.path.splitext(item.name)[1].lower()
+                            if suffix in _CODE_EXTENSIONS:
+                                return True
+                        elif descend and item.is_dir():
+                            if item.name not in _CODE_SCAN_SKIP_DIRS and not item.name.startswith("."):
+                                pending.append((Path(item.path), False))
+                    except OSError:
+                        continue
+        except OSError:
+            continue
+    return False
+
 # Lockfile → package manager, checked in priority order.
 _PY_LOCKFILES = (("uv.lock", "uv"), ("poetry.lock", "poetry"), ("Pipfile.lock", "pipenv"))
 _JS_LOCKFILES = (
@@ -298,6 +353,29 @@ def _coding_mode(config: Optional[dict[str, Any]]) -> str:
    return "auto"


+def _coding_instructions(config: Optional[dict[str, Any]]) -> str:
+    """Return the operator's standing coding instructions from config.
+
+    Reads ``agent.coding_instructions`` — a string, or a list/tuple of strings
+    joined with newlines — which is appended to the coding brief as an extra
+    stable system block (e.g. "don't run tsc/lint until I approve"). When
+    ``config`` is ``None`` it is loaded via ``hermes_cli.config.load_config``;
+    a load failure, a non-dict ``agent`` section and ``None``/blank list items
+    all degrade to "no instructions" instead of raising or emitting "None".
+    """
+    if config is None:
+        try:
+            from hermes_cli.config import load_config
+            config = load_config()
+        except Exception:
+            config = {}
+    agent_cfg = (config or {}).get("agent")
+    raw = agent_cfg.get("coding_instructions", "") if isinstance(agent_cfg, dict) else ""
+    if isinstance(raw, (list, tuple)):
+        return "\n".join(str(item).strip() for item in raw if item is not None and str(item).strip())
+    return str(raw or "").strip()
+
+
 def _resolve_cwd(cwd: Optional[str | Path]) -> Path:
    if cwd:
        return Path(cwd).expanduser()
@@ -368,10 +446,16 @@ def _detect_profile_name(mode: str, platform: str, cwd_str: str) -> str:
    if platform and platform.strip().lower() not in INTERACTIVE_CODING_PLATFORMS:
        return GENERAL_PROFILE.name
    cwd = Path(cwd_str)
+    # A recognized project root (manifest / AGENTS.md / .cursorrules) is a code
+    # workspace on its own — cheap stat checks, no scan.
+    if _marker_root(cwd) is not None:
+        return CODING_PROFILE.name
    git_root = _git_root(cwd)
    if git_root is not None and git_root == _home():
        git_root = None  # dotfiles repo at $HOME — not a code workspace
-    if git_root is not None or _marker_root(cwd) is not None:
+    # A bare git repo only counts when it actually holds code, so `git init` on a
+    # notes/writing/research folder stays in the general posture.
+    if git_root is not None and _has_code_files(git_root):
        return CODING_PROFILE.name
    return GENERAL_PROFILE.name

@@ -398,6 +482,9 @@ class RuntimeMode:
    # only to steer edit-format guidance toward the model's family — see
    # ``_edit_format_line``. Fixed for the session, so cache-safe.
    model: Optional[str] = None
+    # Standing operator instructions (``agent.coding_instructions``), appended
+    # as an extra stable system block. Empty unless the user configures it.
+    instructions: str = ""

    @property
    def kind(self) -> str:
@@ -444,6 +531,10 @@ class RuntimeMode:
        workspace = build_coding_workspace_block(self.cwd)
        if workspace:
            blocks.append(workspace)
+        # Operator instructions ride their own block so the brief (block 0) stays
+        # byte-stable and cache-keyed independently of user config.
+        if self.instructions:
+            blocks.append(f"Operator instructions (from config):\n{self.instructions}")
        return blocks

    def compact_skill_categories(self) -> frozenset[str]:
@@ -496,6 +587,7 @@ def resolve_runtime_mode(
        cwd=resolved_cwd,
        config_mode=mode,
        model=model,
+        instructions=_coding_instructions(config),
    )


@@ -588,12 +680,14 @@ def _enabled_mcp_servers(config: Optional[dict[str, Any]]) -> list[str]:


 def _git(cwd: Path, *args: str) -> str:
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        out = subprocess.run(
            ["git", "-C", str(cwd), *args],
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT,
+            **_popen_kwargs,
        )
    except (OSError, subprocess.SubprocessError):
        return ""
--- a/agent/context_breakdown.py
+++ b/agent/context_breakdown.py
@@ -0,0 +1,156 @@
+"""Live session context-window breakdown for UI surfaces.
+
+Estimates how the next provider request is composed: system prompt tiers,
+tool schemas, and conversation history. Uses the same rough char/4 heuristic
+as ``agent.model_metadata.estimate_request_tokens_rough`` so numbers align
+with compression thresholds — not exact tokenizer counts.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+_SKILLS_BLOCK_RE = re.compile(r"<available_skills>.*?</available_skills>", re.DOTALL)
+
+_SUBAGENT_TOOL_NAMES = frozenset({"delegate_task"})  # bucketed as subagent definitions, not built-ins
+
+_CATEGORY_COLORS = {  # category id -> CSS custom property returned as that category's "color"
+    "system_prompt": "var(--context-usage-system)",
+    "tool_definitions": "var(--context-usage-tools)",
+    "rules": "var(--context-usage-rules)",
+    "skills": "var(--context-usage-skills)",
+    "mcp": "var(--context-usage-mcp)",
+    "subagent_definitions": "var(--context-usage-subagents)",
+    "memory": "var(--context-usage-memory)",
+    "conversation": "var(--context-usage-conversation)",
+}
+
+
+def _chars_to_tokens(text: str) -> int:
+    """Rough token estimate: the character count divided by 4, rounded up (0 if empty)."""
+    char_count = len(text) if text else 0
+    return -(-char_count // 4)
+
+
+def _json_tokens(value: Any) -> int:
+    """Rough token estimate for ``value`` serialized as JSON (0 when ``value`` is falsy)."""
+    serialized = json.dumps(value, ensure_ascii=False) if value else ""
+    return _chars_to_tokens(serialized)
+
+
+def _tool_name(tool: dict) -> str:
+    """Return ``function.name`` if ``function`` is a dict, else top-level ``name``; "" for non-dicts."""
+    fn = tool.get("function") if isinstance(tool, dict) else None
+    holder = fn if isinstance(fn, dict) else tool if isinstance(tool, dict) else {}
+    return str(holder.get("name") or "")
+
+
+def _split_tools(tools: Sequence[dict]) -> Tuple[List[dict], List[dict], List[dict]]:
+    """Bucket tool schemas into ``(builtin, mcp, subagent)`` lists, preserving input order."""
+    buckets: Dict[str, List[dict]] = {"builtin": [], "mcp": [], "subagent": []}
+    for schema in tools:
+        tool_name = _tool_name(schema)
+        if tool_name.startswith("mcp_"):
+            kind = "mcp"
+        elif tool_name in _SUBAGENT_TOOL_NAMES:
+            kind = "subagent"
+        else:
+            kind = "builtin"
+        buckets[kind].append(schema)
+    return buckets["builtin"], buckets["mcp"], buckets["subagent"]
+
+
+def _memory_blocks(agent: Any) -> Tuple[str, str]:
+    """Return the ``(memory, user)`` prompt blocks from the agent's memory store, best effort."""
+    rendered = {"memory": "", "user": ""}
+    store = getattr(agent, "_memory_store", None)
+    if store is not None:
+        try:
+            # "memory" is rendered first, so a failure while rendering "user"
+            # still keeps the memory block; any error just ends the rendering.
+            for target, flag in (("memory", "_memory_enabled"), ("user", "_user_profile_enabled")):
+                if getattr(agent, flag, True):
+                    rendered[target] = store.format_for_system_prompt(target) or ""
+        except Exception:
+            pass
+    return rendered["memory"], rendered["user"]
+
+
+def _strip_blocks(text: str, *blocks: str) -> str:
+    """Remove every occurrence of each non-empty block from ``text``, then trim whitespace."""
+    remaining = text
+    for fragment in filter(None, blocks):
+        remaining = remaining.replace(fragment, "")
+    return remaining.strip()
+
+
+def compute_session_context_breakdown(
+    agent: Any,
+    messages: Optional[List[dict]] = None,
+) -> Dict[str, Any]:
+    """Estimate how one live agent's next request fills its context window.
+
+    Returns a dict: ``categories`` (rows with a positive token estimate) plus
+    ``context_max``, ``context_percent``, ``context_used``, ``estimated_total``, ``model``.
+    """
+    from agent.model_metadata import estimate_messages_tokens_rough
+    from agent.system_prompt import build_system_prompt_parts
+
+    prompt_parts = build_system_prompt_parts(agent)
+    stable, context, volatile = (
+        prompt_parts.get(tier, "") or "" for tier in ("stable", "context", "volatile")
+    )
+
+    # The skills index and memory blocks get rows of their own, so they are cut
+    # out of the stable/volatile text that forms the "System prompt" row.
+    found = _SKILLS_BLOCK_RE.search(stable)
+    skills_index = found.group(0) if found else ""
+    memory_block, user_block = _memory_blocks(agent)
+    memory_text = "\n\n".join(filter(None, (memory_block, user_block))).strip()
+    system_core = _strip_blocks(stable, skills_index)
+    system_tail = _strip_blocks(volatile, memory_block, user_block)
+    system_prompt_text = "\n\n".join(filter(None, (system_core, system_tail))).strip()
+
+    all_tools = list(getattr(agent, "tools", None) or [])
+    builtin_tools, mcp_tools, subagent_tools = _split_tools(all_tools)
+    conversation_tokens = estimate_messages_tokens_rough(messages or [])
+    # (category id, display label, estimated tokens), in output order.
+    rows = [
+        ("system_prompt", "System prompt", _chars_to_tokens(system_prompt_text)),
+        ("tool_definitions", "Tool definitions", _json_tokens(builtin_tools)),
+        ("rules", "Rules", _chars_to_tokens(context)),
+        ("skills", "Skills", _chars_to_tokens(skills_index)),
+        ("mcp", "MCP", _json_tokens(mcp_tools)),
+        ("subagent_definitions", "Subagent definitions", _json_tokens(subagent_tools)),
+        ("memory", "Memory", _chars_to_tokens(memory_text)),
+        ("conversation", "Conversation", conversation_tokens),
+    ]
+    estimated_total = sum(row[2] for row in rows)
+    # Use the compressor's ``last_prompt_tokens`` when positive, else the estimate.
+    compressor = getattr(agent, "context_compressor", None)
+    context_max = measured_used = 0
+    if compressor:
+        context_max = int(getattr(compressor, "context_length", 0) or 0)
+        measured_used = int(getattr(compressor, "last_prompt_tokens", 0) or 0)
+    context_used = measured_used if measured_used > 0 else estimated_total
+    ratio = round(context_used / context_max * 100) if context_max else 0
+    context_percent = max(0, min(100, ratio))
+    visible_rows = []
+    for category_id, label, tokens in rows:
+        if tokens > 0:
+            visible_rows.append({
+                "color": _CATEGORY_COLORS.get(category_id, "var(--ui-text-tertiary)"),
+                "id": category_id,
+                "label": label,
+                "tokens": tokens,
+            })
+    return {
+        "categories": visible_rows,
+        "context_max": context_max,
+        "context_percent": context_percent,
+        "context_used": context_used,
+        "estimated_total": estimated_total,
+        "model": getattr(agent, "model", "") or "",
+    }
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -890,7 +890,16 @@ class ContextCompressor(ContextEngine):
        # This is independent of the abort_on_summary_failure config flag:
        # rotating on a broken credential is never the right behavior.
        self._last_summary_auth_failure: bool = False
+        # Set when summary generation ultimately fails due to a transient
+        # network/connection error (httpx/httpcore connection drop, premature
+        # stream close, etc.) — distinct from auth failures but treated the
+        # same way by compress(): ABORT and preserve the session unchanged
+        # rather than destroy the middle window for a deterministic
+        # "summary unavailable" marker. Retrying once the network recovers is
+        # strictly better than discarding context for a transient blip
+        # (#29559, #25585). Independent of abort_on_summary_failure.
+        self._last_summary_network_failure: bool = False
        # When a user-configured summary model fails and we recover by
        # retrying on the main model, record the failure so gateway /
        # CLI callers can still warn the user even though compression
        # succeeded.  Silent recovery would hide the broken config.
@@ -1687,6 +1695,7 @@ This compaction should PRIORITISE preserving all information related to the focu
            self._summary_model_fallen_back = False
            self._last_summary_error = None
            self._last_summary_auth_failure = False
+            self._last_summary_network_failure = False
            return self._with_summary_prefix(summary)
        except Exception as e:
            # ``call_llm`` raises ``RuntimeError`` for two very different cases:
@@ -1819,6 +1828,15 @@ This compaction should PRIORITISE preserving all information related to the focu
            if len(err_text) > 220:
                err_text = err_text[:217].rstrip() + "..."
            self._last_summary_error = err_text
+            # A terminal connection/network failure (we reach this branch only
+            # after any main-model fallback has already been tried or is
+            # unavailable). Flag it so compress() ABORTS and preserves the
+            # session unchanged instead of destroying the middle window for a
+            # placeholder marker — retrying once the network recovers is
+            # strictly better than dropping context (#29559, #25585). Mirrors
+            # the auth-failure carve-out; independent of abort_on_summary_failure.
+            if _is_streaming_closed:
+                self._last_summary_network_failure = True
            logger.warning(
                "Failed to generate context summary: %s. "
                "Further summary attempts paused for %d seconds.",
@@ -2382,6 +2400,7 @@ This compaction should PRIORITISE preserving all information related to the focu
        self._last_aux_model_failure_model = None
        self._last_compress_aborted = False
        self._last_summary_auth_failure = False
+        self._last_summary_network_failure = False

        # Manual /compress (force=True) bypasses the failure cooldown so the
        # user can retry immediately after an auto-compress abort.  Without
@@ -2498,15 +2517,21 @@ This compaction should PRIORITISE preserving all information related to the focu
        #           surface a warning.
        # Default is False (historical behavior).
        #
-        # EXCEPTION — auth failures always abort. A 401/403 from the summary
-        # call means the credential or endpoint is broken (invalid/blocked
-        # key, or a token pointed at the wrong inference host). Rotating into
+        # EXCEPTION — auth AND transient network failures always abort. A
+        # 401/403 from the summary call means the credential or endpoint is
+        # broken (invalid/blocked key, or a token pointed at the wrong
+        # inference host). A connection/stream-close error means the network
+        # blipped at the compaction moment (#29559). In BOTH cases rotating into
        # a child session with a placeholder summary on a broken credential
        # strands the user on a degraded session for zero benefit — every
        # subsequent call fails the same way. So when the failure was an auth
        # error we abort regardless of abort_on_summary_failure, preserving
        # the conversation unchanged until the credential is fixed.
-        if not summary and (self.abort_on_summary_failure or self._last_summary_auth_failure):
+        if not summary and (
+            self.abort_on_summary_failure
+            or self._last_summary_auth_failure
+            or self._last_summary_network_failure
+        ):
            n_skipped = compress_end - compress_start
            self._last_summary_dropped_count = 0  # nothing actually dropped
            self._last_summary_fallback_used = False
@@ -2521,6 +2546,15 @@ This compaction should PRIORITISE preserving all information related to the focu
                        "with /compress or start fresh with /new.",
                        n_skipped,
                    )
+                elif self._last_summary_network_failure:
+                    logger.warning(
+                        "Summary generation failed with a network/connection "
+                        "error — aborting compression. %d message(s) preserved "
+                        "unchanged; the session was NOT rotated. This is "
+                        "transient: retry with /compress once connectivity "
+                        "recovers, or continue the conversation as-is.",
+                        n_skipped,
+                    )
                else:
                    logger.warning(
                        "Summary generation failed — aborting compression "
--- a/agent/context_references.py
+++ b/agent/context_references.py
@@ -12,6 +12,7 @@ from pathlib import Path
 from typing import Awaitable, Callable

 from agent.model_metadata import estimate_tokens_rough
+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags

 _QUOTED_REFERENCE_VALUE = r'(?:`[^`\n]+`|"[^"\n]+"|\'[^\'\n]+\')'
 REFERENCE_PATTERN = re.compile(
@@ -290,6 +291,7 @@ def _expand_git_reference(
    args: list[str],
    label: str,
 ) -> tuple[str | None, str | None]:
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["git", *args],
@@ -298,6 +300,7 @@ def _expand_git_reference(
            text=True,
            timeout=30,
            stdin=subprocess.DEVNULL,
+            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"{ref.raw}: git command timed out (30s)", None
@@ -325,9 +328,9 @@ async def _fetch_url_content(
 async def _default_url_fetcher(url: str) -> str:
    from tools.web_tools import web_extract_tool

-    raw = await web_extract_tool([url], format="markdown", use_llm_processing=True)
+    raw = await web_extract_tool([url], format="markdown")
    payload = json.loads(raw)
-    docs = payload.get("data", {}).get("documents", [])
+    docs = payload.get("results", [])
    if not docs:
        return ""
    doc = docs[0]
@@ -483,6 +486,7 @@ def _iter_visible_entries(path: Path, cwd: Path, limit: int) -> list[Path]:


 def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["rg", "--files", str(path.relative_to(cwd))],
@@ -491,6 +495,7 @@ def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
            text=True,
            timeout=10,
            stdin=subprocess.DEVNULL,
+            **_popen_kwargs,
        )
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return None
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -90,6 +90,7 @@ def check_compression_model_feasibility(agent: Any) -> None:
    try:
        from agent.auxiliary_client import (
            _resolve_task_provider_model,
+            _try_configured_fallback_for_unavailable_client,
            get_text_auxiliary_client,
        )
        from agent.model_metadata import (
@@ -97,10 +98,6 @@ def check_compression_model_feasibility(agent: Any) -> None:
            get_model_context_length,
        )

-        client, aux_model = get_text_auxiliary_client(
-            "compression",
-            main_runtime=agent._current_main_runtime(),
-        )
        # Best-effort aux provider label for the warning message. The
        # configured provider may be "auto", in which case we fall back
        # to the client's base_url hostname so the user can still tell
@@ -109,6 +106,19 @@ def check_compression_model_feasibility(agent: Any) -> None:
            _aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
        except Exception:
            _aux_cfg_provider = ""
+        client, aux_model = get_text_auxiliary_client(
+            "compression",
+            main_runtime=agent._current_main_runtime(),
+        )
+        if client is None or not aux_model:
+            fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
+                "compression",
+                _aux_cfg_provider,
+            )
+            if fb_client is not None and fb_model:
+                client, aux_model = fb_client, fb_model
+                if "(" in fb_label and fb_label.endswith(")"):
+                    _aux_cfg_provider = fb_label.rsplit("(", 1)[1][:-1]
        if client is None or not aux_model:
            if _aux_cfg_provider and _aux_cfg_provider != "auto":
                msg = (
@@ -278,6 +288,29 @@ def replay_compression_warning(agent: Any) -> None:
            pass


+def conversation_history_after_compression(agent: Any, messages: list) -> Optional[list]:
+    """Pick the ``conversation_history`` baseline to use once compression has run.
+
+    Rotation (legacy) compression starts a fresh child session that has never
+    been flushed, so the baseline is ``None`` and the next persistence call
+    writes the entire compacted list.
+
+    In-place compaction (``agent._last_compaction_in_place`` truthy) has already
+    stored ``messages`` as the active transcript under the same session id.
+    Continuing the turn with a ``None`` baseline would make the identity-based
+    flush treat those dicts as new and append them again, doubling the active
+    context and retriggering compression.
+
+    Returns:
+        A shallow copy of ``messages`` for in-place compaction — it pins the
+        current dict identities as history while later same-turn appends stay
+        new — otherwise ``None``.
+    """
+    in_place = getattr(agent, "_last_compaction_in_place", False)
+    baseline = list(messages) if in_place else None
+    return baseline
+
+
 def compress_context(
    agent: Any,
    messages: list,
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -28,6 +28,7 @@ import uuid
 from typing import Any, Dict, List, Optional

 from agent.codex_responses_adapter import _summarize_user_message_for_log
+from agent.conversation_compression import conversation_history_after_compression
 from agent.display import KawaiiSpinner
 from agent.error_classifier import FailoverReason, classify_api_error
 from agent.iteration_budget import IterationBudget
@@ -35,6 +36,7 @@ from agent.turn_context import build_turn_context
 from agent.turn_retry_state import TurnRetryState
 from agent.memory_manager import build_memory_context_block
 from agent.message_sanitization import (
+    close_interrupted_tool_sequence,
    _repair_tool_call_arguments,
    _sanitize_messages_non_ascii,
    _sanitize_messages_surrogates,
@@ -55,7 +57,7 @@ from agent.model_metadata import (
 )
 from agent.process_bootstrap import _install_safe_stdio
 from agent.prompt_caching import apply_anthropic_cache_control
-from agent.retry_utils import jittered_backoff
+from agent.retry_utils import adaptive_rate_limit_backoff, jittered_backoff
 from agent.trajectory import has_incomplete_scratchpad
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
 from hermes_constants import PARTIAL_STREAM_STUB_ID
@@ -501,6 +503,7 @@ def run_conversation(
    stream_callback: Optional[callable] = None,
    persist_user_message: Optional[str] = None,
    persist_user_timestamp: Optional[float] = None,
+    moa_config: Optional[dict[str, Any]] = None,
 ) -> Dict[str, Any]:
    """
    Run a complete conversation with tool calling until completion.
@@ -523,6 +526,19 @@ def run_conversation(
    Returns:
        Dict: Complete conversation result with final response and message history
    """
+    if moa_config is None:
+        try:
+            from hermes_cli.moa_config import decode_moa_turn
+
+            _decoded_message, _decoded_moa_config = decode_moa_turn(user_message)
+            if _decoded_moa_config is not None:
+                user_message = _decoded_message
+                moa_config = _decoded_moa_config
+                if persist_user_message is None:
+                    persist_user_message = _decoded_message
+        except Exception:
+            pass
+
    # ── Per-turn setup (the prologue) ──
    # All once-per-turn setup — stdio guarding, retry-counter resets, user
    # message sanitization, todo/nudge hydration, system-prompt restore-or-
@@ -572,6 +588,13 @@ def run_conversation(
    compression_attempts = 0
    _turn_exit_reason = "unknown"  # Diagnostic: why the loop ended

+    # Per-turn tally of consecutive successful credential-pool token refreshes,
+    # keyed by (provider, pool-entry-id). A persistent upstream 401 lets
+    # ``try_refresh_current()`` "succeed" forever on a single-entry OAuth pool,
+    # so this tally caps same-entry refreshes and lets the fallback chain take
+    # over instead of spinning. Reset here so each turn starts fresh. See #26080.
+    agent._auth_pool_refresh_counts = {}
+
    # Optional opt-in runtime: if api_mode == codex_app_server, hand the
    # turn to the codex app-server subprocess (terminal/file ops/patching
    # all run inside Codex). Default Hermes path is bypassed entirely.
@@ -801,6 +824,28 @@ def run_conversation(
        if effective_system:
            api_messages = [{"role": "system", "content": effective_system}] + api_messages

+        if moa_config:
+            try:
+                from agent.moa_loop import aggregate_moa_context
+
+                _moa_context = aggregate_moa_context(
+                    user_prompt=original_user_message if isinstance(original_user_message, str) else str(original_user_message),
+                    api_messages=api_messages,
+                    reference_models=moa_config.get("reference_models") or [],
+                    aggregator=moa_config.get("aggregator") or {},
+                    temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
+                    aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
+                )
+                if _moa_context:
+                    for _msg in reversed(api_messages):
+                        if _msg.get("role") == "user":
+                            _base = _msg.get("content", "")
+                            if isinstance(_base, str):
+                                _msg["content"] = _base + "\n\n" + _moa_context
+                            break
+            except Exception as _moa_exc:
+                logger.warning("MoA context aggregation failed: %s", _moa_exc)
+
        # Inject ephemeral prefill messages right after the system prompt
        # but before conversation history. Same API-call-time-only pattern.
        if agent.prefill_messages:
@@ -1122,7 +1167,7 @@ def run_conversation(
                # stream.  Mirror the ACP exclusion used for Responses
                # API upgrade (lines ~1083-1085).
                elif (
-                    agent.provider == "copilot-acp"
+                    agent.provider in {"copilot-acp", "moa"}
                    or str(agent.base_url or "").lower().startswith("acp://copilot")
                    or str(agent.base_url or "").lower().startswith("acp+tcp://")
                ):
@@ -1396,10 +1441,12 @@ def run_conversation(
                    while time.time() < sleep_end:
                        if agent._interrupt_requested:
                            agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
+                            _interrupt_text = f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries})."
+                            close_interrupted_tool_sequence(messages, _interrupt_text)
                            agent._persist_session(messages, conversation_history)
                            agent.clear_interrupt()
                            return {
-                                "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).",
+                                "final_response": _interrupt_text,
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
@@ -1652,6 +1699,56 @@ def run_conversation(

                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
+                        # ── Content-filter stream stall → fallback (#32421) ──
+                        # When the provider's output-layer safety filter (e.g.
+                        # MiniMax "output new_sensitive (1027)", Azure
+                        # content_filter) kills the stream mid-delivery, the
+                        # raw error was classified at the swallow point and the
+                        # stub tagged ``_content_filter_terminated``.  This
+                        # filter is content-deterministic — continuation
+                        # retries against the SAME primary just re-hit it and
+                        # burn paid attempts (the loop used to give up with
+                        # "Response remained truncated after 3 continuation
+                        # attempts" and never consult the fallback chain).
+                        # Escalate to the configured fallback BEFORE retrying.
+                        _cf_terminated = getattr(
+                            response, "_content_filter_terminated", False
+                        )
+                        if (
+                            _cf_terminated
+                            and agent._fallback_index < len(agent._fallback_chain)
+                        ):
+                            agent._vprint(
+                                f"{agent.log_prefix}🛡️  Content filter terminated "
+                                f"stream — activating fallback provider...",
+                                force=True,
+                            )
+                            agent._emit_status(
+                                "Content filter terminated stream; switching to fallback..."
+                            )
+                            if agent._try_activate_fallback():
+                                # Roll the partial content (if any was already
+                                # appended in a prior continuation pass) back to
+                                # the last clean turn so the fallback provider
+                                # gets a coherent continuation point.
+                                if truncated_response_parts:
+                                    messages = agent._get_messages_up_to_last_assistant(messages)
+                                agent._session_messages = messages
+                                length_continue_retries = 0
+                                truncated_response_parts = []
+                                retry_count = 0
+                                compression_attempts = 0
+                                _retry.primary_recovery_attempted = False
+                                _retry.restart_with_rebuilt_messages = True
+                                break
+                            # No fallback available — fall through to normal
+                            # continuation (best-effort, may loop).
+                            agent._vprint(
+                                f"{agent.log_prefix}⚠️  No fallback provider "
+                                f"configured — retrying with same provider "
+                                f"(may re-hit filter)...",
+                                force=True,
+                            )
                        if assistant_message is not None and not _trunc_has_tool_calls:
                            length_continue_retries += 1
                            interim_msg = agent._build_assistant_message(assistant_message, finish_reason)
@@ -1971,9 +2068,21 @@ def run_conversation(
                    agent.thinking_callback("")
                api_elapsed = time.time() - api_start_time
                agent._vprint(f"{agent.log_prefix}⚡ Interrupted during API call.", force=True)
-                agent._persist_session(messages, conversation_history)
                interrupted = True
-                final_response = f"{INTERRUPT_WAITING_FOR_MODEL_PREFIX}{api_elapsed:.1f}s elapsed)."
+                # Preserve any assistant text already streamed to the user
+                # before the stop landed. Dropping it leaves history with no
+                # record of the half-finished reply on screen, so the next turn
+                # the model "forgets" what it just said — exactly what users hit
+                # when they stop to redirect mid-response.
+                _partial = agent._strip_think_blocks(
+                    getattr(agent, "_current_streamed_assistant_text", "") or ""
+                ).strip()
+                if _partial:
+                    messages.append({"role": "assistant", "content": _partial})
+                    final_response = _partial
+                else:
+                    final_response = f"{INTERRUPT_WAITING_FOR_MODEL_PREFIX}{api_elapsed:.1f}s elapsed)."
+                agent._persist_session(messages, conversation_history)
                break

            except Exception as api_error:
@@ -2207,6 +2316,15 @@ def run_conversation(
                    # "unknown variant `image_url`, expected `text`".
                    "unknown variant `image_url`, expected `text`",
                    "unknown variant image_url, expected text",
+                    # OpenRouter routes a request to upstream endpoints and,
+                    # when none of the candidate endpoints for the model accept
+                    # image input, returns HTTP 404 "No endpoints found that
+                    # support image input". Without this phrase the agent never
+                    # strips the images, the retry loop re-sends the same
+                    # rejected request until exhaustion, and the gateway leaves
+                    # every subsequent message queued behind the stuck turn —
+                    # the P1 in issue #21160. The 404 passes the 4xx gate below.
+                    "no endpoints found that support image input",
                )
                _err_lower = _err_body.lower()
                _looks_like_image_rejection = any(
@@ -2663,10 +2781,12 @@ def run_conversation(
                # Check for interrupt before deciding to retry
                if agent._interrupt_requested:
                    agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
+                    _interrupt_text = f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))})."
+                    close_interrupted_tool_sequence(messages, _interrupt_text)
                    agent._persist_session(messages, conversation_history)
                    agent.clear_interrupt()
                    return {
-                        "final_response": f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))}).",
+                        "final_response": _interrupt_text,
                        "messages": messages,
                        "api_calls": api_call_count,
                        "completed": False,
@@ -2776,10 +2896,9 @@ def run_conversation(
                            approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )
-                        # Compression created a new session — clear history
-                        # so _flush_messages_to_session_db writes compressed
-                        # messages to the new session, not skipping them.
-                        conversation_history = None
+                        conversation_history = conversation_history_after_compression(
+                            agent, messages
+                        )
                        if len(messages) < original_len or old_ctx > _reduced_ctx:
                            agent._buffer_status(
                                f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
@@ -2791,26 +2910,43 @@ def run_conversation(
                    # Fall through to normal error handling if compression
                    # is exhausted or didn't help.

-                # Eager fallback for rate-limit errors (429 or quota exhaustion).
-                # When a fallback model is configured, switch immediately instead
-                # of burning through retries with exponential backoff -- the
-                # primary provider won't recover within the retry window.
+                # Eager fallback for rate-limit errors (429 or quota exhaustion)
+                # and transport errors (connection failure / timeout / provider
+                # overloaded).  Rate limits and billing: switch immediately —
+                # the primary provider won't recover within the retry window.
+                # Transport errors: allow 1 retry first (transient hiccups
+                # recover), then fall back if the provider is truly unreachable.
                is_rate_limited = classified.reason in {
                    FailoverReason.rate_limit,
                    FailoverReason.billing,
                }
-                if is_rate_limited and agent._fallback_index < len(agent._fallback_chain):
+                _is_transport_failure = classified.reason in {
+                    FailoverReason.timeout,
+                    FailoverReason.overloaded,
+                }
+                _should_fallback = (
+                    is_rate_limited
+                    or (_is_transport_failure and retry_count >= 2)
+                )
+                if _should_fallback and agent._fallback_index < len(agent._fallback_chain):
                    # Don't eagerly fallback if credential pool rotation may
                    # still recover.  See _pool_may_recover_from_rate_limit
-                    # for the single-credential-pool exception.  Fixes #11314.
+                    # for the single-credential-pool and CloudCode-quota
+                    # exceptions.  Fixes #11314 and #13636.
                    pool_may_recover = _ra()._pool_may_recover_from_rate_limit(
                        agent._credential_pool,
+                        provider=agent.provider,
+                        base_url=getattr(agent, "base_url", None),
                    )
                    if not pool_may_recover:
                        if classified.reason == FailoverReason.billing:
                            agent._buffer_status(
                                "⚠️ Billing or credits exhausted — switching to fallback provider..."
                            )
+                        elif _is_transport_failure:
+                            agent._buffer_status(
+                                "⚠️ Provider unreachable — switching to fallback provider..."
+                            )
                        else:
                            agent._buffer_status("⚠️ Rate limited — switching to fallback provider...")
                        if agent._try_activate_fallback(reason=classified.reason):
@@ -2985,10 +3121,9 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    # Compression created a new session — clear history
-                    # so _flush_messages_to_session_db writes compressed
-                    # messages to the new session, not skipping them.
-                    conversation_history = None
+                    conversation_history = conversation_history_after_compression(
+                        agent, messages
+                    )

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3152,10 +3287,9 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    # Compression created a new session — clear history
-                    # so _flush_messages_to_session_db writes compressed
-                    # messages to the new session, not skipping them.
-                    conversation_history = None
+                    conversation_history = conversation_history_after_compression(
+                        agent, messages
+                    )

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3417,6 +3551,13 @@ def run_conversation(
                    ):
                        _retry.primary_recovery_attempted = True
                        retry_count = 0
+                        # Primary transport recovery starts a fresh attempt
+                        # cycle. Re-open fallback state so a follow-on 429 can
+                        # still activate fallback_providers after stale
+                        # pre-recovery fallback/credential-pool bookkeeping.
+                        _retry.has_retried_429 = False
+                        agent._fallback_index = 0
+                        agent._fallback_activated = False
                        continue
                    # Try fallback before giving up entirely
                    if agent._has_pending_fallback():
@@ -3482,6 +3623,65 @@ def run_conversation(
                            force=True,
                        )

+                    # Detect thinking-timeout pattern: a known reasoning model
+                    # hit a transport-layer error before the first content
+                    # token arrived.  Distinct from _is_stream_drop above
+                    # (which fires for large file-write stream drops) and
+                    # from any classifier reason that's not a transport
+                    # timeout.  Reuses the reasoning-model allowlist from
+                    # agent/reasoning_timeouts.py (Fixes #52217) so the
+                    # trigger is consistent with what the per-model
+                    # stale-timeout floor covers.  After the classifier
+                    # override at agent/error_classifier.py:720-738 (this
+                    # PR), transport disconnects on reasoning models route
+                    # to FailoverReason.timeout rather than
+                    # context_overflow, so this branch actually fires.
+                    # Detection and message text live in
+                    # agent.thinking_timeout_guidance so they're
+                    # unit-testable without driving the full retry loop.
+                    # (Part 2 of Fixes #52310.)
+                    from agent.thinking_timeout_guidance import (
+                        is_thinking_timeout,
+                    )
+                    _is_thinking_timeout = is_thinking_timeout(
+                        classified,
+                        _model,
+                        error_msg,
+                    )
+                    if _is_thinking_timeout:
+                        agent._vprint(
+                            f"{agent.log_prefix}   💡 The model's thinking "
+                            f"phase exceeded the upstream proxy's idle "
+                            f"timeout before the first content token "
+                            f"arrived. This is a known issue with "
+                            f"reasoning models behind cloud gateways "
+                            f"(NVIDIA NIM, OpenAI, Anthropic, DeepSeek).",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      Workarounds in priority order:",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      1. Set "
+                            f"`providers.{_provider}.models.{_model}.stale_timeout_seconds: 900` "
+                            f"in `~/.hermes/config.yaml` to extend the per-call "
+                            f"timeout. (Hermes's built-in floor is 600s for "
+                            f"known reasoning models — if you still see this "
+                            f"after raising, the upstream cap is even shorter.)",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      2. Lower `reasoning_budget` or set "
+                            f"`reasoning_effort: medium` on this model if the provider supports it.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      3. Use a smaller / faster reasoning "
+                            f"model if the task doesn't require deep thinking.",
+                            force=True,
+                        )
+
                    logger.error(
                        "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s",
                        agent.log_prefix, max_retries, _final_summary,
@@ -3498,7 +3698,22 @@ def run_conversation(
                            _final_response += f"\n\n{_billing_guidance}"
                    else:
                        _final_response = f"API call failed after {max_retries} retries: {_final_summary}"
-                    if _is_stream_drop:
+                    if _is_thinking_timeout:
+                        # Thinking-timeout guidance overrides the generic
+                        # stream-drop guidance — the latter is wrong for
+                        # this case (it suggests splitting large file
+                        # writes, which isn't what happened).  See the
+                        # reasoning-model override at
+                        # agent/error_classifier.py:720-738 and the
+                        # detection block above for context.
+                        from agent.thinking_timeout_guidance import (
+                            build_thinking_timeout_guidance,
+                        )
+                        _final_response += build_thinking_timeout_guidance(
+                            provider=_provider,
+                            model=_model,
+                        )
+                    elif _is_stream_drop:
                        _final_response += (
                            "\n\nThe provider's stream connection keeps "
                            "dropping — this often happens when generating "
@@ -3530,20 +3745,47 @@ def run_conversation(
                        _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                        if _ra_raw:
                            try:
-                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
+                                # Cap at 10 minutes. Anthropic Tier 1 input-token
+                                # buckets reset in ~171s, so a 120s cap caused us to
+                                # retry before the actual reset window and re-trip the
+                                # limit. 600s covers all realistic provider reset
+                                # windows while still rejecting pathological values. (#26293)
+                                _retry_after = min(float(_ra_raw), 600)
                            except (TypeError, ValueError):
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
+                _backoff_policy = None
+                if is_rate_limited and not _retry_after:
+                    wait_time, _backoff_policy = adaptive_rate_limit_backoff(
+                        retry_count,
+                        base_url=str(_base),
+                        model=_model,
+                        error=api_error,
+                        default_wait=wait_time,
+                    )
                if is_rate_limited:
-                    agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
+                    _policy_note = ""
+                    if _backoff_policy == "zai_coding_overload_long":
+                        _policy_note = " (Z.AI Coding overload adaptive long backoff)"
+                    elif _backoff_policy == "zai_coding_overload_short":
+                        _policy_note = " (Z.AI Coding overload short retry)"
+                    _rate_limit_status = f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries}){_policy_note}..."
+                    # Normal retries are buffered to avoid noisy transient chatter. Long
+                    # Z.AI Coding waits are different: they can last minutes, so surface
+                    # progress immediately instead of making the TUI look frozen.
+                    if _backoff_policy == "zai_coding_overload_long":
+                        agent._emit_status(_rate_limit_status)
+                    else:
+                        agent._buffer_status(_rate_limit_status)
                else:
                    agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
                logger.warning(
-                    "Retrying API call in %ss (attempt %s/%s) %s error=%s",
+                    "Retrying API call in %ss (attempt %s/%s) %s policy=%s error=%s",
                    wait_time,
                    retry_count,
                    max_retries,
                    agent._client_log_context(),
+                    _backoff_policy or "default",
                    api_error,
                )
                # Sleep in small increments so we can respond to interrupts quickly
@@ -3553,10 +3795,12 @@ def run_conversation(
                while time.time() < sleep_end:
                    if agent._interrupt_requested:
                        agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
+                        _interrupt_text = f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries})."
+                        close_interrupted_tool_sequence(messages, _interrupt_text)
                        agent._persist_session(messages, conversation_history)
                        agent.clear_interrupt()
                        return {
-                            "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
+                            "final_response": _interrupt_text,
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
@@ -3587,6 +3831,17 @@ def run_conversation(
            _retry.restart_with_compressed_messages = False
            continue

+        if _retry.restart_with_rebuilt_messages:
+            # A content-filter stream stall (#32421) was escalated to the
+            # fallback chain and the partial content rolled back.  Re-issue
+            # the API call against the now-active fallback provider.  Refund
+            # the budget/count for the stalled attempt so the fallback gets a
+            # fair turn.
+            api_call_count -= 1
+            agent.iteration_budget.refund()
+            _retry.restart_with_rebuilt_messages = False
+            continue
+
        if _retry.restart_with_length_continuation:
            # Progressively boost the output token budget on each retry.
            # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
@@ -4047,6 +4302,19 @@ def run_conversation(

                messages.append(assistant_msg)
                agent._emit_interim_assistant_message(assistant_msg)
+                try:
+                    # Persist the assistant tool-call turn before any tool
+                    # side effects run. If a destructive tool restarts or
+                    # terminates Hermes mid-turn, resume logic still sees the
+                    # exact tool-call block that already executed.
+                    agent._flush_messages_to_session_db(messages, conversation_history)
+                except Exception as exc:
+                    logger.warning(
+                        "Incremental tool-call persistence failed before execution "
+                        "(session=%s): %s",
+                        agent.session_id or "none",
+                        exc,
+                    )

                # Close any open streaming display (response box, reasoning
                # box) before tool execution begins.  Intermediate turns may
@@ -4148,10 +4416,9 @@ def run_conversation(
                        approx_tokens=agent.context_compressor.last_prompt_tokens,
                        task_id=effective_task_id,
                    )
-                    # Compression created a new session — clear history so
-                    # _flush_messages_to_session_db writes compressed messages
-                    # to the new session (see preflight compression comment).
-                    conversation_history = None
+                    conversation_history = conversation_history_after_compression(
+                        agent, messages
+                    )
                
                # Save session log incrementally (so progress is visible even if interrupted)
                agent._session_messages = messages
@@ -4193,7 +4460,11 @@ def run_conversation(
                            "as final response"
                        )
                        final_response = _recovered
-                        agent._response_was_previewed = True
+                        # Streaming delivered a fragment, not a confirmed
+                        # final preview. Leave response_previewed false so
+                        # gateway fallback delivery can send the recovered
+                        # text plus the abnormal-turn explanation.
+                        agent._response_was_previewed = False
                        break

                    # If the previous turn already delivered real content alongside
@@ -4438,14 +4709,20 @@ def run_conversation(
                # status from earlier failed attempts in this turn.
                agent._clear_status_buffer()

+                from agent.agent_runtime_helpers import (
+                    intent_ack_continuation_mode,
+                )
+
+                _ack_mode = intent_ack_continuation_mode(agent)
                if (
-                    agent.api_mode == "codex_responses"
+                    _ack_mode != "off"
                    and agent.valid_tool_names
                    and codex_ack_continuations < 2
                    and agent._looks_like_codex_intermediate_ack(
                        user_message=user_message,
                        assistant_content=final_response,
                        messages=messages,
+                        require_workspace=(_ack_mode == "codex_only"),
                    )
                ):
                    codex_ack_continuations += 1
@@ -4476,9 +4753,10 @@ def run_conversation(
                final_msg = agent._build_assistant_message(assistant_message, finish_reason)

                # Pop thinking-only prefill and empty-response retry
-                # scaffolding before appending the final response.  These
-                # internal turns are only for the next API retry and should
-                # not become durable transcript context.
+                # scaffolding before appending either a final response or a
+                # verification-stop follow-up. These internal turns are only
+                # for the next API retry and should not become durable
+                # transcript context.
                while (
                    messages
                    and isinstance(messages[-1], dict)
@@ -4490,6 +4768,97 @@ def run_conversation(
                ):
                    messages.pop()

+                try:
+                    from agent.verification_stop import (
+                        build_verify_on_stop_nudge,
+                        verify_on_stop_enabled,
+                    )
+
+                    if verify_on_stop_enabled():
+                        _verify_nudge = build_verify_on_stop_nudge(
+                            session_id=getattr(agent, "session_id", None),
+                            changed_paths=getattr(agent, "_turn_file_mutation_paths", set()),
+                            attempts=getattr(agent, "_verification_stop_nudges", 0),
+                        )
+                    else:
+                        _verify_nudge = None
+                except Exception:
+                    logger.debug("verification stop-loop check failed", exc_info=True)
+                    _verify_nudge = None
+
+                if _verify_nudge:
+                    agent._verification_stop_nudges = (
+                        getattr(agent, "_verification_stop_nudges", 0) + 1
+                    )
+                    final_msg["finish_reason"] = "verification_required"
+                    messages.append(final_msg)
+                    # Keep the attempted final answer in model history so the
+                    # synthetic user nudge preserves role alternation, but do
+                    # not surface it to the user as an interim answer. The
+                    # whole point of this guard is to prevent premature
+                    # "done" claims before checks run.
+                    messages.append({
+                        "role": "user",
+                        "content": _verify_nudge,
+                        "_verification_stop_synthetic": True,
+                    })
+                    agent._session_messages = messages
+                    # Run the verification-stop loop silently — the nudge is an
+                    # internal turn that should not add noise to the user's
+                    # terminal. Keep a debug breadcrumb in agent.log for tracing.
+                    logger.debug("verification stop-loop nudge issued (attempt %d)",
+                                 agent._verification_stop_nudges)
+                    continue
+
+                # User verification-loop gate: when the agent edited code this
+                # turn, let a registered `pre_verify` hook (plugin/shell) keep it
+                # going one more turn. The shipped guidance is folded into the
+                # evidence-based verify-on-stop nudge above, so this path has no
+                # default continuation cost.
+                _verify_nudge2 = None
+                _edited = sorted(getattr(agent, "_turn_file_mutation_paths", set()) or [])
+                _attempt = getattr(agent, "_pre_verify_nudges", 0)
+                try:
+                    from agent.verify_hooks import max_verify_nudges
+                    from hermes_cli.plugins import get_pre_verify_continue_message, has_hook
+
+                    if _edited and has_hook("pre_verify") and _attempt < max_verify_nudges():
+                        # Posture is fixed for the session — resolve once + cache.
+                        coding = getattr(agent, "_resolved_is_coding", None)
+                        if coding is None:
+                            from agent.coding_context import is_coding_context
+                            coding = bool(is_coding_context(platform=getattr(agent, "platform", "") or ""))
+                            agent._resolved_is_coding = coding
+                        _verify_nudge2 = get_pre_verify_continue_message(
+                            session_id=getattr(agent, "session_id", None) or "",
+                            platform=getattr(agent, "platform", "") or "",
+                            model=getattr(agent, "model", "") or "",
+                            coding=coding,
+                            attempt=_attempt,
+                            final_response=final_response,
+                            changed_paths=_edited,
+                        )
+                except Exception:
+                    logger.debug("pre_verify hook check failed", exc_info=True)
+                    _verify_nudge2 = None
+
+                if _verify_nudge2:
+                    agent._pre_verify_nudges = _attempt + 1
+                    final_msg["finish_reason"] = "verify_hook_continue"
+                    # Same alternation contract as verify-on-stop: keep the
+                    # attempted answer in history, follow it with a synthetic
+                    # user nudge, and don't surface the premature answer.
+                    messages.append(final_msg)
+                    messages.append({
+                        "role": "user",
+                        "content": _verify_nudge2,
+                        "_pre_verify_synthetic": True,
+                    })
+                    agent._session_messages = messages
+                    logger.debug("pre_verify nudge issued (attempt %d)",
+                                 agent._pre_verify_nudges)
+                    continue
+
                messages.append(final_msg)
                
                _turn_exit_reason = f"text_response(finish_reason={finish_reason})"
--- a/agent/copilot_acp_client.py
+++ b/agent/copilot_acp_client.py
@@ -21,8 +21,14 @@ from pathlib import Path
 from types import SimpleNamespace
 from typing import Any

+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
 from agent.file_safety import get_read_block_error, is_write_denied
 from agent.redact import redact_sensitive_text
+from tools.environments.local import hermes_subprocess_env

 ACP_MARKER_BASE_URL = "acp://copilot"
 _DEFAULT_TIMEOUT_SECONDS = 900.0
@@ -94,7 +100,10 @@ def _resolve_home_dir() -> str:


 def _build_subprocess_env() -> dict[str, str]:
-    env = os.environ.copy()
+    # Copilot ACP is a model-driving CLI executor: it legitimately needs LLM
+    # provider credentials. Route through the central helper so Tier-1 secrets
+    # (gateway bot tokens, GitHub auth, infra) are still stripped (#29157).
+    env = hermes_subprocess_env(inherit_credentials=True)
    home = _resolve_home_dir()
    env["HOME"] = home
    from hermes_constants import apply_subprocess_home_env
@@ -224,11 +233,73 @@ def _render_message_content(content: Any) -> str:
    return str(content).strip()


-def _extract_tool_calls_from_text(text: str) -> tuple[list[SimpleNamespace], str]:
+def _build_openai_tool_call(
+    *,
+    call_id: str,
+    name: str,
+    arguments: str,
+) -> ChatCompletionMessageToolCall:
+    """Build an OpenAI-compatible tool-call object for downstream handling."""
+    return ChatCompletionMessageToolCall(
+        id=call_id,
+        call_id=call_id,
+        response_item_id=None,
+        type="function",
+        function=Function(name=name, arguments=arguments),
+    )
+
+
+def _completion_to_stream_chunks(completion: SimpleNamespace) -> list[SimpleNamespace]:
+    """Convert a one-shot ACP response into OpenAI-style stream chunks."""
+    choice = completion.choices[0]
+    message = choice.message
+    tool_call_deltas = None
+    if message.tool_calls:
+        tool_call_deltas = []
+        for index, tool_call in enumerate(message.tool_calls):
+            tool_call_deltas.append(
+                SimpleNamespace(
+                    index=index,
+                    id=getattr(tool_call, "id", None),
+                    type=getattr(tool_call, "type", "function"),
+                    function=SimpleNamespace(
+                        name=getattr(tool_call.function, "name", None),
+                        arguments=getattr(tool_call.function, "arguments", None),
+                    ),
+                )
+            )
+
+    delta = SimpleNamespace(
+        role="assistant",
+        content=message.content or None,
+        tool_calls=tool_call_deltas,
+        reasoning_content=message.reasoning_content,
+        reasoning=message.reasoning,
+    )
+    data_chunk = SimpleNamespace(
+        choices=[
+            SimpleNamespace(
+                index=0,
+                delta=delta,
+                finish_reason=choice.finish_reason,
+            )
+        ],
+        model=completion.model,
+        usage=None,
+    )
+    usage_chunk = SimpleNamespace(
+        choices=[],
+        model=completion.model,
+        usage=completion.usage,
+    )
+    return [data_chunk, usage_chunk]
+
+
+def _extract_tool_calls_from_text(text: str) -> tuple[list[ChatCompletionMessageToolCall], str]:
    if not isinstance(text, str) or not text.strip():
        return [], ""

-    extracted: list[SimpleNamespace] = []
+    extracted: list[ChatCompletionMessageToolCall] = []
    consumed_spans: list[tuple[int, int]] = []

    def _try_add_tool_call(raw_json: str) -> None:
@@ -252,12 +323,10 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[SimpleNamespace], str
            call_id = f"acp_call_{len(extracted)+1}"

        extracted.append(
-            SimpleNamespace(
-                id=call_id,
+            _build_openai_tool_call(
                call_id=call_id,
-                response_item_id=None,
-                type="function",
-                function=SimpleNamespace(name=fn_name.strip(), arguments=fn_args),
+                name=fn_name.strip(),
+                arguments=fn_args,
            )
        )

@@ -376,6 +445,7 @@ class CopilotACPClient:
        timeout: float | None = None,
        tools: list[dict[str, Any]] | None = None,
        tool_choice: Any = None,
+        stream: bool = False,
        **_: Any,
    ) -> Any:
        prompt_text = _format_messages_as_prompt(
@@ -422,11 +492,14 @@ class CopilotACPClient:
        )
        finish_reason = "tool_calls" if tool_calls else "stop"
        choice = SimpleNamespace(message=assistant_message, finish_reason=finish_reason)
-        return SimpleNamespace(
+        completion = SimpleNamespace(
            choices=[choice],
            usage=usage,
            model=model or "copilot-acp",
        )
+        if stream:
+            return _completion_to_stream_chunks(completion)
+        return completion

    def _run_prompt(self, prompt_text: str, *, timeout_seconds: float) -> tuple[str, str]:
        try:
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -11,6 +11,7 @@ import uuid
 import re
 from dataclasses import dataclass, fields, replace
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple

 from hermes_constants import OPENROUTER_BASE_URL
@@ -447,6 +448,63 @@ def get_pool_strategy(provider: str) -> str:
 DEFAULT_MAX_CONCURRENT_PER_CREDENTIAL = 1


+def _write_through_provider_state_to_global_root(
+    provider_id: str, state: Dict[str, Any]
+) -> None:
+    """Persist a rotated OAuth ``state`` into the global-root auth.json.
+
+    Best-effort write-through for the multi-profile rotation hazard
+    (#48415 / #43589): nous, openai-codex, and xai-oauth rotate the
+    refresh_token on refresh, so when a profile pool refresh rotates a grant
+    it resolved from the root fallback, the rotated chain must land back in
+    root. Otherwise root keeps a now-revoked refresh token and every other
+    profile reading the stale root grant dies with ``refresh_token_reused`` /
+    ``invalid_grant`` once its access token expires.
+
+    Only updates ``providers.<provider_id>`` in the root store; never touches
+    the profile store (the caller already saved that). Swallows all errors — a
+    failed write-through degrades to the pre-existing behavior (root stale), it
+    must never break the profile's own successful save. Mirrors
+    ``hermes_cli.auth._write_through_xai_oauth_to_global_root`` (which covers
+    the non-pool xAI refresh path) for the credential-pool refresh path.
+    """
+    try:
+        global_path = auth_mod._global_auth_file_path()
+    except Exception:
+        return
+    if global_path is None:
+        # Classic mode (profile == root); the profile save already hit root.
+        return
+    # Seat belt: under pytest, refuse to write the real user's
+    # ~/.hermes/auth.json even when HERMES_HOME points at a profile path
+    # (mirrors the read-side guard in _load_global_auth_store). Uses the
+    # unmodified HOME env, not Path.home() which fixtures may monkeypatch.
+    if os.environ.get("PYTEST_CURRENT_TEST"):
+        real_home_env = os.environ.get("HOME", "")
+        if real_home_env:
+            real_root = Path(real_home_env) / ".hermes" / "auth.json"
+            try:
+                if global_path.resolve(strict=False) == real_root.resolve(strict=False):
+                    return
+            except Exception:
+                return
+    try:
+        if global_path.exists():
+            global_store = _load_auth_store(global_path)
+        else:
+            global_store = {}
+        if not isinstance(global_store, dict):
+            return
+        _store_provider_state(global_store, provider_id, dict(state), set_active=False)
+        auth_mod._save_auth_store(global_store, global_path)
+    except Exception as exc:  # pragma: no cover - best effort
+        logger.debug(
+            "%s pool refresh: write-through to global root failed: %s",
+            provider_id,
+            exc,
+        )
+
+
 class CredentialPool:
    def __init__(self, provider: str, entries: List[PooledCredential]):
        self.provider = provider
@@ -479,10 +537,11 @@ class CredentialPool:
                self._entries[idx] = new
                return

-    def _persist(self) -> None:
+    def _persist(self, *, removed_ids: Optional[List[str]] = None) -> None:
        write_credential_pool(
            self.provider,
            [entry.to_dict() for entry in self._entries],
+            removed_ids=removed_ids,
        )

    def _is_terminal_auth_failure(
@@ -800,6 +859,28 @@ class CredentialPool:
        try:
            with _auth_store_lock():
                auth_store = _load_auth_store()
+                # Decide BEFORE writing whether this profile is reading the
+                # grant from the global root (no own providers.<id> block) vs.
+                # genuinely shadowing it. A pool refresh rotates single-use
+                # OAuth refresh tokens, so a profile that resolved the grant
+                # from root MUST write the rotated chain back to root too —
+                # otherwise root keeps a revoked refresh token and every other
+                # profile reading the stale root grant dies with
+                # refresh_token_reused / invalid_grant once its access token
+                # expires. This mirrors the xAI write-through in
+                # hermes_cli.auth._save_xai_oauth_tokens (#43589); the pool
+                # refresh path is the Codex/xAI analog reported in #48415.
+                _wt_provider_id = {
+                    "nous": "nous",
+                    "openai-codex": "openai-codex",
+                    "xai-oauth": "xai-oauth",
+                }.get(self.provider)
+                write_through_to_root = bool(_wt_provider_id) and not (
+                    isinstance(auth_store.get("providers"), dict)
+                    and isinstance(
+                        auth_store["providers"].get(_wt_provider_id), dict
+                    )
+                )
                if self.provider == "nous":
                    state = _load_provider_state(auth_store, "nous")
                    if state is None:
@@ -855,6 +936,10 @@ class CredentialPool:
                    return

                _save_auth_store(auth_store)
+                if write_through_to_root and _wt_provider_id:
+                    _write_through_provider_state_to_global_root(
+                        _wt_provider_id, state
+                    )
        except Exception as exc:
            logger.debug("Failed to sync %s pool entry back to auth store: %s", self.provider, exc)

@@ -1040,13 +1125,17 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal xAI OAuth state: %s", clear_exc
                        )
+                    removed_ids = [
+                        item.id for item in self._entries
+                        if item.source == "loopback_pkce"
+                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "loopback_pkce"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist()
+                    self._persist(removed_ids=removed_ids)
                    return None
            # For openai-codex: same race as xAI/nous — another Hermes process
            # may have consumed the refresh token between our proactive sync
@@ -1106,13 +1195,17 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal Codex OAuth state: %s", clear_exc
                        )
+                    removed_ids = [
+                        item.id for item in self._entries
+                        if item.source == "device_code"
+                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "device_code"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist()
+                    self._persist(removed_ids=removed_ids)
                    return None
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
@@ -1169,13 +1262,17 @@ class CredentialPool:
                        auth_mod.NOUS_DEVICE_CODE_SOURCE,
                        f"manual:{auth_mod.NOUS_DEVICE_CODE_SOURCE}",
                    }
+                    removed_ids = [
+                        item.id for item in self._entries
+                        if item.source in singleton_sources
+                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source not in singleton_sources
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist()
+                    self._persist(removed_ids=removed_ids)
                    return None
            self._mark_exhausted(entry, None)
            return None
@@ -1337,7 +1434,7 @@ class CredentialPool:
            pruned_ids = set(entries_to_prune)
            self._entries = [e for e in self._entries if e.id not in pruned_ids]
        if cleared_any:
-            self._persist()
+            self._persist(removed_ids=entries_to_prune)
        return available

    def _select_unlocked(self) -> Optional[PooledCredential]:
@@ -1511,7 +1608,11 @@ class CredentialPool:
            replace(entry, priority=new_priority)
            for new_priority, entry in enumerate(self._entries)
        ]
-        self._persist()
+        write_credential_pool(
+            self.provider,
+            [entry.to_dict() for entry in self._entries],
+            removed_ids=[removed.id],
+        )
        if self._current_id == removed.id:
            self._current_id = None
        return removed
@@ -2173,6 +2274,11 @@ def _seed_custom_pool(pool_key: str, entries: List[PooledCredential]) -> Tuple[b
 def load_pool(provider: str) -> CredentialPool:
    provider = (provider or "").strip().lower()
    raw_entries = read_credential_pool(provider)
+    disk_ids = {
+        entry.get("id")
+        for entry in raw_entries
+        if isinstance(entry, dict) and entry.get("id")
+    }
    raw_needs_sanitization = any(
        isinstance(payload, dict)
        and sanitize_borrowed_credential_payload(payload, provider) != payload
@@ -2201,8 +2307,10 @@ def load_pool(provider: str) -> CredentialPool:
        changed |= _normalize_pool_priorities(provider, entries)

    if changed:
+        new_ids = {entry.id for entry in entries}
        write_credential_pool(
            provider,
            [entry.to_dict() for entry in sorted(entries, key=lambda item: item.priority)],
+            removed_ids=disk_ids - new_ids,
        )
    return CredentialPool(provider, entries)
--- a/agent/curator.py
+++ b/agent/curator.py
@@ -273,6 +273,21 @@ def should_run_now(now: Optional[datetime] = None) -> bool:
 # Automatic state transitions (pure function, no LLM)
 # ---------------------------------------------------------------------------

+def _cron_referenced_skills() -> Set[str]:
+    """Skill names referenced by any cron job (incl. paused/disabled).
+
+    Best-effort: a cron-module import error or corrupt jobs store must never
+    break the curator, so any failure yields an empty set (no protection,
+    but no crash).
+    """
+    try:
+        from cron.jobs import referenced_skill_names as _refs
+        return _refs()
+    except Exception as e:
+        logger.debug("Curator could not read cron skill references: %s", e, exc_info=True)
+        return set()
+
+
 def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int]:
    """Walk every curator-managed skill and move active/stale/archived based on
    the latest real activity timestamp. Pinned skills are never touched.
@@ -292,6 +307,8 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
    stale_cutoff = now - timedelta(days=get_stale_after_days())
    archive_cutoff = now - timedelta(days=get_archive_after_days())

+    cron_referenced = _cron_referenced_skills()
+
    counts = {"marked_stale": 0, "archived": 0, "reactivated": 0, "checked": 0, "seeded": 0}

    for row in _u.agent_created_report():
@@ -300,6 +317,15 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
        if row.get("pinned"):
            continue

+        # A skill referenced by any cron job (incl. paused/disabled) is in
+        # use by definition — resuming or the next fire must find it. The
+        # scheduler only bumps usage when a job actually fires, so jobs that
+        # fire less often than archive_after_days, paused jobs, and far-future
+        # one-shots would otherwise have their skills aged out from under
+        # them. Treat referenced skills like pinned: never auto-transition.
+        if name in cron_referenced:
+            continue
+
        # First sight of a curation-eligible skill with no persisted record
        # (e.g. a newly-eligible built-in): anchor its clock to now and defer.
        if not row.get("_persisted", True):
@@ -316,6 +342,18 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int

        current = row.get("state", _u.STATE_ACTIVE)

+        # Never-used skills (use_count == 0) get a grace floor: don't archive
+        # one until it is at least stale_after_days old. A use=0 skill is
+        # absence of evidence, not evidence of staleness — a skill created
+        # recently may simply not have had its trigger come up yet.
+        never_used = int(row.get("use_count", 0) or 0) == 0
+        if never_used and anchor > stale_cutoff:
+            # Younger than the stale window — never stale/archive it; undo any stale mark.
+            if current == _u.STATE_STALE:
+                _u.set_state(name, _u.STATE_ACTIVE)
+                counts["reactivated"] += 1
+            continue
+
        if anchor <= archive_cutoff and current != _u.STATE_ARCHIVED:
            ok, _msg = _u.archive_skill(name)
            if ok:
@@ -377,8 +415,10 @@ CURATOR_REVIEW_PROMPT = (
    "bodies + `references/`, `templates/`, and `scripts/` subfiles for "
    "session-specific detail — not one-session-one-skill micro-entries.\n\n"
    "Hard rules — do not violate:\n"
-    "1. DO NOT touch bundled or hub-installed skills. The candidate list "
-    "below is already filtered to agent-created skills only.\n"
+    "1. DO NOT touch bundled, hub-installed, or external-dir skills "
+    "(`skills.external_dirs`). The candidate list below is already filtered "
+    "to local curator-managed skills only; external skills are externally "
+    "owned and read-only to this background curator.\n"
    "2. DO NOT delete any skill. Archiving (moving the skill's directory "
    "into ~/.hermes/skills/.archive/) is the maximum destructive action. "
    "Archives are recoverable; deletion is not.\n"
@@ -388,10 +428,19 @@ CURATOR_REVIEW_PROMPT = (
    "back load-bearing UX (slash-command entry points referenced in docs and "
    "tips) and are filtered out of the candidate list below — never resurrect "
    "one as an archive or absorb target.\n"
+    "3c. DO NOT archive or prune any skill marked `cron=yes` in the candidate "
+    "list. A cron job depends on it and will fail to load it on its next "
+    "run. You MAY still consolidate it into an umbrella — but only because "
+    "the curator rewrites cron job skill references to follow consolidations; "
+    "never simply prune it.\n"
    "4. DO NOT use usage counters as a reason to skip consolidation. The "
    "counters are new and often mostly zero. Judge overlap on CONTENT, "
    "not on use_count. 'use=0' is not evidence a skill is valuable; it's "
-    "absence of evidence either way.\n"
+    "absence of evidence either way. Corollary: 'use=0' is ALSO not a "
+    "reason to PRUNE a skill. Never archive a never-used skill (use=0) "
+    "unless it is at least 30 days old (check last_activity / created date) "
+    "AND its content is genuinely obsolete or fully absorbed elsewhere — a "
+    "recently-created skill simply may not have had its trigger come up yet.\n"
    "5. DO NOT reject consolidation on the grounds that 'each skill has "
    "a distinct trigger'. Pairwise distinctness is the wrong bar. The "
    "right bar is: 'would a human maintainer write this as N separate "
@@ -469,8 +518,9 @@ CURATOR_REVIEW_PROMPT = (
    "skill, or `absorbed_into=\"\"` when you're truly pruning with no "
    "forwarding target. This drives cron-job skill-reference migration — "
    "guessing from your YAML summary after the fact is fragile.\n"
-    "  - terminal                       — mv a sibling into the archive "
-    "OR move its content into a support subfile\n\n"
+    "  - terminal                       — move LOCAL candidate content into "
+    "a support subfile when package integrity requires it; never mv, cp, rm, "
+    "patch, or rewrite bundled, hub-installed, or external-dir skills\n\n"
    "'keep' is a legitimate decision ONLY when the skill is already a "
    "class-level umbrella and none of the proposed merges would improve "
    "discoverability. 'This is narrow but distinct from its siblings' "
@@ -1410,12 +1460,14 @@ def _render_candidate_list() -> str:
    rows = skill_usage.agent_created_report()
    if not rows:
        return "No agent-created skills to review."
+    cron_referenced = _cron_referenced_skills()
    lines = [f"Agent-created skills ({len(rows)}):\n"]
    for r in rows:
        lines.append(
            f"- {r['name']}  "
            f"state={r['state']}  "
            f"pinned={'yes' if r.get('pinned') else 'no'}  "
+            f"cron={'yes' if r['name'] in cron_referenced else 'no'}  "
            f"activity={r.get('activity_count', 0)}  "
            f"use={r.get('use_count', 0)}  "
            f"view={r.get('view_count', 0)}  "
@@ -1843,6 +1895,14 @@ def _run_llm_review(prompt: str) -> Dict[str, Any]:
        # Disable recursive nudges — the curator must never spawn its own review.
        review_agent._memory_nudge_interval = 0
        review_agent._skill_nudge_interval = 0
+        # Tag this fork as autonomous background curation so skill_manage's
+        # background-review write guard fires. Without this the fork inherits
+        # the default "assistant_tool" origin, is_background_review() is False,
+        # and the external/bundled/hub-installed skill_manage guards never
+        # trigger during the curation pass they exist to protect against.
+        # turn_context.py binds this onto the write-origin ContextVar at turn
+        # start (see agent/turn_context.py).
+        review_agent._memory_write_origin = "background_review"

        # Redirect the forked agent's stdout/stderr to /dev/null while it
        # runs so its tool-call chatter doesn't pollute the foreground
--- a/agent/display.py
+++ b/agent/display.py
@@ -6,6 +6,7 @@ Used by AIAgent._execute_tool_calls for CLI feedback.

 import logging
 import os
+import re
 import sys
 import threading
 import time
@@ -15,6 +16,7 @@ from pathlib import Path
 from typing import Any

 from utils import safe_json_loads
+from agent.redact import redact_sensitive_text
 from agent.tool_result_classification import file_mutation_result_landed

 # ANSI escape codes for coloring tool failure indicators
@@ -177,6 +179,223 @@ def _truncate_preview(text: str, max_len: int | None) -> str:
    return text


+_SHELL_SILENT_HEADS = {"cd", "pushd", "popd", "export", "set", "unset", "source", ".", "true", "false", ":"}
+_SHELL_PIPE_TAIL_HEADS = {"head", "tail", "wc", "sort", "uniq"}
+
+
+def _shell_basename(head: str) -> str:
+    return head.rsplit("/", 1)[-1] if head else ""
+
+
+def _split_shell_words(segment: str) -> list[str]:
+    words: list[str] = []
+    buf: list[str] = []
+    quote: str | None = None
+
+    for i, ch in enumerate(segment):
+        if quote:
+            buf.append(ch)
+            if ch == quote and (i == 0 or segment[i - 1] != "\\"):
+                quote = None
+            continue
+
+        if ch in {"'", '"'}:
+            quote = ch
+            buf.append(ch)
+            continue
+
+        if ch.isspace():
+            if buf:
+                words.append("".join(buf))
+                buf = []
+            continue
+
+        buf.append(ch)
+
+    if buf:
+        words.append("".join(buf))
+
+    return words
+
+
+def _strip_shell_pipe_tail(segment: str) -> str:
+    words = _split_shell_words(segment)
+    out: list[str] = []
+
+    for i, word in enumerate(words):
+        if word == "|" and _shell_basename(words[i + 1] if i + 1 < len(words) else "") in _SHELL_PIPE_TAIL_HEADS:
+            break
+        out.append(word)
+
+    return " ".join(out).strip()
+
+
+def _split_shell_compound(command: str) -> list[str]:
+    segments: list[str] = []
+    buf: list[str] = []
+    quote: str | None = None
+    i = 0
+
+    while i < len(command):
+        ch = command[i]
+
+        if quote:
+            buf.append(ch)
+            if ch == quote and (i == 0 or command[i - 1] != "\\"):
+                quote = None
+            i += 1
+            continue
+
+        if ch in {"'", '"'}:
+            quote = ch
+            buf.append(ch)
+            i += 1
+            continue
+
+        op_len = 2 if command.startswith("&&", i) or command.startswith("||", i) else 1 if ch in {";", "\n"} else 0
+        if op_len:
+            segment = _strip_shell_pipe_tail("".join(buf).strip())
+            if segment:
+                segments.append(segment)
+            buf = []
+            i += op_len
+            continue
+
+        buf.append(ch)
+        i += 1
+
+    segment = _strip_shell_pipe_tail("".join(buf).strip())
+    if segment:
+        segments.append(segment)
+
+    return segments
+
+
+def _shell_head_word(segment: str) -> str:
+    words = _split_shell_words(segment)
+    index = 0
+    while index < len(words) and re.match(r"^[A-Za-z_]\w*=", words[index]):
+        index += 1
+    return _shell_basename(words[index] if index < len(words) else "")
+
+
+def _clean_shell_segment(segment: str) -> str:
+    words = _split_shell_words(segment)
+    out: list[str] = []
+    i = 0
+    while i < len(words):
+        word = words[i]
+        if re.match(r"^\d*(?:>>?|<)$", word):
+            i += 2
+            continue
+        if re.match(r"^\d*(?:>&|<&)\d+$", word) or re.match(r"^\d*>&\d+$", word):
+            i += 1
+            continue
+        out.append(word)
+        i += 1
+    return " ".join(out).strip()
+
+
+def _is_shell_boundary_echo(segment: str) -> bool:
+    words = _split_shell_words(segment)
+    if _shell_basename(words[0] if words else "") != "echo":
+        return False
+    rest = " ".join(words[1:])
+    return bool(re.search(r"-{2,}|_exit=|(?:^|\s|=)\$[?{]|PIPESTATUS", rest))
+
+
+def summarize_shell_command(command: str) -> str:
+    """Compact shell wrapper/plumbing for display while preserving raw command elsewhere."""
+    original = _oneline(command)
+    if not original:
+        return ""
+
+    segments = _split_shell_compound(original)
+    if len(segments) <= 1:
+        return _clean_shell_segment(segments[0] if segments else original) or original
+
+    core: list[str] = []
+    for segment in segments:
+        cleaned = _clean_shell_segment(segment)
+        head = _shell_head_word(cleaned)
+        if cleaned and head not in _SHELL_SILENT_HEADS and not _is_shell_boundary_echo(cleaned):
+            core.append(cleaned)
+
+    if not core:
+        return original
+    if len(core) == 1:
+        return core[0]
+
+    count = len(core) - 1
+    return f"{core[0]} + {count} {'command' if count == 1 else 'commands'}"
+
+
+def _read_file_line_label(args: dict) -> str:
+    offset = args.get("offset")
+    limit = args.get("limit")
+    if not isinstance(offset, int) or offset <= 0:
+        return ""
+    if not isinstance(limit, int) or limit <= 1:
+        return f"L{offset}"
+    return f"L{offset}-{offset + limit - 1}"
+
+
+def redact_browser_typed_text_for_display(value: Any, typed_text: Any) -> Any:
+    """Apply secret redaction to browser_type text in display-facing payloads.
+
+    Backends sometimes echo the attempted input in error strings or fallback
+    metadata.  When the raw typed value contains a recognizable secret (API
+    key, token, JWT, etc.) the redacted form differs from the raw value, so we
+    replace every occurrence of the raw value with its redacted form before a
+    browser_type result reaches logs, callbacks, the model, or chat history.
+
+    Normal typed text (search queries, addresses, form fields) matches no
+    secret pattern, so it passes through unchanged and stays readable.
+
+    Redaction is forced here regardless of the global ``security.redact_secrets``
+    preference: a typed credential leaking into chat history is a security
+    boundary, not mere log hygiene.
+    """
+    if typed_text is None:
+        return value
+    needle = str(typed_text)
+    if needle == "":
+        return value
+    redacted = redact_sensitive_text(needle, force=True)
+    if redacted == needle:
+        # Nothing secret-looking in the typed text; leave payload untouched.
+        return value
+    if isinstance(value, str):
+        return value.replace(needle, redacted)
+    if isinstance(value, dict):
+        return {
+            key: redact_browser_typed_text_for_display(item, typed_text)
+            for key, item in value.items()
+        }
+    if isinstance(value, list):
+        return [redact_browser_typed_text_for_display(item, typed_text) for item in value]
+    if isinstance(value, tuple):
+        return tuple(redact_browser_typed_text_for_display(item, typed_text) for item in value)
+    return value
+
+
+def redact_tool_args_for_display(tool_name: str, args: dict | None) -> dict | None:
+    """Return a copy of tool args safe for logs/progress UI.
+
+    For ``browser_type`` the ``text`` argument is run through the same
+    secret-pattern redactor used for logs.  Recognizable credentials (API
+    keys, tokens) are masked before the value reaches tool progress
+    notifications; normal typed text is left intact for debuggability.
+    """
+    if not isinstance(args, dict):
+        return args
+    if tool_name == "browser_type" and isinstance(args.get("text"), str):
+        safe_args = dict(args)
+        safe_args["text"] = redact_sensitive_text(args["text"], force=True)
+        return safe_args
+    return args
+
+
 def _delegate_task_goal_parts(tasks: Any, *, per_goal_len: int) -> tuple[int, list[str]]:
    if not isinstance(tasks, list):
        return 0, []
@@ -200,13 +419,14 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
        max_len = _tool_preview_max_len
    if not args:
        return None
+    args = redact_tool_args_for_display(tool_name, args) or args
    primary_args = {
        "terminal": "command", "web_search": "query", "web_extract": "urls",
        "read_file": "path", "write_file": "path", "patch": "path",
        "search_files": "pattern", "browser_navigate": "url",
        "browser_click": "ref", "browser_type": "text",
        "image_generate": "prompt", "text_to_speech": "text",
-        "vision_analyze": "question", "mixture_of_agents": "user_prompt",
+        "vision_analyze": "question",
        "skill_view": "name", "skills_list": "category",
        "cronjob": "action",
        "execute_code": "code", "delegate_task": "goal",
@@ -253,6 +473,23 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
        else:
            return f"planning {len(todos_arg)} task(s)"

+    if tool_name in {"terminal", "execute_code"}:
+        key = "code" if tool_name == "execute_code" else "command"
+        command = args.get(key)
+        if command is None:
+            return None
+        preview = summarize_shell_command(str(command))
+        return _truncate_preview(preview, max_len) if preview else None
+
+    if tool_name == "read_file":
+        path = args.get("path") or args.get("file") or args.get("filepath")
+        if path is None:
+            return None
+        label = Path(str(path).replace("\\", "/")).name or str(path)
+        line_label = _read_file_line_label(args)
+        preview = f"{label} {line_label}".strip()
+        return _truncate_preview(preview, max_len) if preview else None
+
    if tool_name == "session_search":
        query = _oneline(args.get("query", ""))
        return f"recall: \"{query[:25]}{'...' if len(query) > 25 else ''}\""
@@ -300,6 +537,122 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
    return preview


+# =========================================================================
+# Friendly tool labels (human-phrased verbs for built-in tools)
+#
+# Turns "web_search <query>" into "Searching the web for <query>" — the
+# ChatGPT-style "Searching…/Reading…" surface.  Curated and built-in only:
+# we know each core tool's semantics, so the verb is fixed, not computed.
+# Custom/plugin/MCP tools have no entry and fall back to the raw preview.
+# =========================================================================
+
+# Each entry maps a built-in tool name to its present-participle verb phrase.
+# build_tool_label() appends a connector (" " or " for ") plus the tool's
+# argument preview when one is available (e.g. "Reading api.md").
+_TOOL_VERBS: dict[str, str] = {
+    "web_search": "Searching the web",
+    "web_extract": "Reading",
+    "browser_navigate": "Browsing",
+    "browser_click": "Clicking",
+    "browser_type": "Typing",
+    "read_file": "Reading",
+    "write_file": "Writing",
+    "patch": "Editing",
+    "search_files": "Searching files",
+    "terminal": "Running",
+    "execute_code": "Running code",
+    "image_generate": "Generating image",
+    "video_generate": "Generating video",
+    "text_to_speech": "Generating speech",
+    "vision_analyze": "Looking at the image",
+    "session_search": "Searching past sessions",
+    "skill_view": "Reading skill",
+    "skills_list": "Listing skills",
+    "skill_manage": "Updating skill",
+    "delegate_task": "Delegating",
+    "cronjob": "Scheduling",
+    "clarify": "Asking",
+    "memory": "Updating memory",
+    "todo": "Updating tasks",
+}
+
+# Verbs that read better without the raw argument preview appended.
+_TOOL_VERBS_NO_PREVIEW: frozenset[str] = frozenset({
+    "skills_list",
+    "session_search",
+})
+
+# Verbs that take a "for" connector before the preview (search-style phrasing):
+# "Searching the web for <query>" reads better than "Searching the web <query>".
+_TOOL_VERBS_FOR_CONNECTOR: frozenset[str] = frozenset({
+    "web_search",
+    "search_files",
+})
+
+_friendly_tool_labels: bool = True
+
+
+def set_friendly_tool_labels(enabled: bool) -> None:
+    """Toggle friendly human-phrased tool labels (display.friendly_tool_labels)."""
+    global _friendly_tool_labels
+    _friendly_tool_labels = bool(enabled)
+
+
+def get_friendly_tool_labels() -> bool:
+    """Return whether friendly tool labels are enabled."""
+    return _friendly_tool_labels
+
+
+def get_tool_verb(tool_name: str) -> str | None:
+    """Return the friendly verb for a built-in tool, or None.
+
+    Returns None when friendly labels are disabled or the tool has no curated
+    verb (custom/plugin/MCP tools).  Callers that already hold a computed
+    argument preview can compose ``f"{verb} {preview}"`` themselves; use
+    :func:`tool_verb_connector` to pick the right joiner.
+    """
+    if not _friendly_tool_labels:
+        return None
+    return _TOOL_VERBS.get(tool_name)
+
+
+def tool_verb_connector(tool_name: str) -> str:
+    """Return the connector between a verb and its preview (" for " or " ")."""
+    return " for " if tool_name in _TOOL_VERBS_FOR_CONNECTOR else " "
+
+
+def verb_drops_preview(tool_name: str) -> bool:
+    """Whether the verb should render alone, without the argument preview."""
+    return tool_name in _TOOL_VERBS_NO_PREVIEW
+
+
+def build_tool_label(tool_name: str, args: dict, max_len: int | None = None) -> str | None:
+    """Build a human-phrased status label for a tool call.
+
+    For built-in tools with a known verb (``web_search`` -> "Searching the
+    web for ..."), returns the verb optionally followed by the argument
+    preview.  For everything else (custom/plugin/MCP tools, or when friendly
+    labels are disabled) returns the raw preview, so callers can use this as a
+    drop-in replacement for :func:`build_tool_preview`.
+    """
+    if not _friendly_tool_labels:
+        return build_tool_preview(tool_name, args, max_len=max_len)
+
+    verb = _TOOL_VERBS.get(tool_name)
+    if not verb:
+        return build_tool_preview(tool_name, args, max_len=max_len)
+
+    if tool_name in _TOOL_VERBS_NO_PREVIEW:
+        return verb
+
+    preview = build_tool_preview(tool_name, args, max_len=max_len)
+    if not preview:
+        return verb
+    if tool_name in _TOOL_VERBS_FOR_CONNECTOR:
+        return f"{verb} for {preview}"
+    return f"{verb} {preview}"
+
+
 # =========================================================================
 # Inline diff previews for write actions
 # =========================================================================
@@ -906,6 +1259,7 @@ def get_cute_tool_message(
    When *result* is provided the line is checked for failure indicators.
    Failed tool calls get a red prefix and an informational suffix.
    """
+    args = redact_tool_args_for_display(tool_name, args) or args
    dur = f"{duration:.1f}s"
    is_failure, failure_suffix = _detect_tool_failure(tool_name, result)
    skin_prefix = get_skin_tool_prefix()
@@ -943,7 +1297,7 @@ def get_cute_tool_message(
            return _wrap(f"┊ 📄 fetch     {_trunc(domain, 35)}{extra}  {dur}")
        return _wrap(f"┊ 📄 fetch     pages  {dur}")
    if tool_name == "terminal":
-        return _wrap(f"┊ 💻 $         {_trunc(args.get('command', ''), 42)}  {dur}")
+        return _wrap(f"┊ 💻 $         {_trunc(build_tool_preview(tool_name, args) or args.get('command', ''), 42)}  {dur}")
    if tool_name == "process":
        action = args.get("action", "?")
        sid = args.get("session_id", "")[:12]
@@ -951,7 +1305,7 @@ def get_cute_tool_message(
                  "wait": f"wait {sid}", "kill": f"kill {sid}", "write": f"write {sid}", "submit": f"submit {sid}"}
        return _wrap(f"┊ ⚙️  proc      {labels.get(action, f'{action} {sid}')}  {dur}")
    if tool_name == "read_file":
-        return _wrap(f"┊ 📖 read      {_path(args.get('path', ''))}  {dur}")
+        return _wrap(f"┊ 📖 read      {_trunc(build_tool_preview(tool_name, args) or args.get('path', ''), 42)}  {dur}")
    if tool_name == "write_file":
        return _wrap(f"┊ ✍️  write     {_path(args.get('path', ''))}  {dur}")
    if tool_name == "patch":
@@ -1037,8 +1391,6 @@ def get_cute_tool_message(
        return _wrap(f"┊ 🔊 speak     {_trunc(args.get('text', ''), 30)}  {dur}")
    if tool_name == "vision_analyze":
        return _wrap(f"┊ 👁️  vision    {_trunc(args.get('question', ''), 30)}  {dur}")
-    if tool_name == "mixture_of_agents":
-        return _wrap(f"┊ 🧠 reason    {_trunc(args.get('user_prompt', ''), 30)}  {dur}")
    if tool_name == "send_message":
        return _wrap(f"┊ 📨 send      {args.get('target', '?')}: \"{_trunc(args.get('message', ''), 25)}\"  {dur}")
    if tool_name == "cronjob":
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -133,6 +133,31 @@ _RATE_LIMIT_PATTERNS = [
    "servicequotaexceededexception",
 ]

+# Patterns that indicate provider-side overload, NOT a per-credential rate
+# limit or billing problem.  The credential is valid — the server is just
+# busy — so the correct recovery is "back off and retry the same key", never
+# "rotate the credential" (rotating exhausts the pool while the endpoint is
+# still busy; a single-key user has nothing to rotate to).  Some providers
+# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
+# status path matches the body against this list before falling through to
+# the rate_limit default.  Phrases are kept narrow and overload-flavoured so a
+# normal rate-limit message ("you have been rate-limited") doesn't hit this
+# bucket. (#14038, #15297)
+_OVERLOADED_PATTERNS = [
+    "overloaded",
+    "temporarily overloaded",
+    "service is temporarily overloaded",
+    "service may be temporarily overloaded",
+    "server is overloaded",
+    "server overloaded",
+    "service overloaded",
+    "service is overloaded",
+    "upstream overloaded",
+    "currently overloaded",
+    "at capacity",
+    "over capacity",
+]
+
 # Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
 _USAGE_LIMIT_PATTERNS = [
    "usage limit",
@@ -330,6 +355,14 @@ _CONTENT_POLICY_BLOCKED_PATTERNS = [
    # echo back; the underscore form is provider-specific enough.
    "content_filter",
    "responsibleaipolicyviolation",
+    # MiniMax output-layer safety filter. The error string is surfaced
+    # verbatim by MiniMax SDK / OpenAI-compatible endpoints, usually in the
+    # form "output new_sensitive (1027)" when the model's *output* (often a
+    # large tool-call argument block) trips the upstream safety filter and
+    # the SSE stream is truncated mid-flight. ``new_sensitive`` is the
+    # filter name and is narrow enough that billing / format / auth error
+    # strings will not collide. See #32421.
+    "new_sensitive",
 ]

 # Auth patterns (non-status-code signals)
@@ -717,6 +750,26 @@ def classify_api_error(

    is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
    if is_disconnect and not status_code:
+        # Reasoning-model override: a transport disconnect on a reasoning
+        # model is much more likely the upstream proxy idle-killing a
+        # long thinking stream than a true context overflow — even on
+        # large sessions.  The default disconnect+large-session routing
+        # below would otherwise send the user into the compression
+        # branch (should_compress=True) and silently delete
+        # conversation history on a phantom context-length error.
+        # Reasoning models have multi-minute thinking phases that
+        # routinely exceed the cloud gateway's idle window (NVIDIA
+        # NIM ~120s — first-party repro at NVIDIA/NemoClaw#4846;
+        # OpenAI worker / Anthropic stream-idle similar).  The
+        # per-reasoning-model stale-timeout floor in
+        # agent/reasoning_timeouts.py raises the stale-detector
+        # threshold to tolerate long thinking, so a true
+        # transport-layer failure here is recoverable via the retry
+        # path — not via context compression.  Reclassify as timeout.
+        # (Part 1 of Fixes #52310.)
+        from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
+        if get_reasoning_stale_timeout_floor(model) is not None:
+            return _result(FailoverReason.timeout, retryable=True)
        # Absolute token/message-count thresholds are only a proxy for smaller
        # context windows.  Large-context sessions can have hundreds of
        # messages while still being far below their actual token budget.
@@ -843,7 +896,19 @@ def _classify_by_status(
        )

    if status_code == 429:
-        # Already checked long_context_tier above; this is a normal rate limit
+        # Already checked long_context_tier above. Some providers (notably
+        # Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
+        # code as a true per-credential rate limit, but the credential is
+        # valid and the correct recovery is "back off and retry the same key",
+        # NOT "rotate the credential" (which exhausts the pool while the
+        # endpoint is still busy, and does nothing for a single-key user).
+        # Disambiguate on the error body so an overload 429 takes the
+        # transient-overload path instead of burning the pool. (#14038)
+        if any(p in error_msg for p in _OVERLOADED_PATTERNS):
+            return result_fn(
+                FailoverReason.overloaded,
+                retryable=True,
+            )
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
@@ -1194,6 +1259,17 @@ def _classify_by_message(
            should_fallback=True,
        )

+    # Overloaded / server-busy patterns — must come BEFORE the rate_limit and
+    # billing checks so that a message-only "overloaded" (no 503/529 status,
+    # e.g. some Anthropic-compatible proxies) classifies as a transient
+    # overload (backoff + retry) instead of falling through to `unknown` or
+    # incorrectly triggering credential rotation.
+    if any(p in error_msg for p in _OVERLOADED_PATTERNS):
+        return result_fn(
+            FailoverReason.overloaded,
+            retryable=True,
+        )
+
    # Billing patterns
    if any(p in error_msg for p in _BILLING_PATTERNS):
        return result_fn(
@@ -1283,19 +1359,25 @@ def _extract_status_code(error: Exception) -> Optional[int]:


 def _extract_error_body(error: Exception) -> dict:
-    """Extract the structured error body from an SDK exception."""
-    body = getattr(error, "body", None)
-    if isinstance(body, dict):
-        return body
-    # Some errors have .response.json()
-    response = getattr(error, "response", None)
-    if response is not None:
-        try:
-            json_body = response.json()
-            if isinstance(json_body, dict):
-                return json_body
-        except Exception:
-            pass
+    """Extract the structured error body from an SDK exception or its cause chain."""
+    current = error
+    for _ in range(5):  # Match _extract_status_code() traversal depth.
+        body = getattr(current, "body", None)
+        if isinstance(body, dict):
+            return body
+        # Some errors have .response.json()
+        response = getattr(current, "response", None)
+        if response is not None:
+            try:
+                json_body = response.json()
+                if isinstance(json_body, dict):
+                    return json_body
+            except Exception:
+                pass
+        cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
+        if cause is None or cause is current:
+            break
+        current = cause
    return {}


--- a/agent/file_safety.py
+++ b/agent/file_safety.py
@@ -77,15 +77,22 @@ def build_write_denied_prefixes(home: str) -> list[str]:
    ]


-def get_safe_write_root() -> Optional[str]:
-    """Return the resolved HERMES_WRITE_SAFE_ROOT path, or None if unset."""
-    root = os.getenv("HERMES_WRITE_SAFE_ROOT", "")
-    if not root:
-        return None
-    try:
-        return os.path.realpath(os.path.expanduser(root))
-    except Exception:
-        return None
+def get_safe_write_roots() -> set[str]:
+    """Return resolved HERMES_WRITE_SAFE_ROOT paths. Supports multiple directories
+    separated by ``os.pathsep`` (``:`` on Unix, ``;`` on Windows).
+    E.g., ``/opt/data:/var/www/html`` on Unix, ``C:\\data;D:\\www`` on Windows."""
+    env = os.getenv("HERMES_WRITE_SAFE_ROOT", "")
+    if not env:
+        return set()
+    roots: set[str] = set()
+    for path in env.split(os.pathsep):
+        if path:
+            try:
+                resolved = os.path.realpath(os.path.expanduser(path))
+                roots.add(resolved)
+            except (OSError, ValueError):
+                continue
+    return roots


 def is_write_denied(path: str) -> bool:
@@ -124,9 +131,15 @@ def is_write_denied(path: str) -> bool:
        except Exception:
            pass

-    safe_root = get_safe_write_root()
-    if safe_root and not (resolved == safe_root or resolved.startswith(safe_root + os.sep)):
-        return True
+    safe_roots = get_safe_write_roots()
+    if safe_roots:
+        allowed = False
+        for safe_root in safe_roots:
+            if resolved == safe_root or resolved.startswith(safe_root + os.sep):
+                allowed = True
+                break
+        if not allowed:
+            return True

    return False

--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -251,6 +251,78 @@ def _supports_vision_override(
    return None


+def _resolve_inference_base_url(
+    cfg: Optional[Dict[str, Any]],
+    provider: str,
+) -> str:
+    """Best-effort base URL for the active inference provider."""
+    try:
+        from agent.auxiliary_client import _RUNTIME_MAIN_BASE_URL
+
+        runtime = str(_RUNTIME_MAIN_BASE_URL or "").strip()
+        if runtime:
+            return runtime
+    except Exception:
+        pass
+
+    if not isinstance(cfg, dict):
+        return ""
+
+    model_cfg_raw = cfg.get("model")
+    model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
+    base_url = str(model_cfg.get("base_url") or "").strip()
+    if base_url:
+        return base_url
+
+    config_provider = str(model_cfg.get("provider") or "").strip()
+    candidate_names: set[str] = set()
+    for p in filter(None, (provider, config_provider)):
+        candidate_names.add(p)
+        if p.lower().startswith("custom:"):
+            candidate_names.add(p.split(":", 1)[1])
+        else:
+            candidate_names.add(f"custom:{p}")
+
+    providers_cfg = cfg.get("providers")
+    if isinstance(providers_cfg, dict):
+        for name in candidate_names:
+            entry = providers_cfg.get(name)
+            if isinstance(entry, dict):
+                bu = str(entry.get("base_url") or "").strip()
+                if bu:
+                    return bu
+
+    custom_providers = cfg.get("custom_providers")
+    if isinstance(custom_providers, list):
+        lowered = {n.lower() for n in candidate_names}
+        for entry_raw in custom_providers:
+            if not isinstance(entry_raw, dict):
+                continue
+            entry_name = str(entry_raw.get("name") or "").strip()
+            if entry_name not in candidate_names and entry_name.lower() not in lowered:
+                continue
+            bu = str(entry_raw.get("base_url") or "").strip()
+            if bu:
+                return bu
+
+    return ""
+
+
+def _should_probe_ollama_vision(provider: str, base_url: str) -> bool:
+    """True when the active provider likely fronts a local Ollama server."""
+    p = (provider or "").strip().lower()
+    if p == "ollama":
+        return True
+    if not base_url:
+        return False
+    try:
+        from agent.model_metadata import detect_local_server_type
+
+        return detect_local_server_type(base_url) == "ollama"
+    except Exception:
+        return False
+
+
 def _coerce_mode(raw: Any) -> str:
    """Normalize a config value into one of the valid modes."""
    if not isinstance(raw, str):
@@ -302,15 +374,33 @@ def _lookup_supports_vision(
        return override
    if not provider or not model:
        return None
+    caps = None
    try:
        from agent.models_dev import get_model_capabilities
        caps = get_model_capabilities(provider, model)
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc)
-        return None
-    if caps is None:
-        return None
-    return bool(caps.supports_vision)
+    if caps is not None:
+        return bool(caps.supports_vision)
+
+    base_url = _resolve_inference_base_url(cfg, provider)
+    if not base_url and (provider or "").strip().lower() == "ollama":
+        base_url = "http://localhost:11434/v1"
+    if _should_probe_ollama_vision(provider, base_url):
+        try:
+            from agent.model_metadata import query_ollama_supports_vision
+
+            ollama_vision = query_ollama_supports_vision(model, base_url)
+            if ollama_vision is not None:
+                return ollama_vision
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.debug(
+                "image_routing: ollama vision probe failed for %s:%s — %s",
+                provider,
+                model,
+                exc,
+            )
+    return None


 def decide_image_input_mode(
@@ -388,14 +478,98 @@ def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]:
    # BMP: "BM"
    if raw.startswith(b"BM"):
        return "image/bmp"
-    # HEIC/HEIF: ftypheic / ftypheix / ftypmif1 / ftypmsf1 etc.
-    if len(raw) >= 12 and raw[4:8] == b"ftyp" and raw[8:12] in {
-        b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1", b"heim", b"heis",
-    }:
-        return "image/heic"
+    # ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12
+    if len(raw) >= 12 and raw[4:8] == b"ftyp":
+        brand = raw[8:12]
+        if brand in {b"avif", b"avis"}:
+            return "image/avif"
+        if brand in {
+            b"heic", b"heix", b"hevc", b"hevx",
+            b"mif1", b"msf1", b"heim", b"heis",
+        }:
+            return "image/heic"
+    # TIFF: II*\0 (little-endian) or MM\0* (big-endian)
+    if raw[:4] in {b"II*\x00", b"MM\x00*"}:
+        return "image/tiff"
+    # ICO: 00 00 01 00 (reserved=0, type=1=icon)
+    if raw[:4] == b"\x00\x00\x01\x00":
+        return "image/x-icon"
+    # SVG: text-based, look for an <svg tag near the start (leading ASCII whitespace is skipped; a BOM is not)
+    head = raw[:512].lstrip().lower()
+    if head.startswith(b"<?xml") or head.startswith(b"<svg"):
+        if b"<svg" in head:
+            return "image/svg+xml"
    return None


+# Formats every major vision provider (Anthropic, OpenAI, Gemini, Bedrock)
+# accepts natively. Anything outside this set has to be transcoded to PNG
+# before we declare media_type, otherwise the provider returns HTTP 400
+# ("Could not process image" / "Unsupported image media type") and the
+# whole turn fails with no salvage path.
+#
+# Discord (and a few other chat platforms) freely accept attachments in
+# formats outside this set -- AVIF screenshots from Chromium, HEIC from
+# iPhones, TIFF from scanners, BMP from old Windows tools, ICO -- so users
+# do hit this in practice. SVG is vector and Pillow cannot rasterize it;
+# it is skipped (logged) rather than transcoded.
+_UNIVERSALLY_SUPPORTED_MIMES = frozenset({
+    "image/png", "image/jpeg", "image/gif", "image/webp",
+})
+
+
+def _transcode_to_png(raw: bytes) -> Optional[bytes]:
+    """Decode arbitrary image bytes with Pillow and re-encode as PNG.
+
+    Returns None if Pillow isn't installed or can't decode the input
+    (rare formats, corrupted bytes, missing optional decoder plugin for
+    HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping
+    the image so the rest of the turn still works.
+
+    HEIC/HEIF and AVIF need optional Pillow plugins; we try to register
+    them on demand and swallow ImportError so a missing plugin just
+    looks like 'Pillow can't decode this' rather than crashing.
+    """
+    try:
+        from PIL import Image
+    except ImportError:
+        logger.info(
+            "image_routing: Pillow not installed; cannot transcode "
+            "non-standard image format to PNG. Install with `pip install Pillow` "
+            "(and `pillow-heif` / `pillow-avif-plugin` for those formats)."
+        )
+        return None
+    # Optional plugin registration. Silent on failure: an unsupported
+    # format will just fall through to Image.open raising below.
+    try:
+        import pillow_heif  # type: ignore
+
+        pillow_heif.register_heif_opener()
+    except Exception:
+        pass
+    try:
+        import pillow_avif  # type: ignore  # noqa: F401  -- registers AVIF on import
+    except Exception:
+        pass
+    try:
+        from io import BytesIO
+
+        with Image.open(BytesIO(raw)) as im:
+            # Pick an output mode PNG can serialise. Anything other than
+            # the standard set gets normalised to RGBA so transparency is
+            # preserved where the source had it.
+            if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}:
+                im = im.convert("RGBA")
+            buf = BytesIO()
+            im.save(buf, format="PNG", optimize=False)
+            return buf.getvalue()
+    except Exception as exc:
+        logger.info(
+            "image_routing: Pillow could not transcode image to PNG -- %s", exc
+        )
+        return None
+
+
 def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str:
    """Return image MIME type for *path*.

@@ -431,8 +605,18 @@ def _file_to_data_url(path: Path) -> Optional[str]:
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

-    Returns None only if the file can't be read (missing, permission
-    denied, etc.); the caller reports those paths in ``skipped``.
+    Format compatibility IS handled here: if the sniffed MIME isn't one
+    of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF,
+    HEIC, BMP, TIFF, or ICO that some providers reject outright), we
+    transcode to PNG with Pillow before declaring media_type. This fixes
+    the user-visible "Could not process image" HTTP 400 from Anthropic on
+    Discord-attached AVIF/HEIC/BMP files.
+
+    Returns None if the file can't be read OR if the format isn't
+    universally supported AND Pillow can't transcode it (Pillow missing,
+    HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The
+    caller reports those paths in ``skipped`` and the rest of the turn
+    proceeds.
    """
    try:
        raw = path.read_bytes()
@@ -440,6 +624,22 @@ def _file_to_data_url(path: Path) -> Optional[str]:
        logger.warning("image_routing: failed to read %s — %s", path, exc)
        return None
    mime = _guess_mime(path, raw=raw)
+    if mime not in _UNIVERSALLY_SUPPORTED_MIMES:
+        transcoded = _transcode_to_png(raw)
+        if transcoded is None:
+            logger.warning(
+                "image_routing: %s is %s which is not accepted by all major "
+                "vision providers and could not be transcoded to PNG; "
+                "skipping this attachment.",
+                path, mime,
+            )
+            return None
+        logger.info(
+            "image_routing: transcoded %s (%s) -> image/png for provider compatibility",
+            path.name, mime,
+        )
+        raw = transcoded
+        mime = "image/png"
    b64 = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{b64}"

--- a/agent/learn_prompt.py
+++ b/agent/learn_prompt.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""``/learn`` — build the standards-guided prompt that turns whatever the user
+described into a reusable skill.
+
+``/learn`` is open-ended. The user can point it at anything they can describe:
+a directory of code, an API doc URL, a workflow they just walked the agent
+through in this conversation, or pasted notes. This module builds ONE prompt
+that instructs the live agent to:
+
+  1. Gather the sources the user named, using the tools it already has
+     (``read_file`` / ``search_files`` for dirs, ``web_extract`` for URLs, the
+     current conversation for "what I just did", the user's text for pasted
+     material).
+  2. Author a single ``SKILL.md`` via ``skill_manage`` that follows the Hermes
+     skill-authoring standards (description <=60 chars, the modern section
+     order, Hermes-tool framing, no invented commands).
+
+There is no separate distillation engine and no model-tool footprint: the
+agent does the work with its existing toolset, so this works identically on
+local, Docker, and remote terminal backends. Every surface (CLI ``/learn``,
+gateway ``/learn``, the dashboard "Learn a skill" panel) calls
+:func:`build_learn_prompt` and feeds the result to the agent as a normal turn.
+"""
+
+from __future__ import annotations
+
+# The house-style rules, distilled from AGENTS.md "Skill authoring standards
+# (HARDLINE)" and the hermes-agent-dev new-skill salvage reference. Embedded in
+# the prompt so the agent authors skills the way a maintainer would by hand.
+_AUTHORING_STANDARDS = """\
+Follow the Hermes skill-authoring standards exactly. These are the same
+HARDLINE rules a maintainer enforces in review:
+
+Frontmatter:
+- name: lowercase-hyphenated, <=64 chars, no spaces.
+- description: ONE sentence, **<=60 characters**, ends with a period. State the
+  capability, not the implementation. No marketing words (powerful,
+  comprehensive, seamless, advanced, robust). Do NOT repeat the skill name. If
+  the description contains a colon, wrap the whole value in double quotes.
+  This is the most-violated rule and it is NOT cosmetic: the system-prompt
+  skill index truncates the description to 60 chars and loads it every
+  session, so anything past char 60 is silently cut and never routes. After
+  you write the description, COUNT the characters; if it is over 60, cut it
+  down before saving — do not ship a sentence and hope.
+    Good (<=60): `Search arXiv papers by keyword, author, or ID.`
+    Bad (123):   `A comprehensive skill that lets the agent search arXiv for
+                  academic papers using keywords, authors, and categories.`
+- version: 0.1.0
+- author: always the literal value `Hermes`. NEVER fill it from the host
+  environment — the OS/login username (e.g. the `user=` line in your
+  environment hints), git config, or any identity you can probe must not be
+  written. Skills get shared and published, so an environment-derived name is
+  a privacy leak the user never opted into; the skill names itself as Hermes.
+- platforms: declare `[macos]`, `[linux]`, and/or `[windows]` IF the skill
+  uses OS-bound primitives (osascript/apt/systemctl => the matching OS; /proc,
+  os.setsid, signal.SIGKILL => linux; fcntl/termios => POSIX). Prefer fixing it
+  cross-platform first (tempfile.gettempdir(), pathlib.Path, psutil); gate only
+  when the dependency is genuinely platform-bound. Omit the field for portable
+  skills.
+- metadata.hermes.tags: a few Capitalized, Relevant, Tags.
+
+Body section order (omit a section only if it genuinely has no content):
+1. "# <Human Title>" then a 2-3 sentence intro: what it does, what it does NOT
+   do, and the key dependency stance (e.g. "stdlib only").
+2. "## When to Use" — bullet list of concrete trigger phrases.
+3. "## Prerequisites" — exact env vars, install steps, credentials.
+4. "## How to Run" — the canonical invocation, framed through Hermes tools.
+5. "## Quick Reference" — a flat command/endpoint list, no narration.
+6. "## Procedure" — numbered steps with copy-paste-exact commands.
+7. "## Pitfalls" — known limits, rate limits, things that look broken but aren't.
+8. "## Verification" — a single command/check that proves the skill worked.
+
+Hermes-tool framing (this is what makes it a skill, not shell docs):
+- Frame running scripts as "invoke through the `terminal` tool".
+- Reference Hermes tools by name in backticks: `terminal`, `read_file`,
+  `write_file`, `search_files`, `patch`, `web_extract`, `web_search`,
+  `vision_analyze`, `browser_navigate`, `delegate_task`, `image_generate`,
+  `text_to_speech`, `cronjob`, `memory`, `skill_view`, `execute_code`.
+- Do NOT name shell utilities the agent already has wrapped: say `read_file`
+  not cat/head/tail, `search_files` not grep/rg/find/ls, `patch` not sed/awk,
+  `web_extract` not curl-to-scrape, `write_file` not echo>file or heredocs.
+- Third-party CLIs (ffmpeg, gh, an SDK) are fine inside a script file, but the
+  prose still frames them as "invoke through the `terminal` tool". If the
+  skill needs an MCP server, name it and document its setup in Prerequisites.
+
+Quality bar:
+- Prefer exact commands, endpoint URLs, function signatures, and config keys
+  that appear VERBATIM in the source. NEVER invent flags, paths, or APIs — if
+  you didn't see it in the source, don't write it.
+- Keep it tight and scannable: ~100 lines for a simple skill, ~200 for a
+  complex one. Don't re-paste the source docs.
+- Don't write a router/index/hub skill that only points at other skills.
+- Larger scripts/parsers belong in a `scripts/` file (add via
+  `skill_manage` write_file), referenced from SKILL.md by relative path — not
+  inlined for the agent to re-type every run. References go in `references/`,
+  templates in `templates/`."""
+
+
+def build_learn_prompt(user_request: str) -> str:
+    """Compose the single-turn instruction behind an open-ended ``/learn``.
+
+    Args:
+        user_request: whatever the user typed after ``/learn`` — paths, URLs,
+            a workflow description, or "what I just did". Blank or ``None``
+            falls back to distilling the current conversation's workflow.
+
+    Returns:
+        The prompt text the agent runs as a normal turn.
+    """
+    source_desc = (user_request or "").strip() or (
+        "the workflow we just went through in this conversation — review "
+        "the steps taken and distill them into a reusable skill"
+    )
+    header = (
+        "[/learn] The user wants you to learn a reusable skill from the "
+        "source(s) they described below, and save it.\n\n"
+        f"WHAT TO LEARN FROM:\n{source_desc}\n\n"
+    )
+    steps = (
+        "Do this:\n"
+        "1. Gather the material. Resolve whatever the user named using the "
+        "tools you already have — `read_file`/`search_files` for local files "
+        "or directories, `web_extract` for URLs, the current conversation "
+        "history if they referred to something you just did, and the text "
+        "they pasted as-is. If the request is ambiguous about scope, make a "
+        "reasonable choice and note it; do not stall.\n"
+        "2. Author ONE SKILL.md and save it with the `skill_manage` tool "
+        "(action=\"create\"). Pick a sensible category. If the procedure needs "
+        "a non-trivial script, add it under the skill's `scripts/` with "
+        "`skill_manage` write_file and reference it by relative path.\n\n"
+    )
+    closing = (
+        "When done, tell the user the skill name, its category, and a "
+        "one-line summary of what it captured."
+    )
+    return header + steps + f"{_AUTHORING_STANDARDS}\n\n" + closing
--- a/agent/memory_manager.py
+++ b/agent/memory_manager.py
@@ -46,6 +46,39 @@ logger = logging.getLogger(__name__)
 _SYNC_DRAIN_TIMEOUT_S = 5.0


+def normalize_tool_schema(schema: Any) -> Optional[Dict[str, Any]]:
+    """Return a function-tool dict with a resolvable top-level ``name``.
+
+    Context engines and memory providers expose tool schemas via
+    ``get_tool_schemas()``. The expected shape is a bare function schema
+    (``{"name": ..., "description": ..., "parameters": ...}``) which callers
+    wrap as ``{"type": "function", "function": schema}``.
+
+    Some providers instead return an entry that is *already* in OpenAI tool
+    form (``{"type": "function", "function": {"name": ...}}``). Wrapping that
+    a second time produces ``{"type": "function", "function": {"type":
+    "function", "function": {...}}}`` whose ``function`` has no top-level
+    ``name``. Strict providers (e.g. DeepSeek) reject the *entire* request
+    with ``tools[N].function: missing field name`` (HTTP 400), so one bad
+    schema disables the whole toolset and breaks every turn (#47707).
+
+    This helper normalizes both shapes to the bare function schema and
+    returns ``None`` for anything without a resolvable name, so callers can
+    skip-with-warning rather than appending a nameless tool.
+    """
+    if not isinstance(schema, dict):
+        return None
+    # Unwrap an already-wrapped OpenAI tool entry; the isinstance guard below
+    # already proves the inner value is a dict, so no re-check is needed.
+    inner = schema.get("function")
+    if schema.get("type") == "function" and isinstance(inner, dict):
+        schema = inner
+    name = schema.get("name", "")
+    if not name or not isinstance(name, str):
+        return None
+    return schema
+
+
 def memory_provider_tools_enabled(enabled_toolsets: Optional[List[str]]) -> bool:
    """Return whether external memory-provider tools should be exposed."""
    if enabled_toolsets is None:
@@ -92,11 +125,17 @@ def inject_memory_provider_tools(agent: Any) -> int:
        agent.valid_tool_names = valid_tool_names

    added = 0
-    for schema in get_schemas():
-        if not isinstance(schema, dict):
+    for raw_schema in get_schemas():
+        schema = normalize_tool_schema(raw_schema)
+        if schema is None:
+            logger.warning(
+                "Memory provider returned a tool schema with no resolvable "
+                "name; skipping to avoid poisoning the request (%r)",
+                raw_schema,
+            )
            continue
-        tool_name = schema.get("name", "")
-        if not tool_name or tool_name in existing_tool_names:
+        tool_name = schema["name"]
+        if tool_name in existing_tool_names:
            continue
        tools.append({"type": "function", "function": schema})
        valid_tool_names.add(tool_name)
@@ -370,8 +409,11 @@ class MemoryManager:
        _core_tool_names = set(_HERMES_CORE_TOOLS)

        # Index tool names → provider for routing
-        for schema in provider.get_tool_schemas():
-            tool_name = schema.get("name", "")
+        for raw_schema in provider.get_tool_schemas():
+            schema = normalize_tool_schema(raw_schema)
+            if schema is None:
+                continue
+            tool_name = schema["name"]
            if tool_name in _core_tool_names:
                logger.warning(
                    "Memory provider '%s' tool '%s' shadows a reserved core "
@@ -658,11 +700,19 @@ class MemoryManager:
        seen = set()
        for provider in self._providers:
            try:
-                for schema in provider.get_tool_schemas():
-                    name = schema.get("name", "")
+                for raw_schema in provider.get_tool_schemas():
+                    schema = normalize_tool_schema(raw_schema)
+                    if schema is None:
+                        logger.warning(
+                            "Memory provider '%s' returned a tool schema with "
+                            "no resolvable name; skipping (%r)",
+                            provider.name, raw_schema,
+                        )
+                        continue
+                    name = schema["name"]
                    if name in _core_tool_names:
                        continue
-                    if name and name not in seen:
+                    if name not in seen:
                        schemas.append(schema)
                        seen.add(name)
            except Exception as e:
--- a/agent/message_sanitization.py
+++ b/agent/message_sanitization.py
@@ -279,6 +279,38 @@ def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
    return "{}"


+def close_interrupted_tool_sequence(messages: list, final_response: Any = None) -> bool:
+    """Close a transcript that ends on a ``tool`` message with an assistant turn.
+
+    ``/stop`` can cut a turn short right after a tool finished (or had its
+    execution cancelled) and before the model streamed a closing assistant
+    turn. If that tail is persisted, the next user message produces
+    ``… tool → user`` — a role-alternation violation. Strict providers
+    (Gemini, Claude) respond by hallucinating a continuation of the user's
+    message and ignoring prior context, which users experience as "lost
+    context" (#48879).
+
+    ``finalize_turn`` handles the happy interrupt path; the retry/backoff/error
+    interrupt aborts in ``conversation_loop`` ``return`` early and never reach
+    it, so they share this helper instead. Because ``final_response`` is
+    usually empty on an interrupt, the closing turn falls back to an explicit
+    placeholder rather than empty assistant content.
+
+    Mutates ``messages`` in place. Returns True if a closing turn was appended.
+    """
+    tail = messages[-1] if messages else None
+    if not isinstance(tail, dict) or tail.get("role") != "tool":
+        return False
+    # Non-string responses (None, structured payloads) count as empty.
+    closing_text = ""
+    if isinstance(final_response, str):
+        closing_text = final_response.strip()
+    if not closing_text:
+        closing_text = "Operation interrupted."
+    messages.append({"role": "assistant", "content": closing_text})
+    return True
+
+
 def _strip_non_ascii(text: str) -> str:
    """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing.

@@ -431,6 +463,7 @@ def _sanitize_structure_non_ascii(payload: Any) -> bool:

 __all__ = [
    "_SURROGATE_RE",
+    "close_interrupted_tool_sequence",
    "_sanitize_surrogates",
    "_sanitize_structure_surrogates",
    "_sanitize_messages_surrogates",
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -0,0 +1,586 @@
+"""Mixture-of-Agents runtime helpers for /moa turns.
+
+The slash command is deliberately not a model tool. It marks one user turn as
+MoA-enabled; the normal Hermes agent loop still owns tool calling and turn
+termination, while this module gathers reference-model context before each model
+iteration.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any
+
+from agent.auxiliary_client import call_llm
+from agent.transports import get_transport
+
+logger = logging.getLogger(__name__)
+
+# Upper bound on concurrent reference-model calls. References are independent
+# advisory calls (no tools, no inter-dependence), so we fan them out the same
+# way delegate_task runs a batch: all in flight at once, results collected when
+# every reference finishes. Presets rarely list more than a handful of
+# references; this cap just protects against a pathologically large preset
+# opening dozens of sockets at once.
+_MAX_REFERENCE_WORKERS = 8
+
+# Per-tool-result character budget for the advisory reference view. Tool
+# results can be huge (a full diff, a 5000-line file dump); replaying them
+# verbatim per reference per tool-loop step would blow the reference model's
+# context window and cost. We keep the agent's *actions* (tool calls) in full —
+# they are cheap, high-signal, and tell the reference what the agent did — but
+# preview each tool *result* head+tail so the reference still sees what came
+# back without replaying megabytes. The acting aggregator always gets the full,
+# untrimmed transcript; this budget only shapes the advisory copy.
+_REFERENCE_TOOL_RESULT_BUDGET = 4000
+
+# System prompt prepended to every reference-model call. References are
+# advisory — they do NOT act, call tools, or own the task. Without this
+# framing a reference receives the bare trimmed conversation and assumes it is
+# the acting agent: it then refuses ("I can't access repositories / URLs from
+# here") or tries to call tools it doesn't have. The prompt reframes the model
+# as an analyst whose job is to reason about the presented state and hand its
+# best thinking to the aggregator/orchestrator that will actually act.
+_REFERENCE_SYSTEM_PROMPT = (
+    "You are a reference advisor in a Mixture of Agents (MoA) process. You are "
+    "NOT the acting agent and you do NOT execute anything: you cannot call "
+    "tools, run commands, browse, or access files, repositories, or URLs, and "
+    "you should not try to or apologize for being unable to. A separate "
+    "aggregator/orchestrator model holds those capabilities and will take the "
+    "actual actions.\n\n"
+    "The conversation below is the current state of a task handled by that "
+    "acting agent. Your job is to give your most intelligent analysis of that "
+    "state: understand the goal, reason about the problem, and advise on what "
+    "to do next. Surface the best approach, concrete next steps and tool-use "
+    "strategy, likely pitfalls and risks, and anything the acting agent may "
+    "have missed or gotten wrong. Assume any referenced files, URLs, or "
+    "systems exist and reason about them from the context given rather than "
+    "asking for access.\n\n"
+    "Respond with your advice directly — no preamble, no disclaimers about "
+    "tools or access. Your response is private guidance handed to the "
+    "aggregator, not an answer shown to the user."
+)
+
+
+def _slot_label(slot: dict[str, str]) -> str:
+    """Return ``provider:model`` for a slot; missing or ``None`` fields render empty."""
+    return f"{str(slot.get('provider') or '').strip()}:{str(slot.get('model') or '').strip()}"
+
+
+def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
+    """Turn a reference/aggregator slot into ``call_llm`` keyword arguments.
+
+    A slot is only a provider/model selection. Calling it with a bare
+    ``call_llm(provider=..., model=...)`` would leave base_url/api_key/api_mode
+    unresolved and let the auxiliary auto-detector guess, so the provider is
+    routed through ``resolve_runtime_provider`` — the canonical
+    provider→api_mode/base_url/api_key resolver the CLI, gateway, and
+    delegate_task all use — and the slot gets its provider's real API surface
+    (e.g. MiniMax → anthropic_messages, GPT-5/o-series →
+    max_completion_tokens, custom endpoints → their base_url).
+
+    Returns provider/model plus the resolved base_url/api_key when available.
+    Any resolution error is logged at debug level and the bare provider/model
+    is returned, so a misconfigured slot still attempts the call rather than
+    aborting the whole MoA turn.
+    """
+    provider = str(slot.get("provider") or "").strip()
+    model = str(slot.get("model") or "").strip()
+    call_kwargs: dict[str, Any] = {"provider": provider, "model": model}
+    try:
+        from hermes_cli.runtime_provider import resolve_runtime_provider
+
+        runtime = resolve_runtime_provider(requested=provider, target_model=model)
+        effective = str(runtime.get("provider") or provider).strip().lower()
+        # An explicit base_url makes call_llm treat the target as a custom
+        # endpoint. That suits ordinary OpenAI-compatible targets but not OAuth /
+        # provider-backed ones, whose provider branch adds auth refresh, request
+        # metadata, or request-shape adapters — those stay identified by name.
+        by_name_only = effective in {"nous", "openai-codex", "xai-oauth"}
+        if not by_name_only:
+            # Hand over the resolved endpoint and credential so call_llm builds
+            # the request for the provider's actual API surface (base_url also
+            # selects the adapter, incl. anthropic_messages mode).
+            for field in ("base_url", "api_key"):
+                resolved = runtime.get(field)
+                if resolved:
+                    call_kwargs[field] = resolved
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug(
+            "MoA slot runtime resolution failed for %s: %s", _slot_label(slot), exc
+        )
+    return call_kwargs
+
+
+def _run_reference(
+    slot: dict[str, str],
+    ref_messages: list[dict[str, Any]],
+    *,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
+) -> tuple[str, str]:
+    """Query a single reference model; the result is ``(label, text)``.
+
+    ``_slot_runtime`` resolves the slot to its provider's real runtime and the
+    call goes through the same ``call_llm`` request-building path as any
+    other model, so per-model wire-format handling (anthropic_messages,
+    max_completion_tokens, fixed/forbidden temperature) treats a reference
+    exactly like an acting model. MoA adds no cap of its own: ``max_tokens``
+    defaults to ``None`` (omitted → the model's real maximum), and
+    ``temperature`` is just the user's configured preset value, which
+    call_llm may still override per model.
+
+    A failed call never propagates: it becomes a labelled ``[failed: ...]``
+    note so the aggregator can still act with partial context. Meant to run
+    inside a thread pool — ``call_llm`` is synchronous/blocking, so threads
+    (not asyncio) are the concurrency primitive, mirroring ``delegate_task``.
+    """
+    label = _slot_label(slot)
+    # The trimmed view (_reference_messages) already drops the agent's own
+    # system prompt, so this advisory-role prompt is the only system message
+    # the reference sees; it frames the model as an analyst, not the actor.
+    advisor_turn = {"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}
+    try:
+        response = call_llm(
+            task="moa_reference",
+            messages=[advisor_turn, *ref_messages],
+            temperature=temperature,
+            max_tokens=max_tokens,
+            **_slot_runtime(slot),
+        )
+        text = _extract_text(response)
+    except Exception as exc:
+        logger.warning("MoA reference model %s failed: %s", label, exc)
+        return label, f"[failed: {exc}]"
+    return label, text or "(empty response)"
+
+
+def _run_references_parallel(
+    reference_models: list[dict[str, str]],
+    ref_messages: list[dict[str, Any]],
+    *,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
+) -> list[tuple[str, str]]:
+    """Run every reference model concurrently; results keep the input order.
+
+    Mirrors ``delegate_task``'s batch mode: all references are submitted up
+    front and the call blocks until each has finished, since the aggregator
+    needs the complete set. Results are slotted by input index so the
+    ``Reference {idx}`` labelling stays stable. A slot whose provider is
+    ``moa`` is not called (recursion guard) and yields a labelled note.
+    """
+    if not reference_models:
+        return []
+
+    total = len(reference_models)
+    outputs: list[tuple[str, str] | None] = [None] * total
+    pending = {}
+    with ThreadPoolExecutor(max_workers=min(_MAX_REFERENCE_WORKERS, total)) as pool:
+        for position, slot in enumerate(reference_models):
+            if slot.get("provider") != "moa":
+                job = pool.submit(
+                    _run_reference,
+                    slot,
+                    ref_messages,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                )
+                pending[job] = position
+                continue
+            # Recursion guard: a MoA preset must not fan out into another one.
+            outputs[position] = (
+                _slot_label(slot),
+                "[skipped: MoA presets cannot recursively reference MoA]",
+            )
+        # No early-exit / first-completed path: wait on every submitted job
+        # because the aggregator consumes the whole set.
+        for job, position in pending.items():
+            outputs[position] = job.result()
+
+    return [entry for entry in outputs if entry is not None]
+
+
+def _truncate_tool_result(text: str, budget: int = _REFERENCE_TOOL_RESULT_BUDGET) -> str:
+    """Head+tail preview of a tool result for the advisory view.
+
+    Keeps the first and last ``budget // 2`` chars around a ``[... N chars
+    omitted ...]`` marker. The tail uses an absolute start index because
+    ``text[-0:]`` is the whole string, which matters when ``budget < 2``.
+    """
+    if not text or len(text) <= budget:
+        return text
+    half = max(budget, 0) // 2
+    omitted = len(text) - 2 * half
+    return f"{text[:half]}\n[... {omitted} chars omitted ...]\n{text[len(text) - half:]}"
+
+
+def _render_tool_calls(tool_calls: Any) -> str:
+    """Flatten ``tool_calls`` into ``[called tool: name(args)]`` text lines.
+
+    Strict providers reject tool_calls the reference never produced, so the
+    advisory view carries the agent's actions as plain text instead.
+    """
+    import json
+    def _args_as_text(args: Any) -> str:
+        if isinstance(args, str):
+            return args
+        if args is None:
+            return ""
+        try:
+            return json.dumps(args, ensure_ascii=False)
+        except Exception:
+            return str(args)
+
+    rendered: list[str] = []
+    for call in tool_calls or []:
+        is_mapping = isinstance(call, dict)
+        fn = (call.get("function") or {}) if is_mapping else {}
+        name = fn.get("name") or (call.get("name") if is_mapping else "") or "tool"
+        payload = _args_as_text(fn.get("arguments"))
+        rendered.append(f"[called tool: {name}({payload})]" if payload else f"[called tool: {name}]")
+    return "\n".join(rendered)
+
+
+def _content_text(content: Any) -> str:
+    """Return the plain text carried by a message ``content`` value.
+
+    Strings pass through unchanged. For a list of content parts (the
+    multimodal shape), the ``text`` of each ``{"type": "text"}`` part is
+    joined with newlines and non-text parts are omitted. Anything else
+    yields ``""``.
+    """
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return ""
+    return "\n".join(
+        part["text"]
+        for part in content
+        if isinstance(part, dict)
+        and part.get("type") == "text"
+        and isinstance(part.get("text"), str)
+    )
+
+
+def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Build an advisory view of the conversation for reference models.
+
+    A reference gives an INFORMED judgement on the current state, so it must
+    see what the agent actually did. The conversation flow is preserved but
+    flattened into clean user/assistant *text* turns:
+
+      - system prompt: dropped (Hermes boilerplate, not advisory signal).
+      - user turns: kept; a multimodal part list is reduced to its text parts
+        rather than collapsing to an empty turn.
+      - assistant turns: kept; any ``tool_calls`` are rendered inline as
+        ``[called tool: name(args)]`` text lines appended to the turn's text.
+      - ``tool``-role results: folded (head+tail preview, see
+        ``_truncate_tool_result``) into the *preceding* assistant turn as a
+        ``[tool result: ...]`` block, so the reference sees what came back.
+
+    This emits ZERO ``tool``-role messages and ZERO ``tool_calls`` arrays, so
+    strict providers (Mistral, Fireworks) that reject orphan tool messages /
+    unproduced tool_calls don't 400.
+
+    A view that would end on an assistant turn gets a synthetic ``user`` turn
+    appended: Anthropic (and OpenRouter→Anthropic) read a trailing assistant
+    turn as a *prefill*, and no-prefill models reject it with ``400 ... must
+    end with a user message``. Appending instead of deleting the agent's
+    latest turn keeps the current state visible to the reference.
+
+    The acting aggregator always receives the full, untrimmed transcript; this
+    function only shapes the disposable advisory copy.
+    """
+    advisory_instruction = (
+        "[The conversation above is the current state of the task. Give your "
+        "most intelligent judgement: what is going on, what should happen next, "
+        "what risks or mistakes you see, and how the acting agent should "
+        "proceed.]"
+    )
+
+    rendered: list[dict[str, Any]] = []
+    for msg in messages:
+        role = msg.get("role")
+        text = _content_text(msg.get("content"))
+
+        if role == "system":
+            continue
+        if role == "user":
+            rendered.append({"role": "user", "content": text})
+        elif role == "assistant":
+            parts: list[str] = []
+            if text.strip():
+                parts.append(text.strip())
+            calls_text = _render_tool_calls(msg.get("tool_calls"))
+            if calls_text:
+                parts.append(calls_text)
+            # Empty assistant turns (no text, no calls) carry nothing advisory.
+            if parts:
+                rendered.append({"role": "assistant", "content": "\n".join(parts)})
+        elif role == "tool":
+            # Fold the tool result into the preceding assistant turn as text so
+            # the reference sees what came back, without emitting a tool-role
+            # message a reference never produced.
+            block = f"[tool result: {_truncate_tool_result(text)}]"
+            if rendered and rendered[-1].get("role") == "assistant":
+                rendered[-1]["content"] = rendered[-1]["content"] + "\n" + block
+            else:
+                # No assistant turn to attach to (e.g. a leading tool result);
+                # keep it as advisory context on its own assistant-role line.
+                rendered.append({"role": "assistant", "content": block})
+        # Any other role is ignored.
+
+    # End on a user turn without discarding the agent's latest context.
+    if rendered and rendered[-1].get("role") == "assistant":
+        rendered.append({"role": "user", "content": advisory_instruction})
+    return rendered
+
+
+
+def _extract_text(response: Any) -> str:
+    try:
+        transport = get_transport("chat_completions")
+        if transport is None:
+            raise RuntimeError("chat_completions transport unavailable")
+        normalized = transport.normalize_response(response)
+        text = (normalized.content or "").strip()
+        if text:
+            return text
+    except Exception:
+        pass
+    try:
+        content = response.choices[0].message.content
+        return (content or "").strip()
+    except Exception:
+        return ""
+
+
+def aggregate_moa_context(
+    *,
+    user_prompt: str,
+    api_messages: list[dict[str, Any]],
+    reference_models: list[dict[str, str]],
+    aggregator: dict[str, str],
+    temperature: float = 0.6,
+    aggregator_temperature: float = 0.4,
+    max_tokens: int | None = None,
+) -> str:
+    """Run configured reference models and synthesize their advice.
+
+    Failures are returned as model-specific notes instead of aborting the normal
+    agent loop; the main model can still act with partial context.
+
+    ``max_tokens`` is ``None`` by default: MoA does not cap reference or
+    aggregator output, so each model uses its own maximum. ``call_llm`` omits
+    the parameter entirely when it is ``None`` (see its docstring), which also
+    sidesteps providers that reject ``max_tokens`` outright. A hardcoded cap
+    here previously truncated long aggregator syntheses.
+    """
+    reference_outputs: list[tuple[str, str]] = []
+    ref_messages = _reference_messages(api_messages)
+    reference_outputs = _run_references_parallel(
+        reference_models,
+        ref_messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+
+    joined = "\n\n".join(
+        f"Reference {idx} — {label}:\n{text}"
+        for idx, (label, text) in enumerate(reference_outputs, start=1)
+    )
+    synth_prompt = (
+        "You are the aggregator in a Mixture of Agents process. Synthesize the "
+        "reference responses into concise, actionable guidance for the main "
+        "Hermes agent. Focus on next steps, tool-use strategy, risks, and any "
+        "disagreements. Do not answer the user directly unless that is all that "
+        "is needed; produce context the main agent should use in its normal loop.\n\n"
+        f"Original user prompt:\n{user_prompt}\n\n"
+        f"Reference responses:\n{joined}"
+    )
+
+    agg_label = _slot_label(aggregator)
+    try:
+        response = call_llm(
+            task="moa_aggregator",
+            messages=[{"role": "user", "content": synth_prompt}],
+            temperature=aggregator_temperature,
+            max_tokens=max_tokens,
+            **_slot_runtime(aggregator),
+        )
+        synthesis = _extract_text(response)
+    except Exception as exc:
+        logger.warning("MoA aggregator model %s failed: %s", agg_label, exc)
+        synthesis = ""
+
+    if not synthesis:
+        synthesis = joined
+
+    return (
+        "[Mixture of Agents context — use this as private guidance for the "
+        "normal Hermes agent loop. You may call tools, continue reasoning, or "
+        "finish normally.]\n"
+        f"Aggregator: {agg_label}\n"
+        f"References: {', '.join(_slot_label(slot) for slot in reference_models)}\n\n"
+        f"{synthesis.strip()}"
+    )
+
+
+class MoAChatCompletions:
+    """OpenAI-chat-compatible facade where the aggregator is the acting model."""
+
+    def __init__(self, preset_name: str, reference_callback: Any = None):
+        self.preset_name = preset_name or "default"
+        # Optional display hook. Called as reference outputs become available so
+        # frontends can show each reference model's answer as a labelled block
+        # before the aggregator acts. Signature:
+        #   reference_callback(event, **kwargs)
+        # where event is one of:
+        #   "moa.reference"   kwargs: index, count, label, text
+        #   "moa.aggregating" kwargs: aggregator (label), ref_count
+        # Never raises into the model call — display is best-effort.
+        self.reference_callback = reference_callback
+        # State-scoped reference cache. The agent loop calls create() once per
+        # tool-loop iteration; references should re-run whenever the task STATE
+        # advances — i.e. on every new user message AND every new tool result —
+        # so each reference judges the latest state. The advisory view
+        # (_reference_messages) now renders tool calls + results as text, so its
+        # signature changes on every new tool response; the cache key is that
+        # signature, so a new tool result is a cache MISS (references re-run)
+        # while a redundant create() call with identical state is a HIT (no
+        # re-run, no re-emit). This gives "fire on every user/tool response"
+        # for free, without re-firing on a pure no-op re-call.
+        self._ref_cache_key: tuple | None = None
+        self._ref_cache_outputs: list[tuple[str, str]] = []
+
+    def _emit(self, event: str, **kwargs: Any) -> None:
+        cb = self.reference_callback
+        if cb is None:
+            return
+        try:
+            cb(event, **kwargs)
+        except Exception as exc:  # pragma: no cover - display must never break the turn
+            logger.debug("MoA reference_callback failed for %s: %s", event, exc)
+
+    def create(self, **api_kwargs: Any) -> Any:
+        from hermes_cli.config import load_config
+        from hermes_cli.moa_config import resolve_moa_preset
+
+        preset = resolve_moa_preset(load_config().get("moa") or {}, self.preset_name)
+        messages = list(api_kwargs.get("messages") or [])
+        reference_models = preset.get("reference_models") or []
+        aggregator = preset.get("aggregator") or {}
+        # MoA does not cap reference or aggregator output: each model uses its
+        # own maximum. Passing max_tokens=None makes call_llm omit the parameter
+        # (it never caps by default), so a long aggregator synthesis is never
+        # truncated and providers that reject max_tokens don't 400.
+        temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
+        aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)
+
+        # When the preset is disabled, skip the reference fan-out and let the
+        # configured aggregator act alone — it is the preset's acting model, so
+        # a disabled MoA preset is simply "use the aggregator directly."
+        if not preset.get("enabled", True):
+            reference_models = []
+
+        reference_outputs: list[tuple[str, str]] = []
+        ref_messages = _reference_messages(messages)
+
+        # State-scoped cache: only run + display references when the advisory
+        # view changed (a new user message or a new tool call/result). A repeat
+        # create() call with an identical advisory view reuses the cached
+        # outputs and skips both the re-run and the re-emit.
+        _sig = hashlib.sha256(
+            "\u0000".join(
+                f"{m.get('role')}:{m.get('content')}" for m in ref_messages
+            ).encode("utf-8", "replace")
+        ).hexdigest()
+        _cache_key = (self.preset_name, _sig, tuple(_slot_label(s) for s in reference_models))
+        _refs_from_cache = _cache_key == self._ref_cache_key and bool(self._ref_cache_outputs)
+
+        if _refs_from_cache:
+            reference_outputs = list(self._ref_cache_outputs)
+        else:
+            reference_outputs = _run_references_parallel(
+                reference_models,
+                ref_messages,
+                temperature=temperature,
+                max_tokens=None,
+            )
+            self._ref_cache_key = _cache_key
+            self._ref_cache_outputs = list(reference_outputs)
+
+            # Surface each reference model's answer to the display BEFORE the
+            # aggregator acts — once per reference run (only on the create()
+            # call that actually ran them, never on a cache hit). The user sees
+            # one labelled block per reference (rendered like a thinking block)
+            # so the MoA process is visible rather than a silent pause.
+            # Best-effort: never blocks the turn.
+            _ref_count = len(reference_outputs)
+            for _idx, (_label, _text) in enumerate(reference_outputs, start=1):
+                self._emit(
+                    "moa.reference",
+                    index=_idx,
+                    count=_ref_count,
+                    label=_label,
+                    text=_text,
+                )
+            if _ref_count:
+                self._emit(
+                    "moa.aggregating",
+                    aggregator=_slot_label(aggregator),
+                    ref_count=_ref_count,
+                )
+
+        agg_messages = [dict(m) for m in messages]
+        if reference_outputs:
+            joined = "\n\n".join(
+                f"Reference {idx} — {label}:\n{text}"
+                for idx, (label, text) in enumerate(reference_outputs, start=1)
+            )
+            guidance = (
+                "[Mixture of Agents reference context]\n"
+                f"Preset: {self.preset_name}\n"
+                f"Aggregator/acting model: {_slot_label(aggregator)}\n"
+                f"References: {', '.join(label for label, _ in reference_outputs)}\n\n"
+                "Use the reference responses below as private context. You are the aggregator and acting model: "
+                "answer the user directly or call tools as needed.\n\n"
+                f"{joined}"
+            )
+            for msg in reversed(agg_messages):
+                if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                    msg["content"] = msg["content"] + "\n\n" + guidance
+                    break
+            else:
+                agg_messages.append({"role": "user", "content": guidance})
+
+        if aggregator.get("provider") == "moa":
+            raise RuntimeError("MoA aggregator cannot be another MoA preset")
+        agg_kwargs = dict(api_kwargs)
+        agg_kwargs["messages"] = agg_messages
+        # The aggregator is the acting model. Resolve its slot to the provider's
+        # real runtime (base_url/api_key/api_mode) and call it through the same
+        # request-building path any model uses — so per-model wire-format
+        # handling (anthropic_messages, max_completion_tokens, fixed/forbidden
+        # temperature) applies identically to it. MoA imposes no output cap:
+        # max_tokens is passed through from the caller (normally None → omitted
+        # → the model's real maximum). The preset's old hardcoded 4096 default
+        # is gone — it truncated long syntheses.
+        return call_llm(
+            task="moa_aggregator",
+            messages=agg_messages,
+            temperature=aggregator_temperature,
+            max_tokens=agg_kwargs.get("max_tokens"),
+            tools=agg_kwargs.get("tools"),
+            extra_body=agg_kwargs.get("extra_body"),
+            **_slot_runtime(aggregator),
+        )
+
+
+class MoAClient:
+    def __init__(self, preset_name: str, reference_callback: Any = None):
+        self.chat = type("_MoAChat", (), {})()
+        self.chat.completions = MoAChatCompletions(preset_name, reference_callback=reference_callback)
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -478,6 +478,16 @@ def _infer_provider_from_url(base_url: str) -> Optional[str]:
    return None


+def _lmstudio_server_root(base_url: str) -> str:
+    """Return the LM Studio server root for native ``/api/v1`` endpoints."""
+    root = _normalize_base_url(base_url).rstrip("/")
+    for suffix in ("/api/v1", "/api", "/v1"):
+        if root.endswith(suffix):
+            root = root[: -len(suffix)].rstrip("/")
+            break
+    return root
+
+
 def _is_known_provider_base_url(base_url: str) -> bool:
    return _infer_provider_from_url(base_url) is not None

@@ -549,6 +559,7 @@ def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
    server_url = normalized
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]
+    lmstudio_url = _lmstudio_server_root(base_url)

    headers = _auth_headers(api_key)

@@ -556,7 +567,7 @@ def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
        with httpx.Client(timeout=2.0, headers=headers) as client:
            # LM Studio exposes /api/v1/models — check first (most specific)
            try:
-                r = client.get(f"{server_url}/api/v1/models")
+                r = client.get(f"{lmstudio_url}/api/v1/models")
                if r.status_code == 200:
                    return "lm-studio"
            except Exception:
@@ -774,7 +785,7 @@ def fetch_endpoint_model_metadata(
    if is_local_endpoint(normalized):
        try:
            if detect_local_server_type(normalized, api_key=api_key) == "lm-studio":
-                server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized
+                server_url = _lmstudio_server_root(normalized)
                response = requests.get(
                    server_url.rstrip("/") + "/api/v1/models",
                    headers=headers,
@@ -1188,6 +1199,56 @@ def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Option
    return None


+def query_ollama_supports_vision(model: str, base_url: str, api_key: str = "") -> Optional[bool]:
+    """Return True/False when Ollama ``/api/show`` reports vision support.
+
+    Uses the ``capabilities`` field on Ollama 0.6.0+ and falls back to
+    ``model_info.*.vision.block_count`` on older servers. Returns None when
+    the server is unreachable, not Ollama, or the model is unknown.
+    """
+    import httpx
+
+    bare_model = _strip_provider_prefix(model)
+    if not bare_model or not base_url:
+        return None
+
+    try:
+        if detect_local_server_type(base_url, api_key=api_key) != "ollama":
+            return None
+    except Exception:
+        return None
+
+    server_url = base_url.rstrip("/")
+    if server_url.endswith("/v1"):
+        server_url = server_url[:-3]
+
+    headers = _auth_headers(api_key)
+
+    try:
+        with httpx.Client(timeout=3.0, headers=headers) as client:
+            resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
+            if resp.status_code != 200:
+                return None
+            data = resp.json()
+    except Exception:
+        return None
+
+    caps = data.get("capabilities")
+    if isinstance(caps, list):
+        if any(str(cap).lower() == "vision" for cap in caps):
+            return True
+        if caps:
+            return False
+
+    model_info = data.get("model_info")
+    if isinstance(model_info, dict):
+        for key in model_info:
+            if "vision.block_count" in str(key).lower():
+                return True
+
+    return None
+
+
 def _query_ollama_api_show(model: str, base_url: str, api_key: str = "") -> Optional[int]:
    """Query an Ollama server's native ``/api/show`` for context length.

@@ -1297,6 +1358,7 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") ->
    server_url = base_url.rstrip("/")
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]
+    lmstudio_url = _lmstudio_server_root(base_url)

    headers = _auth_headers(api_key)

@@ -1340,7 +1402,7 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") ->
            # Use _model_id_matches for fuzzy matching: LM Studio stores models as
            # "publisher/slug" but users configure only "slug" after "local:" prefix.
            if server_type == "lm-studio":
-                resp = client.get(f"{server_url}/api/v1/models")
+                resp = client.get(f"{lmstudio_url}/api/v1/models")
                if resp.status_code == 200:
                    data = resp.json()
                    for m in data.get("models", []):
@@ -1646,6 +1708,34 @@ def get_model_context_length(
    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
        return config_context_length

+    # 0a. MoA virtual provider — ``model`` is a preset name, not a real model,
+    # and ``base_url`` is the local virtual endpoint, so every probe below would
+    # miss and fall through to the 256K default. The aggregator is the acting
+    # model, so resolve the context window from the aggregator slot's real
+    # provider+model instead. References are advisory-only and never bound the
+    # acting context, so they're ignored here.
+    if (provider or "").strip().lower() == "moa":
+        try:
+            from hermes_cli.config import load_config
+            from hermes_cli.moa_config import resolve_moa_preset
+            from hermes_cli.runtime_provider import resolve_runtime_provider
+
+            preset = resolve_moa_preset(load_config().get("moa") or {}, model)
+            agg = preset.get("aggregator") or {}
+            agg_provider = str(agg.get("provider") or "").strip()
+            agg_model = str(agg.get("model") or "").strip()
+            if agg_model and agg_provider and agg_provider.lower() != "moa":
+                rt = resolve_runtime_provider(requested=agg_provider, target_model=agg_model)
+                return get_model_context_length(
+                    agg_model,
+                    base_url=rt.get("base_url", "") or "",
+                    api_key=rt.get("api_key", "") or "",
+                    provider=agg_provider,
+                )
+        except Exception:
+            logger.debug("MoA aggregator context-length resolution failed", exc_info=True)
+        # Fall through to the generic default if aggregator resolution failed.
+
    # 0b. custom_providers per-model override — check before any probe.
    # This closes the gap where /model switch and display paths used to fall
    # back to 128K despite the user having a per-model context_length set.
--- a/agent/pet/__init__.py
+++ b/agent/pet/__init__.py
@@ -0,0 +1,51 @@
+"""Petdex pet engine — shared core for the CLI, TUI, and desktop surfaces.
+
+Petdex (https://github.com/crafter-station/petdex) is a public gallery of
+animated sprite "pets" for coding agents.  Each pet is a ``pet.json`` plus a
+``spritesheet.{webp,png}`` of 192×208 px cells. Current Codex/petdex sheets use
+an 8-column × 9-row atlas; older Hermes/petdex sheets used an 8-row atlas.
+Hermes infers the row taxonomy from the sheet and maps agent activity onto
+idle/run/review/failed/wave/jump.
+
+This package is the **single source of truth** for the feature so the base
+CLI (Python) and TUI (Ink, via ``tui_gateway``) never duplicate the hard
+parts:
+
+- :mod:`agent.pet.constants` — frame geometry + the :class:`PetState` enum.
+- :mod:`agent.pet.state`     — map agent activity → a :class:`PetState`.
+- :mod:`agent.pet.manifest`  — fetch the public petdex manifest.
+- :mod:`agent.pet.store`     — install / list / resolve pets on disk
+                               (profile-aware via ``get_hermes_home()``).
+- :mod:`agent.pet.render`    — decode a spritesheet and encode frames for a
+                               terminal (kitty / iTerm2 / sixel graphics
+                               protocols, with a Unicode half-block
+                               fallback).
+
+Rendering in the Electron desktop is necessarily TypeScript (canvas), but it
+reuses the same on-disk store and the same state semantics.
+
+The whole feature is a *display* concern: it adds no model tool, mutates no
+system prompt or toolset, and therefore has zero effect on prompt caching.
+"""
+
+from agent.pet.constants import (
+    DEFAULT_SCALE,
+    FRAME_H,
+    FRAME_W,
+    FRAMES_PER_STATE,
+    LOOP_MS,
+    STATE_ROWS,
+    PetState,
+)
+from agent.pet.state import derive_pet_state
+
+__all__ = [
+    "DEFAULT_SCALE",
+    "FRAME_H",
+    "FRAME_W",
+    "FRAMES_PER_STATE",
+    "LOOP_MS",
+    "STATE_ROWS",
+    "PetState",
+    "derive_pet_state",
+]
--- a/agent/pet/constants.py
+++ b/agent/pet/constants.py
@@ -0,0 +1,167 @@
+"""Pet sprite geometry + animation-state taxonomy.
+
+These values are the common petdex/Codex pet geometry. The real ``pet.json``
+usually only carries ``id``/``displayName``/``description``/``spritesheetPath``;
+row taxonomy is inferred from the atlas shape so Hermes can render both legacy
+8-row sheets and current 9-row Codex sheets.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+
+# Frame geometry (pixels). Current Codex/petdex spritesheets are 8 columns x 9
+# rows (1536x1872), while older Hermes/petdex sheets used 9 columns x 8 rows
+# (1728x1664). Renderers derive both row taxonomy and real column count from the
+# concrete sheet, so either shape works.
+FRAME_W = 192
+FRAME_H = 208
+
+# Frames consumed per animation state (the petdex web app uses CSS
+# ``steps(6)``).  A sheet may physically contain more columns; we only step
+# through the first ``FRAMES_PER_STATE``.
+FRAMES_PER_STATE = 6
+
+# Full-loop duration for one state, milliseconds (petdex default).
+LOOP_MS = 1100
+
+# Default on-screen scale relative to native frame size.  ``display.pet.scale``
+# is the single master scalar: the desktop canvas multiplies its native pixels
+# by it and every terminal surface derives its half-block/kitty column width
+# from it (see :func:`cols_for_scale`), so one number shrinks all three
+# interfaces together.  (petdex's own clients render at 0.7; we default smaller
+# so the kitty/GUI mascot stays a glanceable corner sprite.  The half-block
+# fallback can't shrink as far — see ``UNICODE_MIN_COLS`` — and clamps to its
+# legibility floor instead.)
+DEFAULT_SCALE = 0.33
+
+# User-settable scale bounds (``/pet scale``, desktop slider).  Floor keeps the
+# pet clickable/visible; ceiling stops a fat-fingered value from filling the
+# screen.  The unicode fallback additionally clamps to ``UNICODE_MIN_COLS``.
+MIN_SCALE = 0.1
+MAX_SCALE = 3.0
+
+
+def clamp_scale(scale: float) -> float:
+    """Clamp *scale* to ``[MIN_SCALE, MAX_SCALE]`` (the single validation point)."""
+    return max(MIN_SCALE, min(MAX_SCALE, scale))
+
+# Terminal cells one native frame spans at ``scale == 1.0``.  A cell is ~8px
+# wide, a frame is ``FRAME_W`` (192) px → 24 cells.  This mirrors the kitty
+# graphics placement (``scaled_px // 8``) so at full scale every renderer agrees.
+BASE_UNICODE_COLS = FRAME_W // 8
+
+# Legibility floor for the half-block fallback.  A half-block cell samples the
+# sprite at only 1 horizontal + 2 vertical taps, so below this width a 192×208
+# pet collapses into an unreadable blob *regardless* of scale.  kitty/GUI draw
+# true pixels and have no such floor — that's why the same ``scale: 0.33`` is
+# crisp there but mush in half-blocks.  ``scale`` shrinks the unicode pet down
+# TO this floor (and grows it above), instead of past it into noise.
+UNICODE_MIN_COLS = 16
+
+
+def cols_for_scale(scale: float) -> int:
+    """Half-block width implied by *scale*, clamped to the legibility floor.
+
+    Above the floor it tracks the kitty cell box (``scaled_px // 8``) so the two
+    renderers converge at larger sizes; below it the floor keeps the sprite
+    readable rather than letting it devolve into a blob.
+    """
+    return max(UNICODE_MIN_COLS, round(BASE_UNICODE_COLS * (scale or DEFAULT_SCALE)))
+
+
+def resolve_cols(scale: float, unicode_cols: int = 0) -> int:
+    """Resolve terminal width: explicit *unicode_cols* override, else from *scale*."""
+    return int(unicode_cols) if unicode_cols and int(unicode_cols) > 0 else cols_for_scale(scale)
+
+
+class PetState(str, Enum):
+    """Animation state a pet can be shown in.
+
+    These are Hermes' activity state names. They are not always identical to the
+    source atlas row names: Codex-format pets use rows like ``jumping`` /
+    ``running`` while the UI keeps the shorter ``jump`` / ``run`` names.
+    """
+
+    IDLE = "idle"
+    WAVE = "wave"
+    RUN = "run"
+    FAILED = "failed"
+    REVIEW = "review"
+    JUMP = "jump"
+    WAITING = "waiting"
+
+
+# Legacy Hermes/petdex row order (top -> bottom) used by the older 8-row,
+# 9-column atlas shape.
+LEGACY_STATE_ROWS: list[str] = [
+    PetState.IDLE.value,
+    PetState.WAVE.value,
+    PetState.RUN.value,
+    PetState.FAILED.value,
+    PetState.REVIEW.value,
+    PetState.JUMP.value,
+    "extra1",
+    "extra2",
+]
+
+# Current Petdex row order (top -> bottom) used by 1536x1872 atlases:
+# 8 columns x 9 rows of 192x208 cells.
+CODEX_STATE_ROWS: list[str] = [
+    PetState.IDLE.value,
+    "running-right",
+    "running-left",
+    "waving",
+    "jumping",
+    PetState.FAILED.value,
+    PetState.WAITING.value,
+    "running",
+    PetState.REVIEW.value,
+]
+
+# Default/fallback for callers without a sheet. Prefer the current 9-row Codex
+# format because generated pets and the public Codex pet contract use it.
+STATE_ROWS: list[str] = CODEX_STATE_ROWS
+
+# Canonical Hermes activity names -> accepted row-name aliases in descending
+# preference. This keeps our internal state names stable (`wave`/`jump`/`run`)
+# while matching Petdex's current `waving`/`jumping`/`running` taxonomy.
+STATE_ALIASES: dict[str, tuple[str, ...]] = {
+    PetState.IDLE.value: (PetState.IDLE.value,),
+    PetState.WAVE.value: (PetState.WAVE.value, "waving"),
+    PetState.JUMP.value: (PetState.JUMP.value, "jumping"),
+    PetState.RUN.value: (PetState.RUN.value, "running"),
+    PetState.FAILED.value: (PetState.FAILED.value,),
+    PetState.REVIEW.value: (PetState.REVIEW.value,),
+    PetState.WAITING.value: (PetState.WAITING.value,),
+}
+
+
+def state_aliases_for(state: "PetState | str") -> tuple[str, ...]:
+    """Return accepted row-name aliases for *state* (always non-empty)."""
+    value = state.value if isinstance(state, PetState) else str(state)
+    aliases = STATE_ALIASES.get(value)
+    return aliases if aliases else (value,)
+
+
+def state_rows_for_grid(row_count: int | None) -> list[str]:
+    """Return the row taxonomy for a spritesheet with *row_count* rows."""
+    try:
+        rows = int(row_count or 0)
+    except (TypeError, ValueError):
+        rows = 0
+
+    if rows >= len(CODEX_STATE_ROWS):
+        return CODEX_STATE_ROWS
+    return LEGACY_STATE_ROWS
+
+
+def state_row_index(state: "PetState | str", row_count: int | None = None) -> int:
+    """Return the row index for *state* (idle row 0 if unmatched; never raises)."""
+    rows = state_rows_for_grid(row_count)
+    for name in state_aliases_for(state):
+        try:
+            return rows.index(name)
+        except ValueError:
+            continue
+    return 0  # fall back to the idle row
--- a/agent/pet/generate/__init__.py
+++ b/agent/pet/generate/__init__.py
@@ -0,0 +1,29 @@
+"""Pet generation — base-draft → hatch pipeline.
+
+Public surface used by the gateway RPCs, the CLI ``hermes pets generate``
+command, and tests:
+
+- :func:`generate_base_drafts` / :func:`hatch_pet` — the two-step flow.
+- :class:`HatchResult`, :class:`GenerationError`.
+- :mod:`atlas` — deterministic frame extraction + atlas composition/validation.
+
+Image generation is delegated to the active reference-capable
+:class:`~agent.image_gen_provider.ImageGenProvider` (e.g. OpenAI gpt-image-2 or
+Krea); atlas assembly is fully deterministic so it's testable without any API calls.
+"""
+
+from __future__ import annotations
+
+from agent.pet.generate.imagegen import GenerationError
+from agent.pet.generate.orchestrate import (
+    HatchResult,
+    generate_base_drafts,
+    hatch_pet,
+)
+
+__all__ = [
+    "GenerationError",
+    "HatchResult",
+    "generate_base_drafts",
+    "hatch_pet",
+]
--- a/agent/pet/generate/atlas.py
+++ b/agent/pet/generate/atlas.py
--- a/agent/pet/generate/imagegen.py
+++ b/agent/pet/generate/imagegen.py
@@ -0,0 +1,251 @@
+"""Thin image-generation layer for pet sprites.
+
+Wraps the active :class:`~agent.image_gen_provider.ImageGenProvider` with the
+two things sprite generation needs that the agent-facing ``image_generate`` tool
+doesn't expose: **N variants** (loop) and **reference-image grounding** (so each
+animation row stays the same character as the chosen base).
+
+Reference grounding only works on providers that support it — the ones listed
+in ``_REF_CAPABLE`` below (Nous Portal, OpenAI ``gpt-image-2``, OpenRouter,
+Krea). We resolve to one of those and surface a clear, actionable error
+otherwise rather than silently producing an ungrounded, drifting pet.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# Providers that can ground generation on a reference image, in preference order
+# (Nous Portal → OpenAI → OpenRouter → …). OpenRouter/Nous run a quality-first
+# model chain and may fall back depending on account access and endpoint behavior,
+# so fidelity can vary by configured backend + model availability.
+# This tuple is also the allow-list for the HERMES_PET_IMAGE_PROVIDER override
+# and the iteration order used when picking a fallback provider.
+_REF_CAPABLE = ("nous", "openai", "openai-codex", "openrouter", "krea")
+
+# Friendly display label per reference-capable provider, surfaced in the desktop
+# pet-gen picker. Keys mirror ``_REF_CAPABLE``; a provider without an entry here
+# is shown under its raw name (see ``list_sprite_providers``).
+_PROVIDER_LABELS: dict[str, str] = {
+    "nous": "Nous Portal",
+    "openrouter": "OpenRouter",
+    "openai": "OpenAI",
+    "openai-codex": "OpenAI (Codex)",
+    "krea": "Krea",
+}
+
+
+def _forced_provider_from_env() -> str | None:
+    """Read the QA-only pet-gen backend override from the environment.
+
+    ``HERMES_PET_IMAGE_PROVIDER=<name>`` (e.g. ``openrouter``) pins pet
+    generation to one backend, bypassing the usual active/default resolution.
+    The value is trimmed and lower-cased; anything that is not one of the
+    reference-capable provider names yields ``None`` so a stray value never
+    affects existing users.
+    """
+    raw = os.environ.get("HERMES_PET_IMAGE_PROVIDER", "")
+    candidate = raw.strip().lower()
+    if candidate in _REF_CAPABLE:
+        return candidate
+    return None
+
+
+class GenerationError(RuntimeError):
+    """Raised on any image-generation failure (no provider, API error, IO).
+
+    ``resolve_provider`` raises it when no suitable backend is configured and
+    ``generate`` raises it when no variant produced a usable image; the message
+    is intended to be shown to the user.
+    """
+
+
+@dataclass(frozen=True)
+class SpriteProvider:
+    """Resolved provider plus whether it can take reference images."""
+
+    # Registry name the provider was resolved under (e.g. "openai").
+    name: str
+    # The provider instance itself; ``generate`` calls its ``.generate(prompt, **kwargs)``.
+    provider: object
+    # True for providers resolved from ``_REF_CAPABLE``; False only for the
+    # prompt-only fallback in ``resolve_provider(require_references=False)``.
+    supports_references: bool
+
+
+def _discover() -> None:
+    """Ask the plugin system to load image-gen plugins, ignoring any failure.
+
+    The import happens lazily inside the ``try`` so that a missing or broken
+    ``hermes_cli.plugins`` module is logged at debug level instead of raising.
+    """
+    try:
+        from hermes_cli.plugins import _ensure_plugins_discovered as ensure_discovered
+
+        ensure_discovered()
+    except Exception as err:  # noqa: BLE001 - discovery is best-effort
+        logger.debug("image-gen plugin discovery failed: %s", err)
+
+
+def resolve_provider(*, require_references: bool = True, prefer: str | None = None) -> SpriteProvider:
+    """Choose the image provider used for sprite generation.
+
+    Candidates are tried in this order, and the first one that exists and
+    reports itself available wins:
+
+    1. the ``HERMES_PET_IMAGE_PROVIDER`` QA override;
+    2. *prefer* (the desktop pet-gen picker), if it names a reference-capable
+       provider;
+    3. the configured/active provider, if it is reference-capable;
+    4. every reference-capable provider, in ``_REF_CAPABLE`` order;
+    5. only when *require_references* is false: the active provider even though
+       it cannot take references (used for prompt-only base drafts).
+
+    Raises :class:`GenerationError` when nothing qualifies.
+    """
+    _discover()
+    from agent.image_gen_registry import get_active_provider, get_provider
+
+    def _grounded(provider_name: str, candidate: object) -> SpriteProvider | None:
+        # Wrap a candidate as a reference-capable pick, or None if unusable.
+        if candidate is None or not candidate.is_available():
+            return None
+        return SpriteProvider(name=provider_name, provider=candidate, supports_references=True)
+
+    # QA override: pins pet-gen to one backend regardless of the globally
+    # active image_gen provider.
+    override = _forced_provider_from_env()
+    if override:
+        pick = _grounded(override, get_provider(override))
+        if pick is not None:
+            return pick
+
+    # Explicit user pick: honored only when reference-capable and usable;
+    # otherwise silently ignored in favor of the normal resolution below.
+    if prefer:
+        preferred = get_provider(prefer)
+        if prefer in _REF_CAPABLE:
+            pick = _grounded(prefer, preferred)
+            if pick is not None:
+                return pick
+
+    # Configured / active provider next. A registry failure just means
+    # "no active provider".
+    try:
+        active = get_active_provider()
+    except Exception:  # noqa: BLE001
+        active = None
+    if active is not None:
+        active_name = getattr(active, "name", "")
+        if active_name in _REF_CAPABLE:
+            pick = _grounded(active_name, active)
+            if pick is not None:
+                return pick
+
+    # Otherwise the first usable reference-capable provider, in preference order.
+    for candidate_name in _REF_CAPABLE:
+        pick = _grounded(candidate_name, get_provider(candidate_name))
+        if pick is not None:
+            return pick
+
+    # Prompt-only callers can live with a provider that cannot take references.
+    if not require_references and active is not None and active.is_available():
+        return SpriteProvider(
+            name=getattr(active, "name", "unknown"), provider=active, supports_references=False
+        )
+
+    raise GenerationError(
+        "Pet generation needs an image backend that supports reference images. "
+        "Open `hermes tools` → Image Generation and configure Nous Portal, "
+        "OpenRouter, or OpenAI (gpt-image-2) with an API key."
+    )
+
+
+def list_sprite_providers() -> list[dict]:
+    """List the reference-capable providers the user can pick for pet generation.
+
+    Each entry is ``{"name", "label", "default"}``. Only providers that exist in
+    the registry and report themselves available are included, in
+    ``_REF_CAPABLE`` order. ``default`` marks the one :func:`resolve_provider`
+    would select with no explicit preference. The list is empty when nothing is
+    configured (the picker hides itself).
+    """
+    _discover()
+    from agent.image_gen_registry import get_provider
+
+    # No resolvable default simply means no entry gets flagged.
+    try:
+        default_name = resolve_provider(require_references=True).name
+    except GenerationError:
+        default_name = ""
+
+    choices: list[dict] = []
+    for name in _REF_CAPABLE:
+        candidate = get_provider(name)
+        if candidate is not None and candidate.is_available():
+            choices.append(
+                {
+                    "name": name,
+                    "label": _PROVIDER_LABELS.get(name, name),
+                    "default": name == default_name,
+                }
+            )
+    return choices
+
+
+def _save_local(image_ref: str, *, prefix: str) -> Path:
+    """Resolve *image_ref* to a local file path.
+
+    An ``http://`` or ``https://`` reference is downloaded via
+    ``save_url_image`` (using *prefix* for the saved file); anything else is
+    taken to already be a local path and is returned as-is.
+    """
+    is_remote = image_ref.startswith(("http://", "https://"))
+    if not is_remote:
+        return Path(image_ref)
+
+    from agent.image_gen_provider import save_url_image
+
+    return Path(save_url_image(image_ref, prefix=prefix))
+
+
+def _rejected_background(error: str) -> bool:
+    """Tell whether a provider error is a rejection of the ``background`` param.
+
+    Transparent backgrounds are a per-model capability (some gpt-image tiers
+    refuse ``background=transparent`` outright). Spotting that one rejection
+    lets the caller retry without the flag instead of failing the whole pet —
+    the chroma-key pass makes the result transparent regardless.
+
+    The match is case-insensitive: the text must mention ``background`` and
+    also either ``not supported`` or ``transparent``. A falsy *error* is never
+    a match.
+    """
+    text = (error or "").lower()
+    if "background" not in text:
+        return False
+    return "not supported" in text or "transparent" in text
+
+
+def generate(
+    prompt: str,
+    *,
+    n: int = 1,
+    reference_images: list[Path] | None = None,
+    provider: SpriteProvider | None = None,
+    prefix: str = "pet_gen",
+    aspect_ratio: str = "square",
+) -> list[Path]:
+    """Generate *n* sprite images and return their local paths.
+
+    Args:
+        prompt: Text prompt handed to the provider for every variant.
+        n: Number of variants to attempt; values below 1 are treated as 1.
+        reference_images: Images that ground the output on a base look
+            (required for animation rows). Passed to the provider as strings.
+        provider: Already-resolved provider; when omitted one is resolved,
+            requiring reference support only if *reference_images* is given.
+        prefix: Filename prefix used when a returned URL has to be downloaded.
+        aspect_ratio: Canvas to request: ``"square"`` for single-character base
+            drafts, ``"landscape"`` for multi-frame row strips (the wider 1536px
+            canvas gives every frame real horizontal room so winged poses don't
+            have to be shrunk to avoid touching their neighbors).
+
+    Returns:
+        Paths of the variants that succeeded (between 1 and ``max(1, n)``).
+
+    Raises:
+        GenerationError: if the provider cannot take the requested references,
+            or if no variant produced a usable image (the message is the last
+            provider error seen).
+
+    We *ask* for a transparent background, but fall back to an opaque generation
+    (cleaned up downstream by the chroma-key pass) on models that reject the
+    flag.
+    """
+    sprite = provider or resolve_provider(require_references=bool(reference_images))
+    if reference_images and not sprite.supports_references:
+        raise GenerationError(
+            f"image backend '{sprite.name}' cannot use reference images; "
+            "configure OpenAI gpt-image-2 or Krea for pet generation"
+        )
+
+    refs = [str(p) for p in (reference_images or [])]
+
+    def _run(extra: dict) -> tuple[Path | None, str]:
+        # One provider call → (saved path, "") on success, (None, reason) on failure.
+        kwargs: dict = {"aspect_ratio": aspect_ratio, **extra}
+        if refs:
+            # Providers disagree on the ref kwarg name: our OpenRouter/Nous
+            # backends read ``reference_images``, OpenAI's gpt-image-2 reads
+            # ``reference_image_urls``. Send both; each ignores the other.
+            kwargs["reference_images"] = refs
+            kwargs["reference_image_urls"] = refs
+        try:
+            result = sprite.provider.generate(prompt, **kwargs)
+        except Exception as exc:  # noqa: BLE001 - normalize provider crashes
+            logger.debug("provider.generate crashed: %s", exc)
+            return None, str(exc)
+        if not isinstance(result, dict):
+            return None, "no result"
+        if not result.get("success"):
+            # Providers are plugins, so the ``error`` payload may be missing,
+            # None, or not a string. Always hand back a non-empty ``str`` so the
+            # background-rejection check and the final GenerationError message
+            # never trip over an unexpected type.
+            return None, str(result.get("error") or "unknown error")
+        image_ref = result.get("image")
+        if not image_ref:
+            return None, "provider returned no image"
+        try:
+            return _save_local(str(image_ref), prefix=prefix), ""
+        except Exception as exc:  # noqa: BLE001
+            return None, f"could not save generated image: {exc}"
+
+    out: list[Path] = []
+    last_error = ""
+    allow_transparent = True
+    for _ in range(max(1, n)):
+        path, err = _run({"background": "transparent"} if allow_transparent else {})
+        # Model doesn't support the transparent flag → drop it for this and every
+        # remaining variant (no point re-probing a capability we just disproved).
+        if path is None and allow_transparent and _rejected_background(err):
+            allow_transparent = False
+            path, err = _run({})
+        if path is not None:
+            out.append(path)
+        else:
+            last_error = err
+
+    if not out:
+        raise GenerationError(last_error or "image generation produced no output")
+    return out
--- a/agent/pet/generate/orchestrate.py
+++ b/agent/pet/generate/orchestrate.py
@@ -0,0 +1,358 @@
+"""Pet generation orchestration — the base-draft → hatch flow.
+
+Two steps, mirroring the UX across every surface:
+
+1. :func:`generate_base_drafts` — a handful of prompt-only "what should this pet
+   look like" variants. Cheap; the user picks one (or retries for a fresh set).
+2. :func:`hatch_pet` — takes the chosen base and generates one grounded row
+   strip per Hermes state, slices each into frames, composes the atlas, validates
+   it, and writes the pet into the store.
+
+Splitting it this way bounds cost (4 cheap base calls per round; the ~8 row
+calls happen once, on the pet you actually keep) and gives each UI a natural
+preview/loading point.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable
+
+from agent.pet.generate import atlas, imagegen, prompts
+from agent.pet.generate.imagegen import GenerationError, SpriteProvider
+
+logger = logging.getLogger(__name__)
+
+# (event, detail) callback used by ``hatch_pet``. Events it emits:
+# ("row", "<state>:<done>:<total>"), ("row", "idle-fallback"), ("compose", ""),
+# ("save", "<slug>").
+ProgressFn = Callable[[str, str], None]
+
+# Image generations are independent network calls, so we fan them out instead of
+# blocking on each in turn — a hatch is ~8 row calls that would otherwise run
+# back-to-back and routinely blow past the client's RPC timeout. Capped so we
+# don't hammer the provider's rate limit (one cold call can still be slow).
+_MAX_PARALLEL_GENERATIONS = 4
+# How many times to (re)generate a single row before accepting a best-effort
+# slice. Early attempts demand clean per-pose gutters; the last is lenient so a
+# stubborn row still yields frames instead of dropping out entirely.
+_ROW_GEN_ATTEMPTS = 3
+# A hatch is rejected when the validated atlas reports fewer filled states than
+# this, even if every required state below is present.
+_MIN_FILLED_STATES = 6
+# States that must be filled in the validated atlas; a hatch missing any of
+# them raises GenerationError rather than saving the pet.
+_REQUIRED_STATES = frozenset({"idle", "running-right", "waving"})
+
+
+@dataclass(frozen=True)
+class HatchResult:
+    """Outcome of a successful :func:`hatch_pet`."""
+
+    # slug / display_name / spritesheet are copied from the pet record returned
+    # by ``store.register_local_pet``.
+    slug: str
+    display_name: str
+    spritesheet: Path
+    # The ``filled_states`` entry of the atlas validation report.
+    states: list[str]
+    # The full report returned by ``atlas.validate_atlas`` for the composed sheet.
+    validation: dict
+
+
+def _harden_transparency(path: Path) -> Path:
+    """Strip any solid backdrop from the image at *path* and save it as RGBA PNG.
+
+    Every call requests ``background=transparent``, but image models honor it
+    inconsistently — some still paint a flat (often near-white) backdrop. This
+    applies the same chroma-key pass the row extractor uses, so each base draft
+    offered to the user (and the reference the rows are grounded on) is a clean
+    cutout.
+
+    Returns the path of the written ``.png`` (the input path with its suffix
+    replaced). Best-effort: if anything in the pass fails, the failure is logged
+    at debug level and the original *path* is returned unchanged.
+    """
+    from PIL import Image
+
+    try:
+        with Image.open(path) as source:
+            rgba = source.convert("RGBA")
+            cutout = atlas.remove_background(rgba)
+        # Zero the RGB of leftover semi-transparent edge pixels so the keyed
+        # draft shows no colored halo when composited on the dark UI.
+        cutout = atlas._clear_transparent_rgb(cutout)
+        target = path.with_suffix(".png")
+        cutout.save(target, format="PNG")
+    except Exception as exc:  # noqa: BLE001 - cosmetic; fall back to the raw image
+        logger.debug("base draft transparency hardening failed for %s: %s", path, exc)
+        return path
+    return target
+
+
+def generate_base_drafts(
+    concept: str,
+    *,
+    n: int = 4,
+    style: str = "auto",
+    reference_images: list[Path] | None = None,
+    provider: SpriteProvider | None = None,
+    on_draft: Callable[[int, Path], None] | None = None,
+    is_cancelled: Callable[[], bool] | None = None,
+) -> list[Path]:
+    """Generate *n* candidate base looks for *concept*; returns image paths.
+
+    Each draft is hardened to a transparent cutout (see :func:`_harden_transparency`).
+    Drafts are generated concurrently and *on_draft(index, path)* fires as each
+    one finishes (not at the end) so callers can stream previews to the UI
+    instead of leaving it blank until the whole batch is done.
+
+    *is_cancelled*, when supplied, is polled cooperatively: a draft that hasn't
+    started yet is skipped, and once it trips we stop staging/streaming further
+    drafts and cancel any queued work (already-in-flight provider calls can't be
+    hard-killed, but their results are dropped).
+
+    Args:
+        concept: Free-text description of the pet, fed into the base prompt.
+        n: Number of drafts to attempt.
+        style: Style key forwarded to ``prompts.build_base_prompt``.
+        reference_images: Optional user images grounding every draft; when
+            given, a reference-capable provider is required.
+        provider: Already-resolved provider; resolved here when omitted.
+        on_draft: Called on the calling thread with ``(index, path)`` for each
+            finished draft. Exceptions it raises are logged and ignored.
+        is_cancelled: Cooperative cancellation probe (see above).
+
+    Returns:
+        Paths of the successful drafts, ordered by draft index. May be empty
+        only when the round was cancelled.
+
+    Raises:
+        GenerationError: when no draft succeeded and the round was not
+            cancelled; the message is the humanized most-common failure.
+    """
+    # A user reference image (e.g. their own pet) grounds every draft, so it
+    # needs a reference-capable provider — same requirement as the row passes.
+    refs = reference_images or None
+    sprite = provider or imagegen.resolve_provider(require_references=bool(refs))
+    cancelled = is_cancelled or (lambda: False)
+
+    # Each draft is its own one-shot generation, run concurrently so the user
+    # waits for one image, not N. A single draft failing must not sink the set.
+    # Each gets a distinct variation nudge so the options aren't near-duplicates.
+    logger.info("pet generate: drafting %d base looks for %r (style=%s)", n, concept, style)
+
+    # Worker body: returns (index, path-or-None, error-or-None). A (None, None)
+    # pair means the draft was skipped because of cancellation, not a failure.
+    def _one(index: int) -> tuple[int, Path | None, str | None]:
+        if cancelled():
+            return index, None, None
+        t0 = time.monotonic()
+        # Variation nudges cycle when more drafts are requested than there are nudges.
+        variation = prompts.BASE_VARIATIONS[index % len(prompts.BASE_VARIATIONS)]
+        prompt = prompts.build_base_prompt(concept, style=style, variation=variation)
+        try:
+            out = imagegen.generate(prompt, n=1, reference_images=refs, provider=sprite, prefix="pet_base")
+        except Exception as exc:  # noqa: BLE001 - tolerate a single failed draft
+            logger.warning("pet generate: draft %d failed after %.1fs: %s", index, time.monotonic() - t0, exc)
+            return index, None, str(exc)
+        if not out:
+            logger.warning("pet generate: draft %d produced no image", index)
+            return index, None, "the image provider returned no image"
+        logger.info("pet generate: draft %d ready in %.1fs", index, time.monotonic() - t0)
+        return index, _harden_transparency(out[0]), None
+
+    workers = max(1, min(n, _MAX_PARALLEL_GENERATIONS))
+    results: dict[int, Path] = {}
+    errors: list[str] = []
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = [pool.submit(_one, i) for i in range(n)]
+        # as_completed runs in *this* (the caller's) thread, so on_draft — and any
+        # gateway event it emits — inherits the request's bound transport, unlike
+        # the worker threads above.
+        for fut in as_completed(futures):
+            if cancelled():
+                logger.info("pet generate: cancelled — dropping remaining drafts")
+                # cancel() only stops futures that have not started; leaving the
+                # ``with`` block still waits for the in-flight ones to finish.
+                for pending in futures:
+                    pending.cancel()
+                break
+            index, path, err = fut.result()
+            if path is None:
+                # Skipped-by-cancellation drafts carry no error and are not counted.
+                if err:
+                    errors.append(err)
+                continue
+            results[index] = path
+            if on_draft is not None:
+                try:
+                    on_draft(index, path)
+                except Exception as exc:  # noqa: BLE001 - progress is best-effort
+                    logger.debug("on_draft callback failed: %s", exc)
+
+    # Completion order is arbitrary; hand drafts back in index order.
+    drafts = [results[i] for i in sorted(results)]
+    if not drafts and not cancelled():
+        # Surface *why* — every draft failed for a reason (a content-policy refusal
+        # on a name like "minion", a provider/auth error, …); the most common one
+        # is the representative cause. Far more useful than "no usable drafts".
+        raise GenerationError(_drafts_failed_reason(errors))
+    return drafts
+
+
+def _drafts_failed_reason(errors: list[str]) -> str:
+    """Summarize why a draft round yielded nothing, as one friendly sentence.
+
+    With no recorded errors a generic message is returned; otherwise the most
+    frequent error text is taken as representative and humanized.
+    """
+    if errors:
+        from collections import Counter
+
+        ((top_error, _occurrences),) = Counter(errors).most_common(1)
+        return _humanize_image_error(top_error)
+    return "image generation produced no usable drafts"
+
+
+def _humanize_image_error(error: str) -> str:
+    """Turn a raw provider error into a friendly, actionable sentence.
+
+    The big one is moderation: image models refuse trademarked characters and
+    real people (e.g. "minion"), which reads as an opaque 400 otherwise.
+
+    Recognized categories (case-insensitive substring match, checked in this
+    order): moderation/safety blocks, authentication problems, rate limiting.
+    Anything else is reduced to its first non-blank line, stripped and capped
+    at 200 characters. An empty or whitespace-only *error* yields a generic
+    message instead of raising or returning an empty string.
+    """
+    raw = error or ""
+    low = raw.lower()
+    if any(s in low for s in ("moderation_blocked", "safety system", "content policy", "content_policy")):
+        return (
+            "The image provider blocked this prompt — its safety filter rejects "
+            "trademarked characters and real people. Try an original description."
+        )
+    if any(s in low for s in ("api key", "unauthorized", "401", "auth")):
+        return "The image provider rejected the request — check your API key in Settings → Providers."
+    if "rate limit" in low or "429" in low:
+        return "The image provider is rate-limiting — wait a moment and try again."
+    # Otherwise the first meaningful line, trimmed of the noisy provider
+    # envelope. Blank leading lines are skipped so the result is never empty.
+    for line in raw.splitlines():
+        trimmed = line.strip()
+        if trimmed:
+            return trimmed[:200]
+    return "image generation failed and the provider gave no error detail"
+
+
+def hatch_pet(
+    *,
+    base_image: str | Path,
+    slug: str,
+    display_name: str = "",
+    description: str = "",
+    concept: str = "",
+    style: str = "auto",
+    on_progress: ProgressFn | None = None,
+    provider: SpriteProvider | None = None,
+    is_cancelled: Callable[[], bool] | None = None,
+) -> HatchResult:
+    """Turn an approved base image into a full, installed Hermes pet.
+
+    Generates a grounded row strip per state, extracts frames, composes +
+    validates the atlas, and registers it. The idle row falls back to the base
+    look so the pet always renders. Raises :class:`GenerationError` on failure.
+
+    *is_cancelled*, when supplied, is polled cooperatively: rows that haven't
+    started are skipped, queued rows are cancelled, and once every row is done we
+    abort (raising :class:`GenerationError`) before composing/saving so a stopped
+    hatch never writes a half-built pet.
+
+    Args:
+        base_image: Path of the chosen base draft; must be an existing file.
+        slug: Identifier the pet is registered under.
+        display_name: Human-readable name; defaults to *slug* when empty.
+        description: Free-text description stored with the pet.
+        concept: Label used in the row prompts; falls back to *display_name*,
+            then *slug*.
+        style: Style key forwarded to ``prompts.build_row_prompt``.
+        on_progress: Called on the calling thread with ``(event, detail)``:
+            ``("row", "<state>:<done>:<total>")`` per finished row,
+            ``("row", "idle-fallback")``, ``("compose", "")``, ``("save", slug)``.
+            Unlike ``on_draft`` in :func:`generate_base_drafts`, exceptions it
+            raises are not caught here.
+        provider: Already-resolved provider; otherwise a reference-capable one
+            is resolved.
+        is_cancelled: Cooperative cancellation probe (see above).
+
+    Returns:
+        A :class:`HatchResult` describing the registered pet.
+
+    Raises:
+        GenerationError: base image missing, hatch cancelled, atlas validation
+            failed, a required state is missing, or too few states were filled.
+    """
+    base = Path(base_image)
+    if not base.is_file():
+        raise GenerationError(f"base image not found: {base}")
+
+    sprite = provider or imagegen.resolve_provider(require_references=True)
+    progress = on_progress or (lambda *_: None)
+    cancelled = is_cancelled or (lambda: False)
+    label = concept or display_name or slug
+
+    frames_by_state: dict[str, list] = {}
+    # The progress total counts every atlas row, including the mirrored
+    # running-left row that is derived rather than generated.
+    total_rows = len(atlas.ROW_SPECS)
+    logger.info("pet hatch %r: generating %d animation rows", slug, total_rows)
+
+    # Generate every state's row strip concurrently — they're independent
+    # grounded calls, so the hatch waits for the slowest row, not their sum. A
+    # single row failing is tolerated (idle is guaranteed below).
+    def _gen_row(spec: tuple[str, int, int]) -> tuple[str, list | None]:
+        # Returns (state, frames); frames is None when the row was skipped by
+        # cancellation or every attempt failed.
+        state, _row, count = spec
+        if cancelled():
+            return state, None
+        t0 = time.monotonic()
+        last_exc: Exception | None = None
+        # Self-healing: a model occasionally returns a row whose poses are touching
+        # (no clean gutters), which slices badly. We retry such rolls; only the
+        # final attempt falls back to lenient ``auto`` slicing so a stubborn row
+        # still yields *something* rather than dropping the whole row.
+        for attempt in range(_ROW_GEN_ATTEMPTS):
+            if cancelled():
+                return state, None
+            strict = attempt < _ROW_GEN_ATTEMPTS - 1
+            try:
+                strips = imagegen.generate(
+                    prompts.build_row_prompt(state, count, label, style=style),
+                    n=1,
+                    reference_images=[base],
+                    provider=sprite,
+                    prefix=f"pet_row_{state}",
+                    # Wider canvas → each frame gets real horizontal room, so winged
+                    # poses keep a full, healthy size and still leave clean gutters.
+                    aspect_ratio="landscape",
+                )
+                # ``components`` requires clean per-pose gutters (raises otherwise),
+                # so a touching roll is rejected and regenerated; the last attempt
+                # uses ``auto`` (equal-slot fallback, never raises). Raw (fit=False)
+                # so normalize_cells registers the whole pet at once.
+                method = "components" if strict else "auto"
+                frames = atlas.extract_strip_frames(strips[0], count, method=method, fit=False)
+                logger.info(
+                    "pet hatch %r: row %r ready in %.1fs (attempt %d)",
+                    slug, state, time.monotonic() - t0, attempt + 1,
+                )
+                return state, frames
+            except Exception as exc:  # noqa: BLE001 - retried; one bad row is tolerated
+                last_exc = exc
+                logger.warning(
+                    "pet hatch %r: row %r attempt %d/%d failed: %s",
+                    slug, state, attempt + 1, _ROW_GEN_ATTEMPTS, exc,
+                )
+        logger.warning(
+            "pet hatch %r: row %r gave up after %.1fs: %s",
+            slug, state, time.monotonic() - t0, last_exc,
+        )
+        return state, None
+
+    # running-left is derived by mirroring running-right (guaranteed-consistent
+    # and one fewer generation), so we don't generate it directly.
+    generated_specs = [spec for spec in atlas.ROW_SPECS if spec[0] != "running-left"]
+
+    workers = max(1, min(len(generated_specs), _MAX_PARALLEL_GENERATIONS))
+    # Rows finished so far; counts failed rows too, since it feeds the progress
+    # event rather than the success tally.
+    done = 0
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = [pool.submit(_gen_row, spec) for spec in generated_specs]
+        # as_completed runs on the caller (request) thread, so progress events
+        # emitted here inherit the request transport — unlike the worker threads.
+        for fut in as_completed(futures):
+            if cancelled():
+                logger.info("pet hatch %r: cancelled — dropping remaining rows", slug)
+                # cancel() only stops rows that have not started; leaving the
+                # ``with`` block still waits for the in-flight ones.
+                for pending in futures:
+                    pending.cancel()
+                break
+            state, frames = fut.result()
+            done += 1
+            progress("row", f"{state}:{done}:{total_rows}")
+            if frames:
+                frames_by_state[state] = frames
+
+    # Checked after the pool has drained so nothing is composed or saved for a
+    # hatch that was stopped at any point during row generation.
+    if cancelled():
+        raise GenerationError("hatch cancelled")
+
+    # Derive running-left from the approved running-right row (per-frame mirror,
+    # preserving order/timing). Missing running-right is rejected below; a pet
+    # without its canonical walk cycle is a failed hatch, not a shippable mascot.
+    right = frames_by_state.get("running-right")
+    if right:
+        done += 1
+        progress("row", f"running-left:{done}:{total_rows}")
+        frames_by_state["running-left"] = atlas.mirror_frames(right)
+        logger.info("pet hatch %r: row 'running-left' mirrored from running-right", slug)
+    else:
+        logger.warning("pet hatch %r: no running-right to mirror; left walk left empty", slug)
+
+    # Idle is the resting state the renderer falls back to — guarantee it.
+    if not frames_by_state.get("idle"):
+        progress("row", "idle-fallback")
+        frames_by_state["idle"] = [atlas.single_frame(base, fit=False)]
+
+    progress("compose", "")
+    logger.info("pet hatch %r: composing atlas from %d states", slug, len(frames_by_state))
+    # One shared scale + baseline across every state so the pet never slides or
+    # pulses size between frames; compose just packs the normalized cells.
+    sheet = atlas.compose_atlas(atlas.normalize_cells(frames_by_state))
+    validation = atlas.validate_atlas(sheet)
+    if not validation["ok"]:
+        raise GenerationError("; ".join(validation["errors"]) or "atlas validation failed")
+    # Quality gates beyond structural validity: the required rows must exist
+    # and enough rows overall must have survived generation.
+    filled_states = set(validation["filled_states"])
+    missing_required = sorted(_REQUIRED_STATES - filled_states)
+    if missing_required:
+        raise GenerationError(f"missing required animation row(s): {', '.join(missing_required)}")
+    if len(filled_states) < _MIN_FILLED_STATES:
+        raise GenerationError(
+            f"only {len(filled_states)}/{len(atlas.ROW_SPECS)} animation rows were usable; regenerate"
+        )
+
+    # Imported here rather than at module top; the store is only needed once
+    # every check above has passed.
+    from agent.pet import store
+
+    progress("save", slug)
+    logger.info("pet hatch %r: saving pet", slug)
+    pet = store.register_local_pet(
+        sheet,
+        slug=slug,
+        display_name=display_name or slug,
+        description=description,
+    )
+    return HatchResult(
+        slug=pet.slug,
+        display_name=pet.display_name,
+        spritesheet=pet.spritesheet,
+        states=validation["filled_states"],
+        validation=validation,
+    )
--- a/agent/pet/generate/prompts.py
+++ b/agent/pet/generate/prompts.py
@@ -0,0 +1,183 @@
+"""Prompt builders for pet generation.
+
+Two prompt shapes: a *base* prompt (prompt-only, produces the canonical look the
+user picks between) and per-*state* *row* prompts (grounded on the chosen base,
+produce one horizontal strip of N poses). Prompts stay concise and
+sprite-production oriented; the identity lock and "one transparent row" framing
+matter more than flowery description.
+
+We generate the full petdex/Codex nine-state set (see
+:data:`agent.pet.generate.atlas.ROW_SPECS`) so a hatched pet is a valid
+``petdex submit`` spritesheet.
+"""
+
+from __future__ import annotations
+
+# What each petdex/Codex state should depict (kept short — these go straight into
+# the row prompt). Phrased to avoid the common sprite-gen failure modes (detached
+# effects, motion lines, shadows). Critical distinction: ``running`` is the
+# *working* state (in place), while ``running-right`` / ``running-left`` are the
+# actual directional walk/run cycles.
+# Note: the hatch flow mirrors ``running-right`` to produce ``running-left``
+# instead of generating it, so that entry is not used by that flow.
+STATE_ACTIONS: dict[str, str] = {
+    "idle": "a calm idle loop: subtle breathing, a tiny blink or gentle bob, no big gestures",
+    "running-right": (
+        "a sideways walk/run locomotion cycle moving to the RIGHT: the character "
+        "faces and travels right with clear directional steps, a smooth gait loop"
+    ),
+    "running-left": (
+        "a sideways walk/run locomotion cycle moving to the LEFT: the character "
+        "faces and travels left with clear directional steps (the mirror of the "
+        "right-facing run)"
+    ),
+    "waving": "a friendly greeting: raising a paw/hand/limb to wave, clear up-and-down gesture",
+    "jumping": "a happy celebration jump: anticipation, lift off the ground, peak, and land",
+    "failed": "a sad or deflated reaction: slumped, dejected, small frown — readable but not noisy",
+    "waiting": (
+        "an expectant 'waiting on you' pose: looking up/out as if asking for input "
+        "or approval — distinct from idle and review"
+    ),
+    "running": (
+        "focused active work, staying IN PLACE (NOT walking or foot-running): "
+        "leaning in, concentrating, busy 'thinking / processing / typing' energy"
+    ),
+    "review": "careful inspection: a focused lean, head tilt, studying something intently",
+}
+
+# Style key → sentence appended to a prompt. Every value starts with a space so
+# it can be concatenated directly after the preceding sentence. Keys are matched
+# after trimming and lower-casing (see ``style_hint``); an unknown key adds
+# nothing.
+_STYLE_HINTS: dict[str, str] = {
+    # Default to the popular petdex look: crisp 16-bit PIXEL ART, not the smooth
+    # 2D illustration (let alone 3D render) gpt-image reaches for by default.
+    "auto": (
+        " Style: crisp 16-bit PIXEL-ART game sprite — visible square pixels, a small "
+        "limited palette, clean dark outline, flat cel shading, chunky chibi "
+        "proportions, like a classic SNES/JRPG party member or a petdex.dev mascot. "
+        "Absolutely NOT 3D-rendered, NOT a smooth painted or vector illustration, "
+        "NOT photorealistic — no soft gradients, no realistic lighting, no figurine look."
+    ),
+    "pixel": " Render in clean 16-bit pixel-art style with visible square pixels and a limited palette.",
+    "plush": " Render as a soft plush toy.",
+    "clay": " Render as a claymation / soft 3D clay figure.",
+    "sticker": " Render as a glossy die-cut sticker.",
+    "flat-vector": " Render in flat vector mascot style.",
+    "3d-toy": " Render as a glossy 3D toy.",
+    "painterly": " Render in a soft painterly style.",
+}
+
+# Shared background instruction: asks for one flat chroma-key color (magenta,
+# or green when the character itself is magenta) with nothing else in frame.
+# NOTE(review): presumably appended to the base and row prompts so the
+# chroma-key pass can remove the backdrop — confirm against the prompt builders.
+_BACKGROUND = (
+    "Center the character on a SINGLE flat, uniform, high-contrast chroma-key "
+    "background — pure hot magenta #FF00FF (only if magenta appears on the "
+    "character, use pure green #00FF00 instead). The background is ONE continuous "
+    "even color that completely surrounds the character with NO gradient, "
+    "vignette, texture, pattern, scenery, shadow, ground line, frame, border, "
+    "panel, comic cell, gutter line, grid, or divider of any kind, so it keys out "
+    "cleanly. The background color must not appear anywhere on the character. "
+    "No text, no labels, no speech bubbles, no UI."
+)
+
+
+def style_hint(style: str | None) -> str:
+    """Look up the prompt sentence for *style*.
+
+    A falsy *style* means ``"auto"``. The key is trimmed and lower-cased before
+    the lookup; an unrecognized key yields an empty string.
+    """
+    key = (style or "auto").strip().lower()
+    return _STYLE_HINTS.get(key, "")
+
+
+# Row strips are generated on the wider landscape canvas (see imagegen.generate /
+# orchestrate). The extra width is what lets each pose stay a healthy size AND
+# leave a real gutter — used here only to cite concrete pixel numbers.
+_ASSUMED_STRIP_WIDTH = 1536
+
+
+def _spacing_spec(frame_count: int) -> tuple[int, int]:
+    """Return ``(pose width px, gap px)`` for a strip holding *frame_count* poses.
+
+    The strip width is divided into equal slots (at least one). Each pose is
+    allotted 70% of its slot, and the gap is 30% of the slot but never less
+    than 48px.
+
+    Why proportions rather than raw pixel counts: the model tends to fill each
+    slot edge-to-edge with the full wingspan, so neighbors touch even when the
+    bodies are spaced. Keeping the ENTIRE silhouette (wings/tail/halo included)
+    inside its own equal cell on the wide landscape strip leaves a generous
+    gutter while the pet stays a normal, good-looking size.
+    """
+    slot_width = _ASSUMED_STRIP_WIDTH / max(1, frame_count)
+    pose_width = round(slot_width * 0.7)
+    gutter = round(slot_width * 0.3)
+    return pose_width, max(48, gutter)
+
+
+# Per-draft nudges so the base options are actually distinct — gpt-image returns
+# near-duplicates for a single prompt. We vary the *look* (palette, build,
+# expression, accents), NOT the pose, so the chosen base still grounds clean,
+# consistent animation rows.
+# The first entry is empty: build_base_prompt() adds no "distinct" sentence for
+# an empty variation, so that draft is the un-nudged prompt.
+# NOTE(review): this comment used to say "the 4 base options" but the tuple
+# holds six entries — confirm how many drafts the caller actually requests.
+BASE_VARIATIONS: tuple[str, ...] = (
+    "",
+    "a distinctly different colour palette and markings",
+    "a heavier, broader silhouette with sturdier proportions",
+    "a different facial structure and expression matching the concept tone, with unique accent/accessory details",
+    "a leaner, taller build and an alternate colour scheme",
+    "bolder, more saturated colours and a stronger expression matching the concept tone",
+)
+
+
+def build_base_prompt(concept: str, *, style: str | None = "auto", variation: str = "") -> str:
+    """The base look: a single, clean, centered full-body mascot.
+
+    *variation* differentiates one draft from the next (see :data:`BASE_VARIATIONS`).
+    """
+    concept = (concept or "a distinctive mascot creature").strip()
+    nudge = f" Make this design distinct: {variation}." if variation else ""
+    return (
+        f"A stylized mascot pet character: {concept}. "
+        "Honor the requested tone and mood exactly (cute, eerie, scary, menacing, whimsical, etc.) "
+        "while staying non-graphic. "
+        "Compact, whole-body silhouette that reads clearly at small size, "
+        "clear readable facial features, simple consistent palette. "
+        # A neutral, symmetric, at-rest stance makes the cleanest identity anchor
+        "Neutral front-facing standing pose, upright and symmetric, arms/limbs "
+        "relaxed at the sides, feet together on the ground, any cape/accessories "
+        "hanging straight and still."
+        f"{nudge} "
+        f"{_BACKGROUND}{style_hint(style)}"
+    )
+
+
+def build_row_prompt(state: str, frame_count: int, concept: str, *, style: str | None = "auto") -> str:
+    """A row strip: *frame_count* poses of the SAME character, left→right.
+
+    The attached base image is the identity source of truth; the prompt locks
+    species, palette, face, and props to it.
+    """
+    action = STATE_ACTIONS.get(state, "a simple idle pose")
+    concept = (concept or "the mascot").strip()
+    pose_px, gap_px = _spacing_spec(frame_count)
+    return (
+        f"Using the attached reference image as the exact same character "
+        f"(same species, face, colors, markings, proportions, and props), "
+        "preserving the same emotional tone/mood (e.g., scary stays scary, cute stays cute), "
+        f"draw a single WIDE horizontal strip of {frame_count} animation frames showing {action}. "
+        f"LAYOUT: arrange {frame_count} poses in ONE horizontal row at equal spacing, "
+        "each pose centered in its own imaginary equal region. Draw NO panel borders, "
+        "NO comic cells, NO boxes, NO vertical divider/gutter lines, NO grid, NO frame "
+        "outlines between poses — the backdrop is one unbroken flat field behind all of them. "
+        "Fill the WHOLE strip with the SAME single flat chroma-key color as the attached "
+        "reference image's background (identical hue in every frame, no per-pose color shifts). "
+        f"SPACING (critical): draw each pose at a consistent, healthy, clearly "
+        f"visible size (roughly {pose_px}px wide on a {_ASSUMED_STRIP_WIDTH}px "
+        f"strip) — do NOT shrink it tiny — but keep its ENTIRE silhouette "
+        f"(wings, tail, halo, horns, cape, every appendage) fully INSIDE its own "
+        f"cell. Leave at least {gap_px}px of empty chroma-key background between "
+        f"neighboring silhouettes at their closest point (wingtip to wingtip), and "
+        f"the same empty margin before the first pose and after the last. If a wing, "
+        f"cape, or tail would reach into a neighbor, FOLD or angle it inward rather "
+        f"than letting it cross the gap. Silhouettes must NEVER touch, overlap, "
+        f"share a shadow, share a ground line, share motion trails, or merge into "
+        f"one connected shape. "
+        # Registration: a clean sprite sheet keeps the character locked in place
+        # so only the action moves — this is what stops the loop sliding/pulsing.
+        "REGISTRATION (critical): the character is the SAME height and SAME width "
+        "in every frame, drawn at the SAME scale, centered over the SAME point, "
+        "with all feet aligned to the SAME invisible horizontal baseline across the "
+        "whole strip — this baseline is conceptual ONLY: draw NO ground line, floor, "
+        "platform, horizon, or contact shadow beneath the feet. Keep the body's center, size, and stance fixed frame to "
+        "frame — ONLY the limbs/features the action needs may move. Capes, cloaks, "
+        "bags, and scarves stay in the SAME place and shape every frame (no "
+        "swinging, flowing, or drifting) unless the action itself requires it. No "
+        "pose is cropped at the strip edges. "
+        f"{_BACKGROUND}{style_hint(style)}"
+    )
--- a/agent/pet/manifest.py
+++ b/agent/pet/manifest.py
@@ -0,0 +1,165 @@
+"""Fetch the public petdex manifest.
+
+``https://petdex.dev/api/manifest`` 307-redirects to a JSON document on R2:
+
+    {
+      "generatedAt": "...",
+      "total": 2926,
+      "pets": [
+        {"slug": "boba", "displayName": "Boba", "kind": "creature",
+         "submittedBy": "railly",
+         "spritesheetUrl": "https://assets.petdex.dev/.../spritesheet.webp",
+         "petJsonUrl": "https://assets.petdex.dev/.../pet.json",
+         "zipUrl": "https://assets.petdex.dev/.../boba.zip"},
+        ...
+      ]
+    }
+
+Read-only and unauthenticated; no credentials involved.
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+# Public manifest endpoint requested by fetch_manifest() (redirects followed).
+MANIFEST_URL = "https://petdex.dev/api/manifest"
+
+# Default ``timeout`` value handed to the HTTP request when callers pass none.
+_DEFAULT_TIMEOUT = 10.0
+
+# In-process cache for the (large, slow, identical-per-call) manifest. The list
+# is a static CDN object that barely changes, yet a single session can ask for
+# it many times — every gallery open, plus a full re-fetch per install/select
+# (``find_entry``). A short TTL collapses those into one network hit without
+# going stale for long. Cleared by :func:`clear_cache` (tests).
+_MANIFEST_TTL = 300.0
+# ``(time.monotonic() at fetch, parsed entries)``; ``None`` until the first
+# successful fetch and again after clear_cache().
+_cache: tuple[float, list[ManifestEntry]] | None = None
+
+# ``_prefetch_lock`` guards the check-and-set of ``_prefetching`` in prefetch()
+# so at most one warm-up thread is started at a time.
+_prefetch_lock = threading.Lock()
+_prefetching = False
+
+
+def clear_cache() -> None:
+    """Drop the cached manifest (forces the next fetch to hit the network)."""
+    global _cache
+    _cache = None
+
+
+def _cache_is_warm() -> bool:
+    return _cache is not None and time.monotonic() - _cache[0] < _MANIFEST_TTL
+
+
+def prefetch(*, timeout: float = _DEFAULT_TIMEOUT) -> None:
+    """Warm the manifest cache in a daemon thread — idempotent, never blocks.
+
+    The desktop picker calls this when it loads the (instant) local-only gallery
+    so the full petdex catalog is usually cached by the time it's requested,
+    without ever holding up the user's own pets on a network round-trip.
+    """
+    global _prefetching
+
+    if _cache_is_warm():
+        return
+
+    with _prefetch_lock:
+        if _prefetching:
+            return
+        _prefetching = True
+
+    def _run() -> None:
+        global _prefetching
+        try:
+            fetch_manifest(timeout=timeout)
+        except Exception as exc:  # noqa: BLE001 - best-effort warm
+            logger.debug("petdex manifest prefetch failed: %s", exc)
+        finally:
+            _prefetching = False
+
+    threading.Thread(target=_run, name="petdex-prefetch", daemon=True).start()
+
+
+@dataclass(frozen=True)
+class ManifestEntry:
+    """A single pet's row in the manifest."""
+
+    slug: str
+    display_name: str
+    kind: str
+    submitted_by: str
+    spritesheet_url: str
+    pet_json_url: str
+    zip_url: str
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "ManifestEntry":
+        return cls(
+            slug=str(data.get("slug", "")).strip(),
+            display_name=str(data.get("displayName", "") or data.get("slug", "")),
+            kind=str(data.get("kind", "") or "pet"),
+            submitted_by=str(data.get("submittedBy", "") or ""),
+            spritesheet_url=str(data.get("spritesheetUrl", "") or ""),
+            pet_json_url=str(data.get("petJsonUrl", "") or ""),
+            zip_url=str(data.get("zipUrl", "") or ""),
+        )
+
+
+class ManifestError(RuntimeError):
+    """Raised when the manifest can't be fetched or parsed."""
+
+
+def fetch_manifest(*, timeout: float = _DEFAULT_TIMEOUT, force: bool = False) -> list[ManifestEntry]:
+    """Return every approved pet from the public manifest.
+
+    Cached in-process for ``_MANIFEST_TTL`` seconds (pass ``force=True`` to
+    bypass). Follows the 307 redirect to R2.  Raises :class:`ManifestError` on
+    any network/parse failure so callers can surface a clean message.
+    """
+    global _cache
+
+    if not force and _cache is not None and time.monotonic() - _cache[0] < _MANIFEST_TTL:
+        return _cache[1]
+
+    try:
+        import httpx
+    except ImportError as exc:  # pragma: no cover - httpx is a core dep
+        raise ManifestError("httpx is required to fetch the petdex manifest") from exc
+
+    try:
+        resp = httpx.get(
+            MANIFEST_URL,
+            timeout=timeout,
+            follow_redirects=True,
+            headers={"User-Agent": "hermes-agent-petdex"},
+        )
+        resp.raise_for_status()
+        payload = resp.json()
+    except Exception as exc:  # noqa: BLE001 - normalize to one error type
+        raise ManifestError(f"could not fetch petdex manifest: {exc}") from exc
+
+    pets = payload.get("pets") if isinstance(payload, dict) else None
+    if not isinstance(pets, list):
+        raise ManifestError("petdex manifest had no 'pets' array")
+
+    entries: list[ManifestEntry] = []
+    for raw in pets:
+        if not isinstance(raw, dict):
+            continue
+        entry = ManifestEntry.from_dict(raw)
+        if entry.slug and entry.spritesheet_url:
+            entries.append(entry)
+
+    _cache = (time.monotonic(), entries)
+    return entries
+
+
+def find_entry(slug: str, *, timeout: float = _DEFAULT_TIMEOUT) -> ManifestEntry | None:
+    """Return the manifest entry for *slug*, or ``None`` if not listed."""
+    slug = slug.strip().lower()
+    for entry in fetch_manifest(timeout=timeout):
+        if entry.slug.lower() == slug:
+            return entry
+    return None
--- a/agent/pet/render.py
+++ b/agent/pet/render.py
@@ -0,0 +1,618 @@
+"""Decode a pet spritesheet and encode frames for a terminal.
+
+Shared by the base CLI (writes the escape bytes to its own stdout) and the
+TUI (``tui_gateway`` ships the encoded bytes to Ink, which writes them) so the
+decode + capability-detection + protocol-encoding logic exists exactly once.
+
+Supported output modes, in fidelity order:
+
+- ``kitty``   — the kitty graphics protocol (kitty, Ghostty, WezTerm).
+- ``iterm``   — iTerm2 inline images (iTerm2, WezTerm).
+- ``sixel``   — DEC sixel (xterm -ti vt340, foot, mlterm, WezTerm, …).
+- ``unicode`` — 24-bit half-block downscale; works in any truecolor terminal.
+
+Frame decoding requires Pillow (a core Hermes dependency).  If Pillow or the
+spritesheet is unavailable the renderer degrades to ``unicode`` text or an
+empty string rather than raising.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import logging
+import os
+import sys
+from functools import lru_cache
+from pathlib import Path
+
+from agent.pet.constants import (
+    DEFAULT_SCALE,
+    FRAME_H,
+    FRAME_W,
+    FRAMES_PER_STATE,
+    PetState,
+    state_row_index,
+)
+
+logger = logging.getLogger(__name__)
+
+# Public render-mode names accepted by ``display.pet.render_mode``.
+RENDER_MODES = ("auto", "kitty", "iterm", "sixel", "unicode", "off")
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# Terminal capability detection
+# ─────────────────────────────────────────────────────────────────────────
+
+def detect_terminal_graphics() -> str:
+    """Best-effort detection of the richest graphics protocol available.
+
+    Env-based (non-blocking — we never issue a DA1/terminal query that could
+    hang a pipe).  Returns one of ``kitty`` / ``iterm`` / ``sixel`` /
+    ``unicode``.  Conservative: unknown terminals get ``unicode``, which works
+    anywhere with truecolor.
+    """
+    term = os.environ.get("TERM", "").lower()
+    term_program = os.environ.get("TERM_PROGRAM", "").lower()
+
+    # The VS Code / Cursor integrated terminal sets TERM_PROGRAM=vscode
+    # authoritatively but does NOT scrub the terminal env vars it inherits when
+    # launched from another emulator (ITERM_SESSION_ID, KITTY_WINDOW_ID, …).
+    # Trusting those leaks emits an image protocol the embedded xterm.js can't
+    # display — you get a blank frame. Inline images there are opt-in
+    # (terminal.integrated.enableImages), so default to half-blocks, which
+    # always render in its truecolor grid. Users who enabled images can pin
+    # display.pet.render_mode explicitly.
+    if term_program == "vscode":
+        return "unicode"
+
+    # kitty graphics protocol
+    if os.environ.get("KITTY_WINDOW_ID") or "kitty" in term or "ghostty" in term:
+        return "kitty"
+    if term_program in {"ghostty"}:
+        return "kitty"
+
+    # WezTerm speaks both kitty and iterm; prefer kitty (richer placement).
+    if term_program == "wezterm" or os.environ.get("WEZTERM_PANE"):
+        return "kitty"
+
+    # iTerm2 inline images
+    if term_program == "iterm.app" or os.environ.get("ITERM_SESSION_ID"):
+        return "iterm"
+
+    # sixel-capable terminals (env heuristics only)
+    if term_program in {"mintty"} or "foot" in term or "mlterm" in term:
+        return "sixel"
+    if "sixel" in term:
+        return "sixel"
+
+    return "unicode"
+
+
+def resolve_mode(configured: str | None, *, stream=None) -> str:
+    """Resolve the effective render mode from config + the environment.
+
+    ``configured`` is ``display.pet.render_mode`` (``auto`` → detect).  Returns
+    ``off`` when not attached to a TTY (no point emitting graphics into a pipe
+    or logfile).
+    """
+    mode = (configured or "auto").strip().lower()
+    if mode not in RENDER_MODES:
+        mode = "auto"
+    if mode == "off":
+        return "off"
+
+    stream = stream or sys.stdout
+    try:
+        if not (hasattr(stream, "isatty") and stream.isatty()):
+            return "off"
+    except (ValueError, OSError):
+        return "off"
+
+    if mode == "auto":
+        return detect_terminal_graphics()
+    return mode
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# Frame decoding
+# ─────────────────────────────────────────────────────────────────────────
+
+def _open_sheet(path: Path):
+    from PIL import Image
+
+    img = Image.open(path)
+    return img.convert("RGBA")
+
+
+# Max alpha at/below which a frame counts as blank padding.  petdex sheets are
+# left-packed: a state with fewer real frames than ``FRAMES_PER_STATE`` fills
+# the trailing columns with fully transparent cells.  Animating into one flashes
+# the pet blank, so we stop the row at the first such gap.
+_BLANK_ALPHA = 8
+
+
+def _frame_is_blank(frame) -> bool:
+    """True if *frame* has no meaningfully opaque pixel (transparent padding)."""
+    return frame.getchannel("A").getextrema()[1] <= _BLANK_ALPHA
+
+
+@lru_cache(maxsize=16)
+def _raw_frames(
+    sheet_path: str,
+    state_value: str,
+    frame_w: int,
+    frame_h: int,
+    frames_per_state: int,
+) -> tuple:
+    """Cropped, padding-trimmed RGBA frames for one state row (unscaled).
+
+    Steps across the row until the first blank column so pets with ragged
+    per-state frame counts never animate into empty padding.  Cached; returns
+    ``()`` on any decode failure.
+    """
+    try:
+        sheet = _open_sheet(Path(sheet_path))
+        cols = max(1, sheet.width // frame_w)
+        rows = max(1, sheet.height // frame_h)
+        row = state_row_index(state_value, rows)
+        top = row * frame_h
+        # Clamp the row to the sheet (some pets ship fewer rows than the 8 the
+        # taxonomy reserves).
+        if top + frame_h > sheet.height:
+            top = max(0, sheet.height - frame_h)
+
+        frames = []
+        for i in range(min(frames_per_state, cols)):
+            left = i * frame_w
+            frame = sheet.crop((left, top, left + frame_w, top + frame_h))
+            if _frame_is_blank(frame):
+                break  # trailing transparent padding — real frames end here
+            frames.append(frame)
+        return tuple(frames)
+    except Exception as exc:  # noqa: BLE001 - cosmetic feature, never fatal
+        logger.debug("pet frame decode failed (%s, %s): %s", sheet_path, state_value, exc)
+        return ()
+
+
+@lru_cache(maxsize=8)
+def _frames_for(
+    sheet_path: str,
+    state_value: str,
+    frame_w: int,
+    frame_h: int,
+    frames_per_state: int,
+    scale_w: int,
+    scale_h: int,
+):
+    """Return padding-trimmed RGBA frames for one state row, scaled.
+
+    Thin scaling layer over :func:`_raw_frames`; both are cached so repeated
+    frame requests during animation are free.
+    """
+    raw = _raw_frames(sheet_path, state_value, frame_w, frame_h, frames_per_state)
+    if not raw or (scale_w, scale_h) == (frame_w, frame_h):
+        return list(raw)
+    from PIL import Image
+
+    return [f.resize((scale_w, scale_h), Image.LANCZOS) for f in raw]
+
+
+def state_frame_counts(
+    sheet_path: str | Path,
+    *,
+    frame_w: int = FRAME_W,
+    frame_h: int = FRAME_H,
+    frames_per_state: int = FRAMES_PER_STATE,
+) -> dict[str, int]:
+    """Map each driven :class:`PetState` → its real (padding-trimmed) frame count.
+
+    The single source of truth for "how many frames does this state actually
+    have?".  The CLI/TUI consume the trimmed frame lists directly; the gateway
+    ships this map to the desktop canvas, which steps its own loop.
+    """
+    return {
+        state.value: len(
+            _raw_frames(str(sheet_path), state.value, frame_w, frame_h, frames_per_state)
+        )
+        for state in PetState
+    }
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# Encoders
+# ─────────────────────────────────────────────────────────────────────────
+
+def _png_bytes(frame) -> bytes:
+    buf = io.BytesIO()
+    frame.save(buf, format="PNG")
+    return buf.getvalue()
+
+
+def _kitty_apc(ctrl: str, data: str) -> str:
+    """Emit a kitty APC escape for *data*, chunked into ≤4096-byte ``m`` pieces."""
+    chunk = 4096
+    if len(data) <= chunk:
+        return f"\x1b_G{ctrl},m=0;{data}\x1b\\"
+    out = [f"\x1b_G{ctrl},m=1;{data[:chunk]}\x1b\\"]
+    rest = data[chunk:]
+    while rest:
+        piece, rest = rest[:chunk], rest[chunk:]
+        out.append(f"\x1b_Gm={1 if rest else 0};{piece}\x1b\\")
+    return "".join(out)
+
+
+def _encode_kitty(frame, *, cell_cols: int | None = None, cell_rows: int | None = None) -> str:
+    """Encode one frame via the kitty graphics protocol (transmit + display).
+
+    ``a=T`` transmits & displays at the cursor; ``c``/``r`` request a display
+    box in terminal cells so successive frames overwrite the same area.
+    """
+    ctrl = "f=100,a=T,q=2"
+    if cell_cols:
+        ctrl += f",c={cell_cols}"
+    if cell_rows:
+        ctrl += f",r={cell_rows}"
+    return _kitty_apc(ctrl, base64.standard_b64encode(_png_bytes(frame)).decode("ascii"))
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# kitty Unicode placeholders
+#
+# Ink (the TUI's React-for-terminal layer) owns the screen and measures every
+# cell's width, so it can't host raw kitty image escapes (no width to count,
+# clobbered on the next repaint). kitty's *Unicode placeholder* protocol is the
+# grid-safe path: transmit the image once (q=2, virtual placement U=1), then the
+# host app prints ordinary-width placeholder cells (U+10EEEE + diacritics) whose
+# foreground color encodes the image id. Ink counts those as width-1 text, so
+# layout stays correct and the terminal paints the image underneath.
+#   https://sw.kovidgoyal.net/kitty/graphics-protocol/#unicode-placeholders
+# ─────────────────────────────────────────────────────────────────────────
+
+_KITTY_PLACEHOLDER = "\U0010eeee"
+
+# Row/column diacritics, in order (index → diacritic). Verbatim from kitty's
+# gen/rowcolumn-diacritics.txt (Unicode 6.0.0, combining class 230). Index i is
+# the diacritic that encodes the number i; we only ever need the row index.
+_ROWCOL_DIACRITICS: tuple[int, ...] = (
+    0x0305, 0x030D, 0x030E, 0x0310, 0x0312, 0x033D, 0x033E, 0x033F, 0x0346, 0x034A,
+    0x034B, 0x034C, 0x0350, 0x0351, 0x0352, 0x0357, 0x035B, 0x0363, 0x0364, 0x0365,
+    0x0366, 0x0367, 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F,
+    0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0592, 0x0593, 0x0594, 0x0595, 0x0597,
+    0x0598, 0x0599, 0x059C, 0x059D, 0x059E, 0x059F, 0x05A0, 0x05A1, 0x05A8, 0x05A9,
+    0x05AB, 0x05AC, 0x05AF, 0x05C4, 0x0610, 0x0611, 0x0612, 0x0613, 0x0614, 0x0615,
+    0x0616, 0x0617, 0x0657, 0x0658, 0x0659, 0x065A, 0x065B, 0x065D, 0x065E, 0x06D6,
+    0x06D7, 0x06D8, 0x06D9, 0x06DA, 0x06DB, 0x06DC, 0x06DF, 0x06E0, 0x06E1, 0x06E2,
+    0x06E4, 0x06E7, 0x06E8, 0x06EB, 0x06EC, 0x0730, 0x0732, 0x0733, 0x0735, 0x0736,
+    0x073A, 0x073D, 0x073F, 0x0740, 0x0741, 0x0743, 0x0745, 0x0747, 0x0749, 0x074A,
+    0x07EB, 0x07EC, 0x07ED, 0x07EE, 0x07EF, 0x07F0, 0x07F1, 0x07F3, 0x0816, 0x0817,
+    0x0818, 0x0819, 0x081B, 0x081C, 0x081D, 0x081E, 0x081F, 0x0820, 0x0821, 0x0822,
+    0x0823, 0x0825, 0x0826, 0x0827, 0x0829, 0x082A, 0x082B, 0x082C, 0x082D, 0x0951,
+    0x0953, 0x0954, 0x0F82, 0x0F83, 0x0F86, 0x0F87, 0x135D, 0x135E, 0x135F, 0x17DD,
+    0x193A, 0x1A17, 0x1A75, 0x1A76, 0x1A77, 0x1A78, 0x1A79, 0x1A7A, 0x1A7B, 0x1A7C,
+    0x1B6B, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73, 0x1CD0, 0x1CD1,
+    0x1CD2, 0x1CDA, 0x1CDB, 0x1CE0, 0x1DC0, 0x1DC1, 0x1DC3, 0x1DC4, 0x1DC5, 0x1DC6,
+    0x1DC7, 0x1DC8, 0x1DC9, 0x1DCB, 0x1DCC, 0x1DD1, 0x1DD2, 0x1DD3, 0x1DD4, 0x1DD5,
+    0x1DD6, 0x1DD7, 0x1DD8, 0x1DD9, 0x1DDA, 0x1DDB, 0x1DDC, 0x1DDD, 0x1DDE, 0x1DDF,
+    0x1DE0, 0x1DE1, 0x1DE2, 0x1DE3, 0x1DE4, 0x1DE5, 0x1DE6, 0x1DFE, 0x20D0, 0x20D1,
+    0x20D4, 0x20D5, 0x20D6, 0x20D7, 0x20DB, 0x20DC, 0x20E1, 0x20E7, 0x20E9, 0x20F0,
+    0x2CEF, 0x2CF0, 0x2CF1, 0x2DE0, 0x2DE1, 0x2DE2, 0x2DE3, 0x2DE4, 0x2DE5, 0x2DE6,
+    0x2DE7, 0x2DE8, 0x2DE9, 0x2DEA, 0x2DEB, 0x2DEC, 0x2DED, 0x2DEE, 0x2DEF, 0x2DF0,
+    0x2DF1, 0x2DF2, 0x2DF3, 0x2DF4, 0x2DF5, 0x2DF6, 0x2DF7, 0x2DF8, 0x2DF9, 0x2DFA,
+    0x2DFB, 0x2DFC, 0x2DFD, 0x2DFE, 0x2DFF, 0xA66F, 0xA67C, 0xA67D, 0xA6F0, 0xA6F1,
+    0xA8E0, 0xA8E1, 0xA8E2, 0xA8E3, 0xA8E4, 0xA8E5, 0xA8E6, 0xA8E7, 0xA8E8, 0xA8E9,
+    0xA8EA, 0xA8EB, 0xA8EC, 0xA8ED, 0xA8EE, 0xA8EF, 0xA8F0, 0xA8F1, 0xAAB0, 0xAAB2,
+    0xAAB3, 0xAAB7, 0xAAB8, 0xAABE, 0xAABF, 0xAAC1, 0xFE20, 0xFE21, 0xFE22, 0xFE23,
+    0xFE24, 0xFE25, 0xFE26, 0x10A0F, 0x10A38, 0x1D185, 0x1D186, 0x1D187, 0x1D188,
+    0x1D189, 0x1D1AA, 0x1D1AB, 0x1D1AC, 0x1D1AD, 0x1D242, 0x1D243, 0x1D244,
+)
+
+
+def kitty_image_id(slug: str) -> int:
+    """Stable per-pet image id in ``[1, 0x7FFF]``.
+
+    The id is encoded in the placeholder's 24-bit foreground color, so it must
+    be non-zero and fit comfortably under ``0xFFFFFF``. A small CRC keeps it
+    deterministic per slug (so re-renders reuse the same terminal-side image)
+    while making collisions between two different pets unlikely.
+    """
+    import zlib
+
+    return (zlib.crc32(slug.encode("utf-8")) % 0x7FFE) + 1
+
+
+def kitty_color_hex(image_id: int) -> str:
+    """Hex foreground color (``#rrggbb``) that encodes *image_id* for kitty."""
+    return "#%06x" % (image_id & 0xFFFFFF)
+
+
+def kitty_placeholder_rows(cols: int, rows: int) -> list[str]:
+    """Build the placeholder text grid for an *rows*×*cols* image.
+
+    Each line is one row of the grid: the first cell carries the row diacritic
+    (column defaults to 0), and the remaining ``cols-1`` bare placeholders let
+    the terminal auto-increment the column. The foreground color (the image id)
+    is applied by the caller / Ink, not embedded here.
+    """
+    cols = max(1, cols)
+    out: list[str] = []
+    for r in range(max(1, rows)):
+        idx = min(r, len(_ROWCOL_DIACRITICS) - 1)
+        first = _KITTY_PLACEHOLDER + chr(_ROWCOL_DIACRITICS[idx])
+        out.append(first + _KITTY_PLACEHOLDER * (cols - 1))
+    return out
+
+
+def _encode_kitty_virtual(frame, *, image_id: int, cols: int, rows: int) -> str:
+    """Transmit a frame as a kitty *virtual* placement for Unicode placeholders.
+
+    ``a=T`` transmits and creates the placement in one shot; ``U=1`` marks it
+    virtual (no on-screen output, cursor untouched); ``q=2`` suppresses the
+    terminal's OK/error replies that would otherwise corrupt the host app's
+    output. Re-sending with the same ``i`` replaces the image, so the static
+    placeholder cells animate underneath.
+    """
+    ctrl = f"a=T,U=1,i={image_id},c={cols},r={rows},f=100,q=2"
+    return _kitty_apc(ctrl, base64.standard_b64encode(_png_bytes(frame)).decode("ascii"))
+
+
+def _encode_iterm(frame, *, cell_cols: int | None = None, cell_rows: int | None = None) -> str:
+    """Encode one frame as an iTerm2 inline image (OSC 1337 File)."""
+    payload = base64.standard_b64encode(_png_bytes(frame)).decode("ascii")
+    size = len(payload)
+    args = [f"inline=1", f"size={size}", "preserveAspectRatio=1"]
+    if cell_cols:
+        args.append(f"width={cell_cols}")
+    if cell_rows:
+        args.append(f"height={cell_rows}")
+    return f"\x1b]1337;File={';'.join(args)}:{payload}\x07"
+
+
+def _encode_sixel(frame) -> str:
+    """Encode one frame as DEC sixel.
+
+    Quantizes to an adaptive palette (≤255 colors) and emits the sixel band
+    stream.  Pillow has no sixel writer, so this is a compact hand-rolled
+    encoder.  Transparent pixels render as background (color register skipped).
+    """
+    from PIL import Image
+
+    rgba = frame
+    # Composite onto transparent-as-skip: track alpha to decide background.
+    pal = rgba.convert("RGB").quantize(colors=255, method=Image.MEDIANCUT)
+    palette = pal.getpalette() or []
+    px = pal.load()
+    alpha = rgba.getchannel("A").load()
+    w, h = pal.size
+
+    out = ["\x1bP0;1;0q", '"1;1;%d;%d' % (w, h)]
+    # Color register definitions (sixel uses 0..100 scale).
+    used = sorted({px[x, y] for y in range(h) for x in range(w)})
+    for idx in used:
+        r = palette[idx * 3] if idx * 3 < len(palette) else 0
+        g = palette[idx * 3 + 1] if idx * 3 + 1 < len(palette) else 0
+        b = palette[idx * 3 + 2] if idx * 3 + 2 < len(palette) else 0
+        out.append("#%d;2;%d;%d;%d" % (idx, r * 100 // 255, g * 100 // 255, b * 100 // 255))
+
+    # Emit in 6-row bands.
+    for band in range(0, h, 6):
+        for color_idx in used:
+            line = ["#%d" % color_idx]
+            run_char = None
+            run_len = 0
+
+            def flush():
+                nonlocal run_char, run_len
+                if run_char is None:
+                    return
+                if run_len > 3:
+                    line.append("!%d%s" % (run_len, run_char))
+                else:
+                    line.append(run_char * run_len)
+                run_char, run_len = None, 0
+
+            for x in range(w):
+                bits = 0
+                for bit in range(6):
+                    y = band + bit
+                    if y < h and alpha[x, y] > 32 and px[x, y] == color_idx:
+                        bits |= 1 << bit
+                ch = chr(63 + bits)
+                if ch == run_char:
+                    run_len += 1
+                else:
+                    flush()
+                    run_char, run_len = ch, 1
+            flush()
+            out.append("".join(line) + "$")  # carriage return within band
+        out.append("-")  # next band
+    out.append("\x1b\\")
+    return "".join(out)
+
+
+_HALF_BLOCK = "▀"
+
+# A single half-block cell: top pixel + bottom pixel as (r, g, b, a) tuples.
+Cell = tuple[tuple[int, int, int, int], tuple[int, int, int, int]]
+
+
+def _downscale_cells(frame, *, target_cols: int) -> list[list[Cell]]:
+    """Downscale a frame to a grid of half-block cells.
+
+    Each cell pairs a top and bottom pixel so one terminal row encodes two
+    pixel rows.  Returns rows of ``((tr,tg,tb,ta),(br,bg,bb,ba))`` — the
+    framework-neutral representation shared by the ANSI encoder (CLI) and the
+    structured ``cells`` API (Ink).
+    """
+    from PIL import Image
+
+    target_cols = max(4, target_cols)
+    aspect = frame.height / max(1, frame.width)
+    target_rows = max(2, int(round(target_cols * aspect * 0.5)) * 2)
+    small = frame.resize((target_cols, target_rows), Image.LANCZOS).convert("RGBA")
+    px = small.load()
+
+    grid: list[list[Cell]] = []
+    for y in range(0, target_rows, 2):
+        row: list[Cell] = []
+        for x in range(target_cols):
+            top = px[x, y]
+            bottom = px[x, y + 1] if y + 1 < target_rows else (0, 0, 0, 0)
+            row.append((top, bottom))
+        grid.append(row)
+    return grid
+
+
+def _encode_unicode(frame, *, target_cols: int) -> str:
+    """Downscale to truecolor ANSI half-blocks (one char = 2 vertical pixels)."""
+    lines: list[str] = []
+    for row in _downscale_cells(frame, target_cols=target_cols):
+        cells: list[str] = []
+        for (tr, tg, tb, ta), (br, bg, bb, ba) in row:
+            if ta < 32 and ba < 32:
+                cells.append("\x1b[0m ")  # fully transparent → blank
+                continue
+            cells.append(f"\x1b[38;2;{tr};{tg};{tb}m\x1b[48;2;{br};{bg};{bb}m{_HALF_BLOCK}")
+        lines.append("".join(cells) + "\x1b[0m")
+    return "\n".join(lines)
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# Public renderer
+# ─────────────────────────────────────────────────────────────────────────
+
+class PetRenderer:
+    """Holds a pet's spritesheet and yields encoded frames per (state, index).
+
+    Construct once per pet, then call :meth:`frame` on an animation timer.
+    Cheap to call repeatedly — decoded frames are cached.
+    """
+
+    def __init__(
+        self,
+        spritesheet: str | Path,
+        *,
+        mode: str = "unicode",
+        scale: float = DEFAULT_SCALE,
+        unicode_cols: int = 20,
+        frame_w: int = FRAME_W,
+        frame_h: int = FRAME_H,
+        frames_per_state: int = FRAMES_PER_STATE,
+    ) -> None:
+        self.spritesheet = str(spritesheet)
+        self.mode = mode if mode in RENDER_MODES else "unicode"
+        self.scale = scale
+        self.unicode_cols = unicode_cols
+        self.frame_w = frame_w
+        self.frame_h = frame_h
+        self.frames_per_state = frames_per_state
+
+    @property
+    def available(self) -> bool:
+        return self.mode != "off" and Path(self.spritesheet).is_file()
+
+    def frame_count(self, state: PetState | str) -> int:
+        return len(self._frames(state))
+
+    def _frames(self, state: PetState | str):
+        value = state.value if isinstance(state, PetState) else str(state)
+        scale_w = max(1, int(self.frame_w * self.scale))
+        scale_h = max(1, int(self.frame_h * self.scale))
+        return _frames_for(
+            self.spritesheet,
+            value,
+            self.frame_w,
+            self.frame_h,
+            self.frames_per_state,
+            scale_w,
+            scale_h,
+        )
+
+    def cells(self, state: PetState | str, index: int, *, cols: int | None = None) -> list[list[Cell]]:
+        """Return one frame as a half-block cell grid (framework-neutral).
+
+        Used by the TUI, which renders the grid with native Ink color props
+        instead of raw ANSI.  Returns ``[]`` when no frame is available.
+        """
+        frames = self._frames(state)
+        if not frames:
+            return []
+        frame = frames[index % len(frames)]
+        return _downscale_cells(frame, target_cols=cols or self.unicode_cols)
+
+    def _cell_box(self, frame) -> tuple[int, int]:
+        """Terminal cell box for a scaled frame (~8×16 px per cell).
+
+        Must match :meth:`frame` graphics sizing — kitty stretches the image to
+        fill ``c``×``r`` cells, so these must reflect the scaled pixel
+        dimensions, not a native-aspect column count (that upscales small pets).
+        """
+        return max(1, frame.width // 8), max(1, frame.height // 16)
+
+    def kitty_payload(self, state: PetState | str, *, image_id: int) -> dict | None:
+        """Build the kitty Unicode-placeholder payload for one state.
+
+        Returns ``{cols, rows, placeholder, frames}`` where ``frames`` is a
+        list of transmit escapes (one per animation frame, all reusing
+        ``image_id``) and ``placeholder`` is the static text grid Ink paints.
+        Placement geometry is derived from the scaled frame pixels (via
+        :meth:`_cell_box`), not ``unicode_cols`` — kitty upscales to fill
+        ``c``×``r`` cells. ``None`` when no frame is available.
+        """
+        frames = self._frames(state)
+        if not frames:
+            return None
+        cols, rows = self._cell_box(frames[0])
+        return {
+            "cols": cols,
+            "rows": rows,
+            "placeholder": kitty_placeholder_rows(cols, rows),
+            "frames": [
+                _encode_kitty_virtual(f, image_id=image_id, cols=cols, rows=rows) for f in frames
+            ],
+        }
+
+    def frame(self, state: PetState | str, index: int) -> str:
+        """Return the encoded escape string for one frame, or ``""``.
+
+        ``index`` is taken modulo the available frame count so callers can pass
+        a free-running counter.
+        """
+        if self.mode == "off":
+            return ""
+        frames = self._frames(state)
+        if not frames:
+            return ""
+        frame = frames[index % len(frames)]
+        cell_cols, cell_rows = self._cell_box(frame)
+
+        try:
+            if self.mode == "kitty":
+                return _encode_kitty(frame, cell_cols=cell_cols, cell_rows=cell_rows)
+            if self.mode == "iterm":
+                return _encode_iterm(frame, cell_cols=cell_cols, cell_rows=cell_rows)
+            if self.mode == "sixel":
+                return _encode_sixel(frame)
+            return _encode_unicode(frame, target_cols=self.unicode_cols)
+        except Exception as exc:  # noqa: BLE001 - degrade silently
+            logger.debug("pet frame encode failed (mode=%s): %s", self.mode, exc)
+            return ""
+
+
+def build_renderer(
+    spritesheet: str | Path,
+    *,
+    configured_mode: str | None = None,
+    scale: float = DEFAULT_SCALE,
+    unicode_cols: int = 20,
+    stream=None,
+) -> PetRenderer:
+    """Convenience factory: resolve the mode from config+env, then construct."""
+    mode = resolve_mode(configured_mode, stream=stream)
+    return PetRenderer(
+        spritesheet,
+        mode=mode,
+        scale=scale,
+        unicode_cols=unicode_cols,
+    )
--- a/agent/pet/state.py
+++ b/agent/pet/state.py
@@ -0,0 +1,81 @@
+"""Map agent activity → a :class:`PetState`.
+
+This is the one place the "what is the agent doing right now?" → "which
+animation row?" decision lives.  Each surface feeds it the signals it already
+tracks:
+
+- CLI    — ``KawaiiSpinner`` waiting/thinking state + tool outcomes.
+- TUI    — gateway ``tool.start/complete`` + ``message.delta/complete`` events.
+- Desktop — the ``$busy``/``$awaitingResponse``/tool-event nanostores
+            (re-implemented in TS, but mirroring this priority order).
+
+Keeping the priority order here (and documenting it) lets the TypeScript
+mirror stay faithful without a second design.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any
+
+from agent.pet.constants import PetState
+
+
+def todos_all_done(todos: Iterable[Any] | None) -> bool:
+    """Report whether a non-empty todo list is entirely finished.
+
+    A todo counts as finished when its status is ``"completed"`` or
+    ``"cancelled"``; the status is read from the ``"status"`` key of a dict or
+    the ``status`` attribute of any other object. No todos at all → ``False``.
+    """
+    pending = list(todos or [])
+    finished = ("completed", "cancelled")
+
+    for todo in pending:
+        status = todo.get("status") if isinstance(todo, dict) else getattr(todo, "status", None)
+        if status not in finished:
+            return False
+    return bool(pending)
+
+
+def derive_pet_state(
+    *,
+    busy: bool = False,
+    awaiting_input: bool = False,
+    error: bool = False,
+    celebrate: bool = False,
+    just_completed: bool = False,
+    tool_running: bool = False,
+    reasoning: bool = False,
+) -> PetState:
+    """Pick the single animation row to show for the current activity flags.
+
+    Only one row can play at a time, so the flags are ranked and the first
+    truthy one (highest priority first) decides:
+
+    1. ``error``          → ``FAILED``
+    2. ``celebrate``      → ``JUMP``
+    3. ``just_completed`` → ``WAVE``
+    4. ``awaiting_input`` → ``WAITING`` (ranked above the in-flight flags below:
+       the turn is paused on the user even if a tool is technically mid-call)
+    5. ``tool_running``   → ``RUN``
+    6. ``reasoning``      → ``REVIEW``
+    7. ``busy``           → ``RUN``
+
+    With no flag set the result is ``IDLE``.
+    """
+    # Highest priority first. The order of this table IS the contract that the
+    # other surfaces mirror, so do not reorder it casually.
+    ranked = (
+        (error, PetState.FAILED),
+        (celebrate, PetState.JUMP),
+        (just_completed, PetState.WAVE),
+        (awaiting_input, PetState.WAITING),
+        (tool_running, PetState.RUN),
+        (reasoning, PetState.REVIEW),
+        (busy, PetState.RUN),
+    )
+    for flag, state in ranked:
+        if flag:
+            return state
+    return PetState.IDLE
--- a/agent/pet/store.py
+++ b/agent/pet/store.py
@@ -0,0 +1,503 @@
+"""On-disk pet store — install / list / resolve pets.
+
+Pets live under ``get_hermes_home()/pets/<slug>/`` so every profile gets its
+own set (we deliberately do **not** reuse petdex's ``~/.codex/pets`` default —
+that's owned by the petdex npm CLI and isn't profile-aware).  Each installed
+pet directory holds:
+
+    pets/<slug>/
+        pet.json            # {id, displayName, description, spritesheetPath}
+        spritesheet.webp    # (or .png)
+
+The active pet is resolved from the caller-supplied ``display.pet.slug`` config
+value (falling back to the first installed pet), so this module stays free of
+the config loader.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+
+from hermes_constants import get_hermes_home
+
+logger = logging.getLogger(__name__)
+
+_DOWNLOAD_TIMEOUT = 60.0
+
+
+class PetStoreError(RuntimeError):
+    """Raised by the pet store when an install, register, export, or download fails."""
+
+
+@dataclass(frozen=True)
+class InstalledPet:
+    """Immutable record of one pet directory in the store (built by ``load_pet``)."""
+
+    slug: str  # directory name under the pets dir
+    display_name: str  # pet.json ``displayName``, else the slug
+    description: str  # pet.json ``description``, else ""
+    directory: Path  # the pet's own directory
+    spritesheet: Path  # resolved spritesheet path; the file may not exist
+    created_by: str = ""  # "generator" for pets hatched locally; "" for petdex installs
+
+    @property
+    def exists(self) -> bool:
+        return self.spritesheet.is_file()  # True only when the spritesheet file is present
+
+    @property
+    def generated(self) -> bool:
+        return self.created_by == "generator"  # marker written by register_local_pet
+
+
+def pets_dir() -> Path:
+    """Ensure ``<hermes home>/pets`` exists (parents included) and return it."""
+    root = get_hermes_home() / "pets"
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+def _read_pet_json(directory: Path) -> dict:
+    """Return ``pet.json`` from *directory* as a dict; ``{}`` if absent, unreadable, or not an object."""
+    pet_json = directory / "pet.json"
+    try:
+        meta = json.loads(pet_json.read_text(encoding="utf-8")) if pet_json.is_file() else {}
+    except (OSError, ValueError) as exc:
+        logger.debug("unreadable pet.json in %s: %s", directory, exc)
+        meta = {}
+    return meta if isinstance(meta, dict) else {}  # a bare list/str would break ``meta.get`` callers
+
+
+def _resolve_spritesheet(directory: Path, meta: dict) -> Path:
+    """Locate the spritesheet file belonging to a pet directory.
+
+    The ``spritesheetPath`` named in *meta* is tried first, followed by the
+    conventional names ``spritesheet.{webp,png}`` and ``sprite.{webp,png}``.
+    If none exists, ``directory / "spritesheet.webp"`` is returned regardless.
+    """
+    declared = str(meta.get("spritesheetPath", "") or "").strip()
+    conventional = ("spritesheet.webp", "spritesheet.png", "sprite.webp", "sprite.png")
+    # The declared name (when there is one) outranks every conventional name.
+    probe_order = ([declared] if declared else []) + list(conventional)
+    for name in probe_order:
+        found = directory / name
+        if found.is_file():
+            return found
+    # Nothing on disk: still hand back a stable, predictable path.
+    return directory / conventional[0]
+
+
+def _safe_slug(slug: str) -> str:
+    """Reduce *slug* to one bare path component, or ``""`` if none remains.
+
+    Slugs are joined onto ``pets_dir()`` for load/remove, so anything with
+    separators (``../``, absolute paths) must not be able to climb out of the
+    pets directory. Only the final component of the stripped value is kept,
+    and ``.``/``..`` are rejected, leaving callers able to name nothing but a
+    direct child of the pets directory.
+    """
+    leaf = Path(str(slug).strip()).name
+    # An empty or dot-only component is not a usable directory name.
+    return "" if leaf in ("", ".", "..") else leaf
+
+
+def load_pet(slug: str) -> InstalledPet | None:
+    """Build the :class:`InstalledPet` for *slug* from its directory and
+    ``pet.json``; ``None`` if the slug is invalid or the directory is missing.
+    """
+    safe = _safe_slug(slug)
+    pet_dir = pets_dir() / safe if safe else None
+    if pet_dir is None or not pet_dir.is_dir():
+        return None
+    meta = _read_pet_json(pet_dir)
+    return InstalledPet(
+        slug=safe,
+        display_name=str(meta.get("displayName", "") or safe),
+        description=str(meta.get("description", "") or ""),
+        directory=pet_dir,
+        spritesheet=_resolve_spritesheet(pet_dir, meta),
+        created_by=str(meta.get("createdBy", "") or ""),
+    )
+
+
+def installed_pets() -> list[InstalledPet]:
+    """List installed pets in sorted directory order, keeping only
+    sub-directories whose resolved spritesheet file exists.
+    """
+    loaded = (
+        load_pet(child.name)
+        for child in sorted(pets_dir().iterdir())
+        if child.is_dir()
+    )
+    return [pet for pet in loaded if pet and pet.exists]
+
+
+def resolve_active_pet(configured_slug: str | None = None) -> InstalledPet | None:
+    """Choose the pet to show.
+
+    The pet named by *configured_slug* (``display.pet.slug``) wins when it is
+    installed; failing that, the first entry of :func:`installed_pets` is
+    used, and ``None`` is returned when nothing is installed.
+    """
+    preferred = load_pet(configured_slug.strip()) if configured_slug else None
+    if preferred and preferred.exists:
+        return preferred
+    # Fall back to the first installed pet, if any.
+    return next(iter(installed_pets()), None)
+
+
+def install_pet(slug: str, *, force: bool = False, timeout: float = _DOWNLOAD_TIMEOUT) -> InstalledPet:
+    """Download *slug* from the petdex manifest into ``pets_dir()/<slug>/``.
+
+    Idempotent: a pet whose spritesheet is already on disk is returned as-is
+    unless *force*.  Raises :class:`PetStoreError` (invalid or unknown slug,
+    non-petdex host, failed download) / :class:`~agent.pet.manifest.ManifestError`.
+    """
+    from agent.pet.manifest import find_entry
+
+    slug = _safe_slug(slug)
+    if not slug:
+        raise PetStoreError("invalid pet slug")
+    existing = load_pet(slug)
+    if existing and existing.exists and not force:
+        return existing
+
+    entry = find_entry(slug, timeout=timeout)
+    if entry is None:
+        raise PetStoreError(f"pet '{slug}' is not in the petdex manifest")
+
+    # Host-pin the spritesheet URL to petdex so a compromised/spoofed manifest
+    # can't aim the download at an arbitrary host (thumbnail_png does the same).
+    # NOTE(review): _download follows redirects, so only the initial URL is pinned.
+    if not _is_petdex_host(entry.spritesheet_url):
+        raise PetStoreError(f"refusing non-petdex spritesheet host for '{slug}'")
+
+    directory = pets_dir() / slug
+    directory.mkdir(parents=True, exist_ok=True)
+
+    sprite_ext = ".png" if entry.spritesheet_url.lower().split("?")[0].endswith(".png") else ".webp"
+    sprite_path = directory / f"spritesheet{sprite_ext}"
+
+    _download(entry.spritesheet_url, sprite_path, timeout=timeout)
+
+    # Use the upstream pet.json when it is offered from a petdex host and can
+    # be fetched; otherwise synthesize a minimal one so the layout is self-describing.
+    meta: dict = {}
+    if entry.pet_json_url and _is_petdex_host(entry.pet_json_url):
+        try:
+            meta = _download_json(entry.pet_json_url, timeout=timeout)
+        except Exception as exc:  # noqa: BLE001 - non-fatal, fall back below
+            logger.debug("pet.json fetch failed for %s: %s", slug, exc)
+    if not isinstance(meta, dict) or not meta:
+        meta = {"id": slug, "displayName": entry.display_name, "description": ""}
+    meta["spritesheetPath"] = sprite_path.name  # always point at the file just downloaded
+    meta.setdefault("id", slug)  # only fills a gap left by the upstream pet.json
+    meta.setdefault("displayName", entry.display_name)
+    (directory / "pet.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
+
+    pet = load_pet(slug)  # re-read from disk to confirm the install is usable
+    if pet is None or not pet.exists:
+        raise PetStoreError(f"install of '{slug}' did not produce a spritesheet")
+    return pet
+
+
+def slugify(name: str) -> str:
+    """Turn a display name into a lowercase ``a-z0-9`` hyphenated slug (``"pet"`` if empty)."""
+    lowered = (name or "").strip().lower()
+    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-") or "pet"
+
+
+def unique_slug(name: str) -> str:
+    """Return ``slugify(name)``, suffixed ``-2``, ``-3``, … until no such pet dir exists."""
+    base = slugify(name)
+    store_root = pets_dir()
+    candidate, suffix = base, 1
+    while (store_root / candidate).exists():
+        suffix += 1
+        candidate = f"{base}-{suffix}"
+    return candidate
+
+
+def _write_spritesheet(source, dest: Path) -> None:
+    """Persist *source* at *dest*: raw bytes verbatim, PIL images/paths as lossless WebP."""
+    if isinstance(source, (bytes, bytearray)):
+        dest.write_bytes(bytes(source))
+        return
+
+    from PIL import Image
+
+    if not isinstance(source, (str, Path)):
+        rgba = source.convert("RGBA")
+    else:
+        with Image.open(source) as handle:
+            rgba = handle.convert("RGBA")
+    rgba.save(dest, format="WEBP", lossless=True, quality=100, method=6, exact=True)
+
+
+def register_local_pet(
+    spritesheet,
+    *,
+    slug: str,
+    display_name: str = "",
+    description: str = "",
+) -> InstalledPet:
+    """Store a locally generated pet under ``slugify(slug)`` and return it.
+
+    *spritesheet* may be a PIL image, raw image bytes, or a path. The written
+    ``pet.json`` carries ``createdBy: "generator"``; raises :class:`PetStoreError` on failure.
+    """
+    slug = slugify(slug)
+    pet_dir = pets_dir() / slug
+    pet_dir.mkdir(parents=True, exist_ok=True)
+    sheet = pet_dir / "spritesheet.webp"
+    try:
+        _write_spritesheet(spritesheet, sheet)
+    except Exception as exc:  # noqa: BLE001 - normalize to one error type
+        raise PetStoreError(f"could not write spritesheet for '{slug}': {exc}") from exc
+
+    manifest = json.dumps(
+        {
+            "id": slug,
+            "displayName": display_name or slug,
+            "description": description or "",
+            "spritesheetPath": sheet.name,
+            "createdBy": "generator",
+        },
+        indent=2,
+    )
+    (pet_dir / "pet.json").write_text(manifest, encoding="utf-8")
+    registered = load_pet(slug)
+    if registered is not None and registered.exists:
+        return registered
+    raise PetStoreError(f"register of generated pet '{slug}' did not produce a spritesheet")
+
+
+def export_pet(slug: str) -> tuple[str, bytes]:
+    """Zip an installed pet's directory in memory → ``(<name>.zip, bytes)``.
+
+    Only regular, non-dot files are archived, each under ``<name>/``. Raises
+    :class:`PetStoreError` unless *slug* names a direct child dir of the pets dir.
+    """
+    import io
+    import zipfile
+
+    store_root = pets_dir()
+    pet_dir = store_root / slug.strip()
+    # Traversal guard: after resolving, the parent must be the store itself.
+    is_direct_child = pet_dir.resolve().parent == store_root.resolve()
+    if not (is_direct_child and pet_dir.is_dir()):
+        raise PetStoreError(f"pet '{slug}' is not installed")
+
+    payload = io.BytesIO()
+    with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
+        members = (p for p in sorted(pet_dir.iterdir()) if p.is_file() and not p.name.startswith("."))
+        for member in members:
+            archive.write(member, f"{pet_dir.name}/{member.name}")
+    return f"{pet_dir.name}.zip", payload.getvalue()
+
+
+_THUMB_FRAME_W = 192
+_THUMB_FRAME_H = 208
+_THUMB_W = 96  # rendered ~40px; 2x+ keeps it crisp on HiDPI
+
+
+def _thumbs_dir() -> Path:
+    """Ensure the ``.thumbs`` cache directory under the pets dir exists; return it."""
+    (cache_dir := pets_dir() / ".thumbs").mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+def _is_petdex_host(url: str) -> bool:
+    """Whether *url*'s host is ``petdex.dev`` or a subdomain of it (anti-SSRF allowlist)."""
+    from urllib.parse import urlparse
+
+    try:
+        host = urlparse(url).hostname or ""
+    except ValueError:
+        return False
+    return host.lower() == "petdex.dev" or host.lower().endswith(".petdex.dev")
+
+
+def thumbnail_png(slug: str, *, source_url: str = "", timeout: float = 30.0) -> bytes | None:
+    """Return a small idle-frame PNG for *slug*, cached on disk.
+
+    *slug* is normalized with :func:`_safe_slug` first, so the cache file is
+    always a direct child of the thumbs dir. The top-left (idle, frame 0) cell
+    of the spritesheet is cropped and downsampled. Source preference: an
+    installed spritesheet on disk, else *source_url* — but only when it points
+    at petdex (so the gateway never fetches an arbitrary client-supplied URL).
+    Returns ``None`` when there's no usable source or Pillow/network fails.
+
+    Doing this server-side sidesteps the renderer's CSP / R2 hotlink limits that
+    break a direct ``<img src=cdn>``; the result rides the gateway instead.
+    """
+    slug = _safe_slug(slug)
+    if not slug:
+        return None
+
+    cache = _thumbs_dir() / f"{slug}.png"
+    if cache.is_file():
+        try:
+            return cache.read_bytes()
+        except OSError:
+            pass
+
+    sheet_bytes: bytes | None = None
+    pet = load_pet(slug)
+    if pet and pet.exists:
+        try:
+            sheet_bytes = pet.spritesheet.read_bytes()
+        except OSError:
+            sheet_bytes = None
+
+    if sheet_bytes is None and source_url and _is_petdex_host(source_url):
+        try:
+            import httpx
+
+            resp = httpx.get(
+                source_url,
+                timeout=timeout,
+                follow_redirects=True,
+                headers={"User-Agent": "hermes-agent-petdex"},
+            )
+            resp.raise_for_status()
+            sheet_bytes = resp.content
+        except Exception as exc:  # noqa: BLE001 - cosmetic, degrade to placeholder
+            logger.debug("thumb fetch failed for %s: %s", slug, exc)
+
+    if not sheet_bytes:
+        return None
+
+    try:
+        import io
+
+        from PIL import Image
+
+        with Image.open(io.BytesIO(sheet_bytes)) as im:
+            frame = im.convert("RGBA").crop(
+                (0, 0, min(_THUMB_FRAME_W, im.width), min(_THUMB_FRAME_H, im.height))
+            )
+            height = round(_THUMB_W * _THUMB_FRAME_H / _THUMB_FRAME_W)
+            frame = frame.resize((_THUMB_W, height), Image.NEAREST)
+            buf = io.BytesIO()
+            frame.save(buf, format="PNG")
+            data = buf.getvalue()
+    except Exception as exc:  # noqa: BLE001
+        logger.debug("thumb crop failed for %s: %s", slug, exc)
+        return None
+
+    try:
+        cache.write_bytes(data)
+    except OSError:
+        pass
+    return data
+
+
+def remove_pet(slug: str) -> bool:
+    """Delete the pet directory for *slug*; ``True`` only if one existed and is now gone."""
+    import shutil
+
+    safe = _safe_slug(slug)
+    if not safe:
+        return False
+
+    # Thumbnails are cached in pets/.thumbs/<slug>.png, which sits beside the
+    # pet directory rather than inside it, so it has to be removed explicitly;
+    # otherwise a future pet reusing the slug would show this pet's thumbnail.
+    try:
+        (_thumbs_dir() / f"{safe}.png").unlink(missing_ok=True)
+    except OSError:
+        pass
+
+    pet_dir = pets_dir() / safe
+    if pet_dir.is_dir():
+        shutil.rmtree(pet_dir, ignore_errors=True)
+        return not pet_dir.exists()
+    return False
+
+
+def rename_pet(slug: str, display_name: str) -> str | None:
+    """Rename a pet's ``displayName`` AND realign its slug/dir to match.
+
+    Generated pets are hatched under a provisional, prompt-derived slug; naming
+    the pet makes that name its real identity. The directory is renamed to
+    ``slugify(display_name)`` (and the cached thumbnail moved with it) only
+    when that slug differs from the current one and is not already taken;
+    otherwise the slug is left as is. Returns the resulting slug, or ``None``
+    when the inputs are empty, ``pet.json`` is missing, or it cannot be written.
+    """
+    slug = _safe_slug(slug)
+    display_name = (display_name or "").strip()
+    if not slug or not display_name:
+        return None
+    directory = pets_dir() / slug
+    pet_json = directory / "pet.json"
+    if not pet_json.is_file():
+        return None
+    try:
+        meta = json.loads(pet_json.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        meta = {}  # unreadable/corrupt metadata is replaced rather than failing the rename
+    if not isinstance(meta, dict):
+        meta = {}  # valid JSON that is not an object is discarded as well
+    meta["displayName"] = display_name
+
+    new_slug = slug
+    desired = slugify(display_name)
+    if desired and desired != slug and not (pets_dir() / desired).exists():
+        try:
+            directory.rename(pets_dir() / desired)
+            try:
+                (_thumbs_dir() / f"{slug}.png").rename(_thumbs_dir() / f"{desired}.png")
+            except OSError:
+                pass  # thumbnail move is best-effort (e.g. no thumbnail cached yet)
+            directory = pets_dir() / desired
+            pet_json = directory / "pet.json"  # write the metadata at the new location
+            new_slug = desired
+            meta["id"] = new_slug
+        except OSError:
+            new_slug = slug  # keep the provisional slug if the move fails
+
+    try:
+        pet_json.write_text(json.dumps(meta, indent=2), encoding="utf-8")
+    except OSError:
+        return None  # NOTE(review): the directory may already have been renamed by now
+    return new_slug
+
+
+def _download(url: str, dest: Path, *, timeout: float) -> None:
+    """Stream *url* into *dest* via a sibling ``.part`` file; any failure raises PetStoreError."""
+    import httpx
+
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    headers = {"User-Agent": "hermes-agent-petdex"}
+    try:
+        with httpx.stream("GET", url, timeout=timeout, follow_redirects=True, headers=headers) as resp:
+            resp.raise_for_status()
+            with tmp.open("wb") as fh:
+                for chunk in resp.iter_bytes():
+                    fh.write(chunk)
+        tmp.replace(dest)  # swap in only after the whole body has been written
+    except Exception as exc:  # noqa: BLE001 - normalize to one error type
+        try:
+            tmp.unlink(missing_ok=True)  # drop the partial file so it can't be exported later
+        except OSError:
+            pass  # cleanup is best-effort; the download error below is what matters
+        raise PetStoreError(f"download failed for {url}: {exc}") from exc
+
+
+def _download_json(url: str, *, timeout: float) -> dict:
+    """GET *url* and return the decoded JSON body when it is an object, else ``{}``.
+
+    HTTP status, transport, and JSON decoding errors propagate to the caller.
+    """
+    import httpx
+
+    agent = {"User-Agent": "hermes-agent-petdex"}
+    resp = httpx.get(url, timeout=timeout, follow_redirects=True, headers=agent)
+    resp.raise_for_status()
+    payload = resp.json()
+    return payload if isinstance(payload, dict) else {}
--- a/agent/process_bootstrap.py
+++ b/agent/process_bootstrap.py
@@ -26,7 +26,7 @@ from __future__ import annotations
 import os
 import sys
 import urllib.request
-from typing import Optional
+from typing import Any, Optional

 from utils import base_url_hostname, normalize_proxy_url

@@ -142,6 +142,46 @@ def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
    return proxy


+def build_keepalive_http_client(
+    base_url: str = "",
+    *,
+    async_mode: bool = False,
+) -> Optional[Any]:
+    """Create a sync or async httpx client with TCP keepalive socket options.
+
+    The proxy comes only from ``_get_proxy_for_base_url(base_url)``; per the
+    original design note, supplying a custom transport keeps httpx off its
+    ``trust_env`` path, so macOS system proxies are not picked up. Copilot
+    base URLs get a default client instead. Returns ``None`` if anything
+    raises. Mirrors ``AIAgent._build_keepalive_http_client``.
+    """
+    try:
+        import httpx
+        import socket
+
+        client_cls = httpx.AsyncClient if async_mode else httpx.Client
+        if "api.githubcopilot.com" in str(base_url or "").lower():
+            # Copilot endpoint: plain default client, no custom transport.
+            return client_cls()
+
+        tcp = socket.IPPROTO_TCP
+        keepalive = [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)]
+        if hasattr(socket, "TCP_KEEPIDLE"):
+            keepalive += [
+                (tcp, socket.TCP_KEEPIDLE, 30),
+                (tcp, socket.TCP_KEEPINTVL, 10),
+                (tcp, socket.TCP_KEEPCNT, 3),
+            ]
+        elif hasattr(socket, "TCP_KEEPALIVE"):
+            keepalive.append((tcp, socket.TCP_KEEPALIVE, 30))
+
+        proxy = _get_proxy_for_base_url(base_url)
+        transport_cls = httpx.AsyncHTTPTransport if async_mode else httpx.HTTPTransport
+        return client_cls(transport=transport_cls(socket_options=keepalive), proxy=proxy)
+    except Exception:
+        return None
+
+
 def _install_safe_stdio() -> None:
    """Wrap stdout/stderr so best-effort console output cannot crash the agent."""
    for stream_name in ("stdout", "stderr"):
@@ -164,4 +204,5 @@ __all__ = [
    "_install_safe_stdio",
    "_get_proxy_from_env",
    "_get_proxy_for_base_url",
+    "build_keepalive_http_client",
 ]
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -88,12 +88,15 @@ def _find_hermes_md(cwd: Path) -> Optional[Path]:
    stop_at = _find_git_root(cwd)
    current = cwd.resolve()

-    for directory in [current, *current.parents]:
+    # When there is no git root, only check cwd itself – walking parents
+    # could pick up a .hermes.md planted in /tmp, /home, etc.
+    search_dirs = [current, *current.parents] if stop_at else [current]
+
+    for directory in search_dirs:
        for name in _HERMES_MD_NAMES:
            candidate = directory / name
            if candidate.is_file():
                return candidate
-        # Stop walking at the git root (or filesystem root).
        if stop_at and directory == stop_at:
            break
    return None
@@ -243,7 +246,10 @@ KANBAN_GUIDANCE = (
    "- **Workspace.** `cd $HERMES_KANBAN_WORKSPACE` first. For a `worktree` kind "
    "with no `.git`, `git worktree add <path> "
    "${HERMES_KANBAN_BRANCH:-wt/$HERMES_KANBAN_TASK}` from the main repo, then "
-    "cd there.\n"
+    "cd there. For a project-linked task the workspace is a fresh "
+    "`<repo>/.worktrees/<task-id>` and `$HERMES_KANBAN_BRANCH` a deterministic "
+    "`<project-slug>/<task-id>` — the main repo is two levels up, so run "
+    "`git worktree add` from there.\n"
    "- **Deliverables.** Files a human wants go in "
    "`kanban_complete(artifacts=[<absolute paths>])` (top-level param; paths in "
    "`metadata` are NOT uploaded). Files must exist at completion.\n"
@@ -614,7 +620,12 @@ DEVELOPER_ROLE_MODELS = ("gpt-5", "codex")
 PLATFORM_HINTS = {
    "whatsapp": (
        "You are on a text messaging communication platform, WhatsApp. "
-        "Please do not use markdown as it does not render. "
+        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
+        "`code`, ```code blocks```, [links](url)) is auto-converted to "
+        "WhatsApp's native syntax (*bold*, _italic_, ~strike~, monospace) — "
+        "feel free to write in markdown, and use bullet lists ('- item') "
+        "freely. Tables are NOT supported — prefer bullet lists or labeled "
+        "key:value pairs. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. The file "
        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
@@ -679,7 +690,11 @@ PLATFORM_HINTS = {
    ),
    "signal": (
        "You are on a text messaging communication platform, Signal. "
-        "Please do not use markdown as it does not render. "
+        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
+        "`code`, ```code blocks```) is auto-converted to Signal's native "
+        "rich formatting — feel free to write in markdown, and use bullet "
+        "lists ('- item') freely (they render as • bullets). Tables are NOT "
+        "supported — prefer bullet lists or labeled key:value pairs. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio as attachments, and other "
@@ -709,7 +724,24 @@ PLATFORM_HINTS = {
        "(those are only intercepted on messaging platforms like Telegram, "
        "Discord, Slack, etc.; on the CLI they render as literal text). "
        "When referring to a file you created or changed, just state its "
-        "absolute path in plain text; the user can open it from there."
+        "absolute path in plain text; the user can open it from there. "
+        "Cron jobs scheduled from this session are LOCAL-ONLY: their output is "
+        "saved (viewable via cronjob action='list') but is NOT delivered back "
+        "into this terminal — there is no live-delivery channel here. If the "
+        "user wants to be notified when a job runs, the job's `deliver` must "
+        "target a gateway-connected messaging platform (e.g. deliver='telegram' "
+        "or 'all'). Do not promise the user that a deliver='origin' or "
+        "default-deliver cron job will message them in this session."
+    ),
+    "tui": (
+        "You are running in the Hermes terminal UI (TUI). "
+        "Cron jobs scheduled from this session are LOCAL-ONLY: their output is "
+        "saved (viewable via cronjob action='list') but is NOT delivered back "
+        "into this TUI session — there is no live-delivery channel here. If the "
+        "user wants to be notified when a job runs, the job's `deliver` must "
+        "target a gateway-connected messaging platform (e.g. deliver='telegram' "
+        "or 'all'). Do not promise the user that a deliver='origin' or "
+        "default-deliver cron job will message them in this session."
    ),
    "sms": (
        "You are communicating via SMS. Keep responses concise and use plain text "
@@ -897,8 +929,7 @@ def _probe_remote_backend(env_type: str) -> str | None:
    try:
        # Import locally: tools/ imports are heavy and only relevant when a
        # non-local backend is actually configured.
-        from tools.terminal_tool import _get_env_config  # type: ignore
-        from tools.environments import get_environment  # type: ignore
+        from tools.terminal_tool import _create_environment, _get_env_config  # type: ignore
    except Exception as e:
        logger.debug("Backend probe unavailable (import failed): %s", e)
        _BACKEND_PROBE_CACHE[cache_key] = ""
@@ -906,7 +937,59 @@ def _probe_remote_backend(env_type: str) -> str | None:

    try:
        config = _get_env_config()
-        env = get_environment(config)
+        # Build the environment the same way tools/terminal_tool.py does for a
+        # live command: select the backend image, then assemble ssh/container
+        # config from the env-derived dict. (There is no `get_environment`
+        # factory — the real entry point is `_create_environment`.)
+        if env_type == "docker":
+            image = config.get("docker_image", "")
+        elif env_type == "singularity":
+            image = config.get("singularity_image", "")
+        elif env_type == "modal":
+            image = config.get("modal_image", "")
+        elif env_type == "daytona":
+            image = config.get("daytona_image", "")
+        else:
+            image = ""
+
+        ssh_config = None
+        if env_type == "ssh":
+            ssh_config = {
+                "host": config.get("ssh_host", ""),
+                "user": config.get("ssh_user", ""),
+                "port": config.get("ssh_port", 22),
+                "key": config.get("ssh_key", ""),
+                "persistent": config.get("ssh_persistent", False),
+            }
+
+        container_config = None
+        if env_type in {"docker", "singularity", "modal", "daytona"}:
+            container_config = {
+                "container_cpu": config.get("container_cpu", 1),
+                "container_memory": config.get("container_memory", 5120),
+                "container_disk": config.get("container_disk", 51200),
+                "container_persistent": config.get("container_persistent", True),
+                "modal_mode": config.get("modal_mode", "auto"),
+                "docker_volumes": config.get("docker_volumes", []),
+                "docker_mount_cwd_to_workspace": config.get("docker_mount_cwd_to_workspace", False),
+                "docker_forward_env": config.get("docker_forward_env", []),
+                "docker_env": config.get("docker_env", {}),
+                "docker_run_as_host_user": config.get("docker_run_as_host_user", False),
+                "docker_extra_args": config.get("docker_extra_args", []),
+                "docker_persist_across_processes": config.get("docker_persist_across_processes", True),
+                "docker_orphan_reaper": config.get("docker_orphan_reaper", True),
+            }
+
+        env = _create_environment(
+            env_type=env_type,
+            image=image,
+            cwd=config.get("cwd", ""),
+            timeout=config.get("timeout", 180),
+            ssh_config=ssh_config,
+            container_config=container_config,
+            task_id="prompt-backend-probe",
+            host_cwd=config.get("host_cwd"),
+        )
        # Single-line POSIX probe — works on any Unixy backend. Wrapped in
        # `2>/dev/null` so a missing binary doesn't pollute the output.
        probe_cmd = (
--- a/agent/reasoning_timeouts.py
+++ b/agent/reasoning_timeouts.py
@@ -0,0 +1,216 @@
+"""Per-reasoning-model stale-timeout floor for known reasoning models.
+
+Reasoning models (those that emit extended thinking blocks before their
+first content token) routinely exceed Hermes's default chat-model
+stale detectors:
+
+* Stream stale detector:   ``HERMES_STREAM_STALE_TIMEOUT``     default 180s
+                           ``agent/chat_completion_helpers.py:2544``
+* Non-stream stale detector: ``HERMES_API_CALL_STALE_TIMEOUT``  default 90s
+                           ``run_agent.py:1140``
+
+For NVIDIA Nemotron 3 Ultra on the hosted NIM gateway the empirical
+upstream idle kill is ~120s (first-party reproduction at
+NVIDIA/NemoClaw#4846 — TTFB ~31s, stream dies at 120s). The same
+failure mode exists on OpenAI o1/o3, Anthropic Opus 4.x thinking,
+DeepSeek R1, Qwen QwQ, xAI Grok reasoning — every cloud reasoning
+model hits upstream-proxies / load-balancers with idle timeouts
+shorter than the model's thinking phase. Result: the stale detector
+kills the connection mid-think, surfacing as
+``BrokenPipeError``/``RemoteProtocolError`` on the next read.
+
+This module provides a floor that the existing stale-detector scaling
+blocks consult via :func:`get_reasoning_stale_timeout_floor` and
+apply as ``max(default, floor)``. It is a FLOOR:
+
+* Never overrides explicit user config (``providers.<id>.models.<model>.stale_timeout_seconds``
+  or ``request_timeout_seconds`` already wins — this code never runs
+  in that branch).
+* Never lowers an existing threshold.
+* Has zero effect on non-reasoning models — they are not in the
+  allowlist and the resolver returns ``None``.
+
+Matching uses start-anchored regex on the slug-only component of
+the model name (after stripping any aggregator prefix like
+``openai/``, ``x-ai/``, ``anthropic/``).  The right-anchor matches
+end-of-string or a ``-``/``.``/``_`` slug separator, so ``qwen3-235b``
+matches the ``qwen3`` family entry (a future model slug would be
+``qwen3-235b-instruct`` and would also match) but ``some-other-qwen3``
+does NOT match ``qwen3`` (the ``-qwen3`` is not at start of slug).
+
+The ``o1`` case is the most delicate: a model named
+``llama-4-70b-o1-preview`` is a hypothetical community derivative that
+should NOT trigger the reasoning-model floor for the user (the user
+chose a non-OpenAI model, not a reasoning model).  The start-of-slug
+anchor naturally excludes this — the ``o1-preview`` substring starts at
+index 12 of the slug, not at index 0.  The previous substring-
+with-trailing-hyphen design would have over-matched here, which is
+why start-of-slug anchoring is the right shape.
+
+Fixes #52217.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+
+# (slug, floor_seconds).  Each slug is matched as a start-anchored
+# prefix of the model slug via the wrapper regex built in ``_get_pattern``
+# below.  Order is irrelevant — ``_match_any`` tries longest slugs first.
+_REASONING_STALE_TIMEOUT_FLOORS: tuple[tuple[str, int], ...] = (
+    # NVIDIA Nemotron — reasoning models behind hosted NIM with
+    # documented 60-180s upstream idle kill (NVIDIA/NemoClaw#4846:
+    # 120s measured).
+    ("nemotron-3-ultra", 600),
+    ("nemotron-3-super", 600),
+    ("nemotron-3-nano",  300),
+    # DeepSeek — R1 reasoning model on hosted NIM / DeepSeek direct.
+    ("deepseek-r1", 600),
+    ("deepseek-reasoner", 600),
+    # Qwen — QwQ reasoning + Qwen3 thinking variants.  QwQ-32B
+    # preview is the stable slug; ``qwen3`` covers the family of
+    # thinking-mode Qwen3 models (qwen3-235b-a22b, qwen3-32b, etc.).
+    # This knowingly over-matches: the entry is a start-anchored
+    # prefix followed by end-of-string or a separator, so
+    # ``qwen3-235b-instruct`` (instruct is NOT a thinking variant)
+    # matches as well.  Acceptable trade-off: instruct variants of
+    # qwen3 get the 180s floor even though they don't reason.  The
+    # cost is a slightly longer wait on a hung provider; the
+    # alternative (matching only ``qwen3-.*-thinking``) breaks the
+    # moment NVIDIA or Alibaba ships a slightly different naming
+    # shape.
+    ("qwq-32b", 300),
+    ("qwen3", 180),
+    # OpenAI o-series — known multi-minute TTFB.  Variants are
+    # enumerated so a longer slug can carry its own floor (``o3-mini``
+    # vs ``o3``); the start anchor is what keeps ``o1`` off ``olmo-1``.
+    ("o1", 600),
+    ("o1-mini", 600),
+    ("o1-pro", 600),
+    ("o1-preview", 600),
+    ("o3", 600),
+    ("o3-pro", 600),
+    ("o3-mini", 300),
+    ("o4-mini", 300),
+    # Anthropic Claude 4.x thinking variants.  Anchored at
+    # ``claude-opus-4`` so non-thinking Claude 3.x or future
+    # non-reasoning Claude variants don't match.
+    ("claude-opus-4", 240),
+    ("claude-sonnet-4.5", 180),
+    ("claude-sonnet-4.6", 180),
+    # xAI Grok reasoning variants.  Explicit reasoning-only keys
+    # plus one for the ``non-reasoning`` variant so users picking
+    # the fast variant don't get the 300s floor.  Bare ``grok-3``,
+    # ``grok-4`` etc. don't match — only the explicit reasoning /
+    # non-reasoning pairs.
+    ("grok-4-fast-reasoning", 300),
+    ("grok-4.20-reasoning", 300),
+    ("grok-4-fast-non-reasoning", 180),
+)
+
+
+# Compile each pattern lazily and cache it.  Wrapper = start-of-slug +
+# slug + end-or-separator, where ``start-of-slug`` is start-of-string
+# once the aggregator prefix (through the last ``/``) is stripped and
+# ``end-or-separator`` means end-of-string OR a ``-``/``.``/``_``.
+#
+# Why start-of-slug and not start-of-string: aggregator prefixes
+# like ``openai/`` should not affect matching — the slug identity is
+# the part after the last ``/``.  Stripping the aggregator prefix in
+# :func:`get_reasoning_stale_timeout_floor` before regex matching
+# gives the wrapper a clean start-of-string anchor.
+#
+# Why end-or-separator on the right: ``openai/o3-mini`` must match
+# the ``o3-mini`` slug (the right anchor is end-of-string).  And
+# ``openai/o3-mini-2025-01-31`` must also match ``o3-mini`` (the right
+# anchor is the ``-`` separator).  A side effect is that a fork such
+# as ``openai/o3-mini-fork`` matches ``o3-mini`` too — the separator
+# anchor cannot tell a dated release from a derivative.  We accept
+# that community forks inheriting the same prefix are
+# treated as reasoning models (a reasonable default — the upstream
+# gateway timing is the same).
+_PATTERN_CACHE: dict[str, re.Pattern[str]] = {}
+
+
+def _get_pattern(slug: str) -> re.Pattern[str]:
+    compiled = _PATTERN_CACHE.get(slug)
+    if compiled is None:
+        compiled = re.compile(
+            r"^"
+            + re.escape(slug)
+            + r"(?:$|[\-._])"
+        )
+        _PATTERN_CACHE[slug] = compiled
+    return compiled
+
+
+def _match_any(model_lower: str) -> Optional[float]:
+    """Return the floor for the first matching slug, else None.
+
+    Each table entry is matched as a start-of-slug prefix with the
+    slug-separator-or-end-of-string right-anchor.  Table iteration
+    order is irrelevant: longest slug wins (so ``o3-mini`` beats
+    ``o3`` on a model like ``openai/o3-mini``).
+    """
+    # Sort by slug length descending so longer / more-specific slugs
+    # win on shared prefixes (o3-mini beats o3).
+    sorted_floors = sorted(
+        _REASONING_STALE_TIMEOUT_FLOORS, key=lambda kv: -len(kv[0])
+    )
+    for slug, floor in sorted_floors:
+        if _get_pattern(slug).search(model_lower):
+            return float(floor)
+    return None
+
+
+def get_reasoning_stale_timeout_floor(model: object) -> Optional[float]:
+    """Return the stale-timeout floor (seconds) for a known reasoning model.
+
+    Returns ``None`` when the model is not in the allowlist or the
+    argument is empty / not a string.  Matching uses a
+    start-anchored regex on the lowercased, stripped model slug, so
+    ``openai/o3-mini`` matches the ``o3-mini`` slug but
+    ``olmo-1`` does NOT match ``o1`` (``olmo-1`` does not start with
+    ``o1`` followed by end-of-string or a separator).
+
+    Aggregator prefixes (``openai/``, ``x-ai/``, ``anthropic/`` etc.)
+    are stripped before matching — everything up to and including
+    the last ``/`` is dropped, so ``openai/o3-mini`` is matched as
+    the bare slug ``o3-mini`` against the start-of-string anchor.
+
+    This is a FLOOR — callers must apply it as ``max(default, floor)``
+    and only when no explicit user-configured per-model
+    ``stale_timeout_seconds`` exists.
+
+    >>> get_reasoning_stale_timeout_floor("nvidia/nemotron-3-ultra-550b-a55b")
+    600.0
+    >>> get_reasoning_stale_timeout_floor("openai/o3-mini")
+    300.0
+    >>> get_reasoning_stale_timeout_floor("deepseek/deepseek-r1")
+    600.0
+    >>> get_reasoning_stale_timeout_floor("qwen/qwen3-235b-a22b-thinking")
+    180.0
+    >>> get_reasoning_stale_timeout_floor("x-ai/grok-4-fast-reasoning")
+    300.0
+    >>> get_reasoning_stale_timeout_floor("anthropic/claude-opus-4-6")
+    240.0
+    >>> get_reasoning_stale_timeout_floor("gpt-4o") is None
+    True
+    >>> get_reasoning_stale_timeout_floor("olmo-1") is None
+    True
+    >>> get_reasoning_stale_timeout_floor(None) is None
+    True
+    """
+    if not model or not isinstance(model, str):
+        return None
+    name = model.strip().lower()
+    if not name:
+        return None
+    # Strip aggregator prefix (everything before and including the
+    # last ``/``).  The wrapper regex anchors at start-of-string, so
+    # the slug identity is the bare model name.
+    if "/" in name:
+        name = name.rsplit("/", 1)[1]
+    return _match_any(name)
--- a/agent/redact.py
+++ b/agent/redact.py
@@ -10,6 +10,7 @@ the first 6 and last 4 characters for debuggability.
 import logging
 import os
 import re
+import shlex

 logger = logging.getLogger(__name__)

@@ -107,12 +108,60 @@ _PREFIX_PATTERNS = [
    r"ntn_[A-Za-z0-9]{10,}",            # Notion internal integration token
 ]

-# ENV assignment patterns: KEY=value where KEY contains a secret-like name
+# ENV assignment patterns: KEY=value where KEY contains a secret-like name.
+# Uppercase keys tolerate spaces around "=" (e.g. ``FOO_SECRET = bar``) because
+# an all-caps key is almost never prose/code.
 _SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
 _ENV_ASSIGN_RE = re.compile(
    rf"([A-Z0-9_]{{0,50}}{_SECRET_ENV_NAMES}[A-Z0-9_]{{0,50}})\s*=\s*(['\"]?)(\S+)\2",
 )

+# Lowercase / dotted / hyphenated config keys from config files
+# (application.properties, .env, YAML-ish dumps): ``spring.datasource.password=secret``,
+# ``app.api.key=xyz``, ``password=secret``. The uppercase _ENV_ASSIGN_RE above
+# never matched these, so config-file passwords leaked verbatim (issue #16413).
+#
+# These run only in a config-file context, NOT in prose, code, or URLs — three
+# carve-outs preserved from the original design (#4367 + the documented
+# web-URL passthrough below):
+#   1. The value is bounded by ``[^\s&]`` (stops at whitespace AND ``&``) so
+#      form-urlencoded bodies are handled pair-by-pair (by _redact_form_body),
+#      not greedily swallowed.
+#   2. _CFG_DOTTED_RE only matches when the key is NAMESPACED (contains a dot),
+#      which is unambiguously a config key — never a prose word.
+#   3. _CFG_ANCHORED_RE matches a bare secret-word key only at line start
+#      (optionally after ``export``), so conversational ``I have password=foo``
+#      mid-sentence is left alone.
+# The colon-form URL guard (skip when ``://`` present) lives at the call site.
+_SECRET_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential|auth)"
+_CFG_VALUE = r"(['\"]?)([^\s&]+?)\2(?=[\s&]|$)"
+# Namespaced (dotted) key: the secret word may sit anywhere in a dotted path.
+_CFG_DOTTED_RE = re.compile(
+    rf"((?:[A-Za-z0-9_\-]+\.)+[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*"
+    rf"|[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*\.[A-Za-z0-9_.\-]+)"
+    rf"={_CFG_VALUE}",
+    re.IGNORECASE,
+)
+# Line-anchored bare key: ``password=…`` / ``export api_key=…`` at start of line.
+_CFG_ANCHORED_RE = re.compile(
+    rf"(^[ \t]*(?:export[ \t]+)?[A-Za-z0-9_\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_\-]*)={_CFG_VALUE}",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Unquoted YAML / colon config (e.g. ``password: secret``,
+# ``spring.datasource.password: hunter2``). The secret keyword must be part of
+# the KEY (anchored to the start of the line/indent), and the value is a single
+# whitespace-free token — so prose like ``note: secret meeting`` (keyword in the
+# value) and ``error: token expired`` are left alone. Bare ``auth`` is excluded
+# from the key set so ``Authorization:`` / ``author:`` don't match (the former
+# is masked by _AUTH_HEADER_RE); ``auth_token``/``auth-token`` still match via
+# the ``token`` keyword. Quoted values defer to _JSON_FIELD_RE via the lookahead.
+_YAML_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential)"
+_YAML_ASSIGN_RE = re.compile(
+    rf"(^[ \t]*[A-Za-z0-9_.\-]*{_YAML_CFG_NAMES}[A-Za-z0-9_.\-]*)(:[ \t]*)(?!['\"])([^\s&]+)",
+    re.IGNORECASE | re.MULTILINE,
+)
+
 # JSON field patterns: "apiKey": "value", "token": "value", etc.
 _JSON_KEY_NAMES = r"(?:api_?[Kk]ey|token|secret|password|access_token|refresh_token|auth_token|bearer|secret_value|raw_secret|secret_input|key_material)"
 _JSON_FIELD_RE = re.compile(
@@ -125,8 +174,15 @@ _JSON_FIELD_RE = re.compile(
 # while the header name and scheme word are preserved for debuggability. The
 # previous rule only matched ``Bearer``, so ``Basic <base64 user:pass>`` and
 # ``token <pat>`` leaked verbatim into logs/transcripts.
+#
+# The credential class excludes quote characters (``"`` / ``'``): a token sitting
+# flush against a closing quote (``"Authorization: Bearer sk-..."``) must not pull
+# that quote into the match, or masking turns value corruption into *syntax*
+# corruption — the closing quote vanishes and the command/string no longer parses
+# (unterminated quote → shell EOF / Python SyntaxError). Real credentials never
+# contain ``"`` or ``'``, so excluding them is safe. See #43083.
 _AUTH_HEADER_RE = re.compile(
-    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?(\S+)",
+    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?([^\s\"']+)",
    re.IGNORECASE,
 )

@@ -154,9 +210,37 @@ _PRIVATE_KEY_RE = re.compile(
 )

 # Database connection strings: protocol://user:PASSWORD@host
-# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password
+# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password.
+# The userinfo and password groups forbid whitespace ([^:\s]+ / [^@\s]+) so the
+# match can never span a line break. A real DSN password never contains
+# whitespace; without this bound the greedy [^@]+ would scan past the end of a
+# code line to the next stray "@" (e.g. a Python decorator), swallowing
+# intervening lines and corrupting tool OUTPUT for any source containing a
+# postgresql:// f-string template. See issue #33801.
 _DB_CONNSTR_RE = re.compile(
-    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:]+:)([^@]+)(@)",
+    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:\s]+:)([^@\s]+)(@)",
+    re.IGNORECASE,
+)
+
+# Bare-token credential in a web/transport URL: ``scheme://TOKEN@host``.
+# This is the ``git remote set-url origin https://PASSWORD@github.com/...``
+# shape from issue #6396 — a single opaque credential in the userinfo position
+# with NO ``user:pass`` colon. It is unambiguously a secret: legitimate
+# round-trip URLs (OAuth callbacks, magic links, pre-signed shares — see the
+# "Web-URL redaction is intentionally OFF" note in redact_sensitive_text) carry
+# their tokens in the QUERY STRING, never in bare userinfo. The colon form
+# ``user:pass@`` is deliberately left to pass through (commit "pass web URLs
+# through unchanged", #34029) and is NOT matched here — the token class forbids
+# ``:``. DB schemes are handled by _DB_CONNSTR_RE above and excluded here.
+#
+# Guards against false positives:
+#   - 8+ char floor skips short usernames (git, admin, root, deploy, ubuntu).
+#   - The token class ``[^\s:@/]`` cannot cross ``/``, so an ``@`` sitting in a
+#     path or query (e.g. ``?q=user@example.com``) is never treated as userinfo.
+_URL_BARE_TOKEN_RE = re.compile(
+    r"((?:https?|wss?|git|ssh|ftp|ftps|sftp)://)"  # scheme
+    r"([^\s:@/]{8,})"                               # bare token (no colon/slash/@), 8+ chars
+    r"(@[^\s]+)",                                   # @host...
    re.IGNORECASE,
 )

@@ -340,7 +424,40 @@ def _redact_form_body(text: str) -> str:
    return _redact_query_string(text.strip())


-def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = False) -> str:
+def _mask_token_nonreusable(token: str) -> str:
+    """Redact a prefix-matched credential to a NON-REUSABLE sentinel.
+
+    Unlike :func:`_mask_token` (which keeps head/tail chars — fine for logs
+    that are never fed back into a config), this emits a marker that:
+
+    * cannot be mistaken for a usable-but-truncated key, so an agent that
+      reads it from a config file and writes it back does NOT corrupt the
+      stored credential into a dead 13-char string (issue #35519); and
+    * still does not leak the secret material (no head/tail chars).
+
+    The vendor prefix label is preserved for debuggability so the agent can
+    still tell *which* credential is present (e.g. a GitHub PAT vs an OpenAI
+    key) without seeing any of its bytes.
+    """
+    if not token:
+        return "«redacted-secret»"
+    # Preserve only the recognizable vendor prefix label (e.g. "ghp_", "sk-"),
+    # never any of the random secret body.
+    label = ""
+    for sub in _PREFIX_SUBSTRINGS:
+        if token.startswith(sub):
+            label = sub
+            break
+    return f"«redacted:{label}…»" if label else "«redacted-secret»"
+
+
+def redact_sensitive_text(
+    text: str,
+    *,
+    force: bool = False,
+    code_file: bool = False,
+    file_read: bool = False,
+) -> str:
    """Apply all redaction patterns to a block of text.

    Safe to call on any string -- non-matching text passes through unchanged.
@@ -353,6 +470,17 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    constants, "apiKey": "test" fixtures). Prefix patterns, auth headers,
    private keys, DB connstrings, JWTs, and URL secrets are still redacted.

+    Set file_read=True for file *content* returned to the agent (read_file /
+    search_files / cat). Secrets are STILL redacted — they are never exposed —
+    but prefix-matched credentials are replaced with a non-reusable sentinel
+    (``«redacted:ghp_…»``) instead of a head/tail-preserving mask
+    (``ghp_S1...Pn2T``). The old mask looked like a real-but-truncated key, so
+    an agent reading it from config.yaml and writing it back silently corrupted
+    the stored credential into a dead 13-char value → 401 (issue #35519). The
+    sentinel is syntactically invalid as a token, so it can't be mistaken for a
+    usable key or written back as one. Implies code_file=True (config/data
+    files shouldn't trigger the source-code ENV/JSON false-positive paths).
+
    Performance: each regex pattern is gated behind a cheap substring
    pre-check (e.g. ``"=" in text`` for ENV assignments, ``"://" in text``
    for URLs, ``"eyJ" in text`` for JWTs). On a typical hermes log line
@@ -371,9 +499,15 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    if not (force or _REDACT_ENABLED):
        return text

+    # file_read content shouldn't hit the source-code ENV/JSON false-positive
+    # paths either (it's config/data, not log lines).
+    if file_read:
+        code_file = True
+
    # Known prefixes (sk-, ghp_, etc.) — gate on substring presence
    if _has_known_prefix_substring(text):
-        text = _PREFIX_RE.sub(lambda m: _mask_token(m.group(1)), text)
+        _prefix_sub = _mask_token_nonreusable if file_read else _mask_token
+        text = _PREFIX_RE.sub(lambda m: _prefix_sub(m.group(1)), text)

    # ENV assignments: OPENAI_API_KEY=***  (skip for code files — false positives)
    if not code_file:
@@ -382,6 +516,13 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
                name, quote, value = m.group(1), m.group(2), m.group(3)
                return f"{name}={quote}{_mask_token(value)}{quote}"
            text = _ENV_ASSIGN_RE.sub(_redact_env, text)
+            # Lowercase/dotted config keys (issue #16413). Skip URLs entirely —
+            # web-URL query params are intentionally passed through (see note
+            # near the bottom of this function); _DB_CONNSTR_RE still guards
+            # connection-string passwords.
+            if "://" not in text:
+                text = _CFG_DOTTED_RE.sub(_redact_env, text)
+                text = _CFG_ANCHORED_RE.sub(_redact_env, text)

        # JSON fields: "apiKey": "***"  (skip for code files — false positives)
        if ":" in text and '"' in text:
@@ -390,6 +531,15 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
                return f'{key}: "{_mask_token(value)}"'
            text = _JSON_FIELD_RE.sub(_redact_json, text)

+        # Unquoted YAML / colon config: password: ***  (after JSON so quoted
+        # values are handled there; the lookahead in _YAML_ASSIGN_RE skips
+        # quotes). Skip URLs — web-URL query params pass through by design.
+        if ":" in text and "://" not in text:
+            def _redact_yaml(m):
+                key, sep, value = m.group(1), m.group(2), m.group(3)
+                return f"{key}{sep}{_mask_token(value)}"
+            text = _YAML_ASSIGN_RE.sub(_redact_yaml, text)
+
    # Authorization headers — _AUTH_HEADER_RE matches any scheme after
    # "[Proxy-]Authorization:" case-insensitively, so "uthorization" is the
    # cheapest substring gate that covers every casing without a casefold().
@@ -419,9 +569,32 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    if "BEGIN" in text and "-----" in text:
        text = _PRIVATE_KEY_RE.sub("[REDACTED PRIVATE KEY]", text)

-    # Database connection string passwords
+    # Database connection string passwords. With code_file=True, a password
+    # group that is a pure ``{...}`` brace expression is an f-string template
+    # reference (e.g. f"postgresql://{user}:{pass}@{host}"), not a literal
+    # credential — preserve it. Literal passwords are still redacted. The regex
+    # forbids whitespace in the password group, so a single-line template's
+    # group(2) is exactly the brace expression. See issue #33801.
    if "://" in text:
-        text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
+        if code_file:
+            def _redact_db(m):
+                pw = m.group(2)
+                if pw.startswith("{") and pw.endswith("}"):
+                    return m.group(0)
+                return f"{m.group(1)}***{m.group(3)}"
+            text = _DB_CONNSTR_RE.sub(_redact_db, text)
+        else:
+            text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
+
+        # Bare-token userinfo in web/transport URLs: ``scheme://TOKEN@host``.
+        # The git-remote-with-embedded-password shape from #6396. Only the
+        # colon-less bare-token form is redacted — ``user:pass@`` and
+        # query-string tokens are left to pass through (see the web-URL note
+        # below). See _URL_BARE_TOKEN_RE for the false-positive guards.
+        text = _URL_BARE_TOKEN_RE.sub(
+            lambda m: f"{m.group(1)}{_mask_token(m.group(2))}{m.group(3)}",
+            text,
+        )

    # JWT tokens (eyJ... — base64-encoded JSON headers)
    if "eyJ" in text:
@@ -434,7 +607,12 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    # blanket-redacting param values by name breaks those skills mid-flow.
    # Known credential shapes (sk-, ghp_, JWTs, etc.) inside URLs are still
    # caught by _PREFIX_RE and _JWT_RE above. DB connection-string passwords
-    # are still caught by _DB_CONNSTR_RE.
+    # are still caught by _DB_CONNSTR_RE. The ONE userinfo case still redacted
+    # is the colon-less bare-token form ``scheme://TOKEN@host`` (#6396, handled
+    # by _URL_BARE_TOKEN_RE in the ``://`` block above): a bare credential in
+    # userinfo is never a round-trip workflow token (those live in the query
+    # string), so masking it can't break a skill. The ``user:pass@`` form is
+    # left to pass through per #34029.

    # Form-urlencoded bodies (only triggers on clean k=v&k=v inputs).
    if "&" in text and "=" in text:
@@ -452,6 +630,66 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    return text


+# Commands whose stdout is an environment-variable dump (KEY=value lines),
+# NOT source code. For these, terminal-output redaction must run the
+# ENV-assignment pass (code_file=False) so opaque tokens with no recognized
+# vendor prefix (e.g. ``MY_SERVICE_TOKEN=abc123randomstring``) are still
+# masked. For all other commands, code_file=True is used to avoid mangling
+# legitimate source/config dumps (``MAX_TOKENS=100``, ``"apiKey": "x"``
+# fixtures, ``postgresql://{user}`` f-string templates). See issue #43025.
+_ENV_DUMP_COMMANDS = frozenset({"env", "printenv", "set", "export", "declare"})
+
+
+def is_env_dump_command(command: str | None) -> bool:
+    """Return True if ``command`` dumps environment variables to stdout.
+
+    Detects ``env`` / ``printenv`` / ``set`` / ``export`` / ``declare`` as the
+    first token of any segment in a pipeline or sequence (``;`` / ``&&`` /
+    ``||`` / ``|``). Segments ``shlex`` cannot parse are whitespace-split;
+    anything unrecognized returns False (callers then fall back to the safer
+    code_file=True path, which still masks prefix-shaped keys).
+    """
+    if not command or not isinstance(command, str):
+        return False
+    # Split on shell separators, then inspect the first token of each segment.
+    segments = re.split(r"[|;&]+", command)
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+        try:
+            tokens = shlex.split(seg)
+        except ValueError:
+            tokens = seg.split()
+        if tokens and tokens[0] in _ENV_DUMP_COMMANDS:
+            return True
+    return False
+
+
+def redact_terminal_output(
+    output: str, command: str | None = None, *, force: bool = False
+) -> str:
+    """Redact secrets from terminal/process stdout.
+
+    Single redaction policy for ALL terminal-output surfaces — foreground
+    ``terminal`` results AND background ``process(action=poll/log/wait)``
+    output — so they can't diverge. Picks ``code_file`` based on whether
+    ``command`` is an environment dump:
+
+    - env-dump command (``env``/``printenv``/``set``/``export``/``declare``)
+      → ``code_file=False`` so the ENV-assignment pass masks opaque tokens.
+    - anything else (or unknown command) → ``code_file=True`` to avoid
+      false positives on source/config dumps.
+
+    ``force=True`` bypasses the global ``security.redact_secrets`` preference
+    for safety boundaries that must never emit raw credentials.
+    """
+    if not output:
+        return output
+    code_file = not is_env_dump_command(command or "")
+    return redact_sensitive_text(output, force=force, code_file=code_file)
+
+
 # Substrings used to gate ``_PREFIX_RE`` execution. If none of these appear in
 # the input string, the prefix regex cannot match anything, so we skip it.
 # False positives are fine (they just run the regex, which then matches
--- a/agent/replay_cleanup.py
+++ b/agent/replay_cleanup.py
@@ -0,0 +1,140 @@
+"""Replay-history sanitization shared across resume code paths.
+
+When a session's last turn dies mid-tool-loop — the process is killed by a
+restart/shutdown command, a stale-timeout fires, or an interrupt lands before
+the tool result is written — the persisted transcript can end with a dangling
+``assistant(tool_calls)`` (no matching ``tool`` answer) or an interrupted
+``assistant→tool`` block.  On resume the model sees that broken tail and
+re-issues the unanswered call, producing an endless "thinking"/reboot loop
+(#49201, #29086).
+
+These pure helpers strip those tails before the history is replayed to the
+model.  They were originally local to ``gateway/run.py`` (which fixed the
+messaging-gateway path) and are extracted here so every resume surface — the
+messaging gateway AND the TUI/WebUI gateway — shares the same cleanup instead
+of the WebUI path silently skipping it.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, List
+
+logger = logging.getLogger(__name__)
+
+
+def is_interrupted_tool_result(content: Any) -> bool:
+    """Return True if a tool result indicates the tool was interrupted."""
+    if not isinstance(content, str):
+        return False
+    lowered = content.lower()
+    if "[command interrupted]" in lowered:
+        return True
+    if "exit_code" in lowered and ("130" in lowered or "-1" in lowered):
+        return "interrupt" in lowered
+    return False
+
+
+def strip_interrupted_tool_tails(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Strip interrupted assistant→tool sequences from replay history.
+
+    Older interrupted gateway turns can be followed by a queued real user
+    message, so the interrupted assistant/tool block is not necessarily the
+    final tail by the time we rebuild replay history.  Remove any contiguous
+    assistant(tool_calls) + tool-result block that contains an interrupted tool
+    result, while preserving successful tool-call sequences intact.
+    """
+    if not agent_history:
+        return agent_history
+
+    cleaned: List[Dict[str, Any]] = []
+    i = 0
+    n = len(agent_history)
+    while i < n:
+        msg = agent_history[i]
+        if msg.get("role") == "assistant" and "tool_calls" in msg:
+            j = i + 1
+            tool_results: List[Dict[str, Any]] = []
+            while j < n and agent_history[j].get("role") == "tool":
+                tool_results.append(agent_history[j])
+                j += 1
+            if tool_results and any(
+                is_interrupted_tool_result(m.get("content", ""))
+                for m in tool_results
+            ):
+                logger.debug(
+                    "Stripping interrupted assistant→tool replay block "
+                    "(indices %d–%d, tool_results=%d)",
+                    i, j - 1, len(tool_results),
+                )
+                i = j
+                continue
+        if msg.get("role") == "tool" and is_interrupted_tool_result(msg.get("content", "")):
+            logger.debug("Stripping orphan interrupted tool result from replay history")
+            i += 1
+            continue
+        cleaned.append(msg)
+        i += 1
+
+    return cleaned
+
+
+def strip_dangling_tool_call_tail(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
+
+    When a tool call itself kills the gateway process (``docker restart``,
+    ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
+    is terminated by SIGKILL *mid-call* — before the tool result is ever
+    written and before the orderly shutdown rewind
+    (``_drop_trailing_empty_response_scaffolding``) can run.  The last thing
+    persisted is the ``assistant`` message that issued the ``tool_calls``,
+    with zero matching ``tool`` rows.
+
+    On resume the model sees an unanswered tool call at the tail and naturally
+    re-issues it — which restarts the gateway again, producing the infinite
+    reboot loop in #49201.  ``strip_interrupted_tool_tails`` does not catch
+    this because there is no tool result to inspect for an interrupt marker.
+
+    This strips that dangling tail at the source so there is nothing for the
+    model to re-execute.  It only acts when the tail is an
+    ``assistant(tool_calls)`` whose calls have NO corresponding ``tool``
+    results — a completed assistant→tool pair (any tool answers present) is
+    left untouched so genuine mid-progress tool loops still resume.
+    """
+    if not agent_history:
+        return agent_history
+
+    last = agent_history[-1]
+    if not (
+        isinstance(last, dict)
+        and last.get("role") == "assistant"
+        and last.get("tool_calls")
+    ):
+        return agent_history
+
+    logger.debug(
+        "Stripping dangling unanswered assistant(tool_calls) tail "
+        "(%d call(s)) — process likely killed mid-tool-call by a "
+        "restart/shutdown command (#49201)",
+        len(last.get("tool_calls") or []),
+    )
+    return agent_history[:-1]
+
+
+def sanitize_replay_history(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Apply both replay-tail strippers in the canonical order.
+
+    Convenience entry point for resume code paths: removes interrupted
+    assistant→tool blocks anywhere in the history, then removes a dangling
+    unanswered ``assistant(tool_calls)`` tail.  Non-empty input always yields a
+    new list (the input is never mutated); empty input is returned as-is.
+    """
+    if not agent_history:
+        return agent_history
+    return strip_dangling_tool_call_tail(strip_interrupted_tool_tails(agent_history))
--- a/agent/retry_utils.py
+++ b/agent/retry_utils.py
@@ -8,6 +8,7 @@ rate-limited provider concurrently.
 import random
 import threading
 import time
+from typing import Any

 # Monotonic counter for jitter seed uniqueness within the same process.
 # Protected by a lock to avoid race conditions in concurrent retry paths
@@ -15,6 +16,14 @@ import time
 _jitter_counter = 0
 _jitter_lock = threading.Lock()

+# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
+# ("The service may be temporarily overloaded...") for otherwise valid
+# Hermes requests. Short retries tend to hammer the same overloaded window;
+# after a few normal retries, progressively widen the wait window. Keep the
+# cap interactive-friendly: a simple TUI message should fail visibly in minutes,
+# not sit silent for 20+ minutes.
+_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
+

 def jittered_backoff(
    attempt: int,
@@ -55,3 +64,66 @@ def jittered_backoff(
    jitter = rng.uniform(0, jitter_ratio * delay)

    return delay + jitter
+
+
+def _error_text(error: Any) -> str:
+    """Best-effort flattened provider error text for retry classification."""
+    parts = [
+        error,
+        getattr(error, "message", None),
+        getattr(error, "body", None),
+        getattr(error, "response", None),
+    ]
+    return " ".join(str(part) for part in parts if part is not None).lower()
+
+
+def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
+    """Return True for Z.AI Coding Plan transient overload 429s.
+
+    The coding-plan endpoint reports overload as HTTP 429 with body code 1305
+    and message "The service may be temporarily overloaded...". Treat only
+    that narrow shape specially so ordinary quota/billing 429s still fail fast
+    through the existing classifier.
+    """
+    base = (base_url or "").lower()
+    model_name = (model or "").lower()
+    status = getattr(error, "status_code", None)
+    text = _error_text(error)
+    return (
+        status == 429
+        and "api.z.ai/api/coding/paas/v4" in base
+        and "glm-5.2" in model_name
+        and ("1305" in text or "temporarily overloaded" in text)
+    )
+
+
+def adaptive_rate_limit_backoff(
+    attempt: int,
+    *,
+    base_url: str | None,
+    model: str | None,
+    error: Any,
+    default_wait: float,
+    short_attempts: int = 3,
+) -> tuple[float, str | None]:
+    """Provider-aware rate-limit backoff.
+
+    For most providers this returns ``default_wait`` unchanged. For Z.AI
+    Coding Plan GLM-5.2 overloads, keep the first ``short_attempts`` retries on
+    the normal short exponential schedule, then switch to progressively longer
+    base waits (30s → 60s → 90s → 120s, capped) plus up to 20% additive jitter.
+
+    ``attempt`` is 1-based, matching the retry loop's logged attempt number.
+    Returns ``(wait_seconds, reason_label)`` where ``reason_label`` is suitable
+    for status/log decoration when a provider-specific policy fired.
+    """
+    if not is_zai_coding_overload_error(base_url=base_url, model=model, error=error):
+        return default_wait, None
+    if attempt <= short_attempts:
+        return default_wait, "zai_coding_overload_short"
+
+    idx = min(attempt - short_attempts - 1, len(_ZAI_CODING_OVERLOAD_LONG_BACKOFF) - 1)
+    base_delay = _ZAI_CODING_OVERLOAD_LONG_BACKOFF[idx]
+    # A smaller jitter ratio keeps long waits readable while still avoiding
+    # synchronized retry storms across concurrent Hermes sessions.
+    return jittered_backoff(1, base_delay=base_delay, max_delay=base_delay, jitter_ratio=0.2), "zai_coding_overload_long"
--- a/agent/shell_hooks.py
+++ b/agent/shell_hooks.py
@@ -122,6 +122,8 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple

+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
+
 try:
    import fcntl  # POSIX only; Windows falls back to best-effort without flock.
 except ImportError:  # pragma: no cover
@@ -441,6 +443,7 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
        return result

    t0 = time.monotonic()
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        proc = subprocess.run(
            argv,
@@ -449,6 +452,7 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
            timeout=spec.timeout,
            text=True,
            shell=False,
+            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        result["timed_out"] = True
@@ -584,6 +588,17 @@ def _parse_response(event: str, stdout: str) -> Optional[Dict[str, Any]]:
            return {"action": "block", "message": _block_message(data.get("reason"), data.get("message"))}
        return None

+    if event == "pre_verify":
+        # "continue" (Hermes) / "block" (Claude-Code Stop: block the stop) both
+        # mean keep going; the message/reason is the follow-up for the model. A
+        # continue with no message is a no-op — let the turn finish.
+        action = str(data.get("action") or data.get("decision") or "").strip().lower()
+        if action in {"continue", "block"}:
+            message = data.get("message") or data.get("reason")
+            if isinstance(message, str) and message.strip():
+                return {"action": "continue", "message": message.strip()}
+        return None
+
    context = data.get("context")
    if isinstance(context, str) and context.strip():
        return {"context": context}
--- a/agent/skill_preprocessing.py
+++ b/agent/skill_preprocessing.py
@@ -5,6 +5,8 @@ import re
 import subprocess
 from pathlib import Path

+from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
+
 logger = logging.getLogger(__name__)

 # Matches ${HERMES_SKILL_DIR} / ${HERMES_SESSION_ID} tokens in SKILL.md.
@@ -66,6 +68,7 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
    Failures return a short ``[inline-shell error: ...]`` marker instead of
    raising, so one bad snippet can't wreck the whole skill message.
    """
+    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        completed = subprocess.run(
            ["bash", "-c", command],
@@ -75,6 +78,7 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
            timeout=max(1, int(timeout)),
            check=False,
            stdin=subprocess.DEVNULL,
+            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"[inline-shell timeout after {timeout}s: {command}]"
--- a/agent/skill_utils.py
+++ b/agent/skill_utils.py
@@ -507,6 +507,34 @@ def get_all_skills_dirs() -> List[Path]:
    return dirs


+def _resolve_for_skill_ownership(path) -> Path:
+    path_obj = path if isinstance(path, Path) else Path(str(path))
+    try:
+        return path_obj.expanduser().resolve()
+    except (OSError, RuntimeError):
+        return path_obj.expanduser().absolute()
+
+
+def is_external_skill_path(path) -> bool:
+    """Return True when ``path`` lives under a configured external skills dir.
+
+    ``skills.external_dirs`` are externally owned: Hermes can discover and view
+    their skills, and foreground user-directed tool calls may still edit them,
+    but autonomous lifecycle maintenance must treat them as read-only. This
+    helper centralizes the ownership boundary so curator/reporting/tool paths do
+    not each need to re-interpret the config.
+    """
+    candidate = _resolve_for_skill_ownership(path)
+    for root in get_external_skills_dirs():
+        resolved_root = _resolve_for_skill_ownership(root)
+        try:
+            candidate.relative_to(resolved_root)
+            return True
+        except ValueError:
+            continue
+    return False
+
+
 # ── Condition extraction ──────────────────────────────────────────────────


--- a/agent/thinking_timeout_guidance.py
+++ b/agent/thinking_timeout_guidance.py
@@ -0,0 +1,136 @@
+"""Thinking-timeout detection and user-facing guidance for reasoning models.
+
+When a known reasoning model (NVIDIA Nemotron 3 Ultra, OpenAI o1/o3,
+Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ, xAI Grok reasoning)
+hits a transport-layer error before the first content token arrives, the
+upstream proxy has almost certainly idle-killed a long thinking stream —
+not a true context overflow or a configuration error.  The user needs
+distinct guidance for this case:
+
+    "The model's thinking phase exceeded the upstream proxy's idle
+     timeout before the first content token arrived.  This is a known
+     issue with reasoning models behind cloud gateways (NVIDIA NIM,
+     OpenAI, Anthropic, DeepSeek).  Workarounds in priority order:
+     1. Set `providers.<provider>.models.<model>.stale_timeout_seconds: 900`
+        in `~/.hermes/config.yaml` to extend the per-call timeout...
+     2. Lower `reasoning_budget` or set `reasoning_effort: medium`...
+     3. Use a smaller / faster reasoning model..."
+
+The existing `_is_stream_drop` guidance at
+``agent/conversation_loop.py:3464-3486`` fires for large-file-write
+stream drops ("try execute_code with Python's open() for large files")
+which is the WRONG advice for the thinking-timeout case.  This module
+provides the detection and the message as standalone helpers so the
+detection logic is unit-testable without driving the full retry loop,
+and the message text can be regression-tested for spelling and accuracy.
+
+Part 2 of Fixes #52310.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+
+# Substring set that identifies a transport-layer failure on the
+# response stream.  Same shape as the existing
+# ``_SERVER_DISCONNECT_PATTERNS`` in ``agent/error_classifier.py:394``
+# but extended to also catch the OS-level error signature
+# (``broken pipe`` / ``errno 32``) that the upstream kill surfaces
+# to the OpenAI SDK wrapper.
+_THINKING_TIMEOUT_SUBSTRINGS: tuple[str, ...] = (
+    "broken pipe",
+    "errno 32",
+    "remote protocol",
+    "connection reset",
+    "connection lost",
+    "peer closed",
+    "server disconnected",
+)
+
+
+def is_thinking_timeout(classified: object, model: str, error_msg: str) -> bool:
+    """Return True when a reasoning model's thinking phase hit a transport kill.
+
+    Args:
+        classified: a :class:`agent.error_classifier.ClassifiedError` instance
+            (duck-typed here to avoid an import cycle in unit tests).
+        model: the model slug at failure time (e.g.
+            ``"nvidia/nemotron-3-ultra-550b-a55b"``).
+        error_msg: string representation of the underlying exception
+            (typically ``str(api_error)``); it is lowercased here before matching.
+
+    Returns True when ALL conditions hold:
+        1. ``classified.reason == FailoverReason.timeout`` (the classifier
+           override at ``agent/error_classifier.py:720-738`` ensures this
+           is the case for reasoning models even on large sessions).
+        2. ``api_error`` has no ``.status_code`` set (transport disconnect,
+           not an HTTP error).  NOT checked here: the caller must gate on it.
+        3. ``model`` is in the reasoning-model allowlist (reuses
+           ``agent.reasoning_timeouts.get_reasoning_stale_timeout_floor``).
+        4. ``error_msg`` contains one of the transport-kill substrings.
+
+    Non-reasoning models always return False.  Non-timeout reasons
+    (billing / rate_limit / auth / context_overflow / format_error)
+    always return False.  HTTP-status errors rely on the caller's gate.
+    """
+    # Import here (not at module top) to keep this helper cheap to
+    # import even from callers that don't need it.  ``agent.reasoning_timeouts``
+    # is small and dependency-free.
+    from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
+
+    # Condition 1: classifier says timeout.  Use a string/value check
+    # rather than importing FailoverReason so this module has zero
+    # import cycles from the error_classifier package.
+    reason = getattr(classified, "reason", None)
+    reason_value = getattr(reason, "value", None)
+    if reason_value != "timeout":
+        return False
+
+    # Condition 2: no HTTP status code (transport, not API error).
+    # Caller is expected to gate on ``getattr(api_error, "status_code", None) is None``
+    # before calling this helper; the surface here is just the post-gate
+    # boolean so the caller can pass an already-prepped error_msg.
+
+    # Condition 3: reasoning model allowlist.
+    if get_reasoning_stale_timeout_floor(model) is None:
+        return False
+
+    # Condition 4: transport-kill substring in the error message.
+    error_msg_lower = (error_msg or "").lower()
+    return any(p in error_msg_lower for p in _THINKING_TIMEOUT_SUBSTRINGS)
+
+
+def build_thinking_timeout_guidance(
+    provider: str, model: str, model_label: Optional[str] = None,
+) -> str:
+    """Return the user-facing guidance string appended to ``_final_response``.
+
+    Args:
+        provider: provider slug (e.g. ``"nvidia"``, ``"openai"``).
+        model: bare model slug the user would put in their config
+            (e.g. ``"nemotron-3-ultra-550b-a55b"`` if the user uses
+            NVIDIA direct, or the full ``"nvidia/nemotron-3-ultra-550b-a55b"``
+            if they go through an aggregator).  Used verbatim in the
+            config snippet so the user can copy-paste.
+        model_label: optional short label for the model name in the
+            prose (e.g. ``"Nemotron 3 Ultra"``).  Falls back to the
+            slug if not provided.
+    """
+    label = model_label or model
+    return (
+        "\n\nThe model's thinking phase exceeded the upstream proxy's "
+        "idle timeout before the first content token arrived. This is a "
+        f"known issue with reasoning models (like {label}) behind cloud "
+        "gateways (NVIDIA NIM, OpenAI, Anthropic, DeepSeek). Workarounds "
+        "in priority order:\n"
+        f"1. Set `providers.{provider}.models.{model}.stale_timeout_seconds: 900` "
+        "in `~/.hermes/config.yaml` to extend the per-call timeout. "
+        "(Hermes's built-in floor is 600s for known reasoning models — "
+        "if you still see this after raising, the upstream cap is even "
+        "shorter.)\n"
+        "2. Lower `reasoning_budget` or set `reasoning_effort: medium` on this "
+        "model if the provider supports it.\n"
+        "3. Use a smaller / faster reasoning model if the task doesn't "
+        "require deep thinking."
+    )
--- a/agent/tool_dispatch_helpers.py
+++ b/agent/tool_dispatch_helpers.py
@@ -11,7 +11,8 @@ Pure module-level utilities extracted from ``run_agent.py``:
  ``_append_subdir_hint_to_multimodal`` — envelope helpers for the
  ``{"_multimodal": True, "content": [...], "text_summary": ...}`` dict
  shape returned by tools like ``computer_use``.
-* ``_extract_file_mutation_targets`` / ``_extract_error_preview`` —
+* ``_extract_file_mutation_targets`` / ``_extract_landed_file_mutation_paths`` /
+  ``_extract_error_preview`` —
  per-turn file-mutation verifier inputs.
 * ``_trajectory_normalize_msg`` — strip image blobs from a message for
  trajectory saving.
@@ -269,6 +270,35 @@ def _extract_file_mutation_targets(tool_name: str, args: Dict[str, Any]) -> List
    return []


+def _extract_landed_file_mutation_paths(
+    tool_name: str,
+    args: Dict[str, Any],
+    result: Any,
+) -> List[str]:
+    """Return paths the tool result reports as written, else the arg-derived targets."""
+    targets = _extract_file_mutation_targets(tool_name, args)
+    if tool_name not in _FILE_MUTATING_TOOLS or not isinstance(result, str):
+        return targets
+    try:
+        data = json.loads(result.strip())
+    except Exception:
+        return targets
+    if not isinstance(data, dict):
+        return targets
+
+    files = data.get("files_modified")
+    if isinstance(files, list):
+        landed = [str(p) for p in files if p]
+        if landed:
+            return landed
+
+    resolved = data.get("resolved_path")
+    if resolved:
+        return [str(resolved)]
+
+    return targets
+
+
 def _extract_error_preview(result: Any, max_len: int = 180) -> str:
    """Pull a one-line error summary out of a tool result for footer display."""
    text = _multimodal_text_summary(result) if result is not None else ""
@@ -411,6 +441,7 @@ __all__ = [
    "_multimodal_text_summary",
    "_append_subdir_hint_to_multimodal",
    "_extract_file_mutation_targets",
+    "_extract_landed_file_mutation_paths",
    "_extract_error_preview",
    "_trajectory_normalize_msg",
    "make_tool_result_message",
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@@ -24,8 +24,10 @@ from typing import Any, Optional
 from agent.display import (
    KawaiiSpinner,
    build_tool_preview as _build_tool_preview,
+    build_tool_label as _build_tool_label,
    get_cute_tool_message as _get_cute_tool_message_impl,
    get_tool_emoji as _get_tool_emoji,
+    redact_tool_args_for_display as _redact_tool_args_for_display,
    _detect_tool_failure,
 )
 from agent.tool_guardrails import ToolGuardrailDecision
@@ -69,12 +71,35 @@ def _budget_for_agent(agent) -> BudgetConfig:
 _MAX_TOOL_WORKERS = 8


+def _flush_session_db_after_tool_progress(
+    agent,
+    messages: list,
+    *,
+    stage: str,
+) -> None:
+    """Best-effort incremental SessionDB flush for tool-call progress.
+
+    Tool execution can perform side effects that terminate or restart the
+    current Hermes process before the normal turn-end persistence path runs.
+    Flush the already-appended assistant/tool messages immediately so the
+    transcript survives destructive-but-valid tool calls.
+    """
+    try:
+        agent._flush_messages_to_session_db(messages)
+    except Exception as exc:
+        logger.warning("Incremental tool-call persistence failed after %s: %s", stage, exc)
+
+
 def _ra():
    """Lazy reference to ``run_agent`` so patches like ``run_agent._set_interrupt`` work."""
    import run_agent
    return run_agent


+def _is_interpreter_shutdown_submit_error(exc: RuntimeError) -> bool:
+    return "cannot schedule new futures after interpreter shutdown" in str(exc)
+
+
 def _emit_terminal_post_tool_call(
    agent,
    *,
@@ -279,6 +304,11 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
                f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
                tc.id,
            ))
+            _flush_session_db_after_tool_progress(
+                agent,
+                messages,
+                stage=f"cancelled tool result {tc.function.name}",
+            )
        return

    # ── Parse args + pre-execution bookkeeping ───────────────────────
@@ -441,10 +471,11 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
    if not agent.quiet_mode and getattr(agent, "tool_progress_mode", "all") != "off":
        print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
        for i, (tc, name, args, middleware_trace, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
-            args_str = json.dumps(args, ensure_ascii=False)
+            display_args = _redact_tool_args_for_display(name, args) or args
+            args_str = json.dumps(display_args, ensure_ascii=False)
            if agent.verbose_logging:
-                print(f"  📞 Tool {i}: {name}({list(args.keys())})")
-                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
+                print(f"  📞 Tool {i}: {name}({list(display_args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(display_args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
@@ -454,8 +485,9 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            continue
        if agent.tool_progress_callback:
            try:
-                preview = _build_tool_preview(name, args)
-                agent.tool_progress_callback("tool.started", name, preview, args)
+                display_args = _redact_tool_args_for_display(name, args) or args
+                preview = _build_tool_preview(name, display_args)
+                agent.tool_progress_callback("tool.started", name, preview, display_args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

@@ -464,7 +496,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            continue
        if agent.tool_start_callback:
            try:
-                agent.tool_start_callback(tc.id, name, args)
+                display_args = _redact_tool_args_for_display(name, args) or args
+                agent.tool_start_callback(tc.id, name, display_args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

@@ -581,13 +614,40 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
        if runnable_calls:
            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                for i, tc, name, args in runnable_calls:
+                for submit_index, (i, tc, name, args) in enumerate(runnable_calls):
                    # Propagate the agent turn's ContextVars (e.g.
                    # _approval_session_key) AND thread-local approval/sudo
                    # callbacks into the worker thread; clears callbacks on exit.
-                    f = executor.submit(
-                        propagate_context_to_thread(_run_tool), i, tc, name, args, parsed_calls[i][3]
-                    )
+                    try:
+                        f = executor.submit(
+                            propagate_context_to_thread(_run_tool), i, tc, name, args, parsed_calls[i][3]
+                        )
+                    except RuntimeError as submit_error:
+                        if not _is_interpreter_shutdown_submit_error(submit_error):
+                            raise
+                        skipped_calls = runnable_calls[submit_index:]
+                        logger.warning(
+                            "interpreter shutdown while scheduling concurrent tools; "
+                            "skipping %d unsubmitted tool(s)",
+                            len(skipped_calls),
+                        )
+                        for skipped_i, _tc, skipped_name, skipped_args in skipped_calls:
+                            if results[skipped_i] is None:
+                                middleware_trace = parsed_calls[skipped_i][3]
+                                result = (
+                                    f"Error executing tool '{skipped_name}': "
+                                    "Python interpreter is shutting down; tool was not started"
+                                )
+                                results[skipped_i] = (
+                                    skipped_name,
+                                    skipped_args,
+                                    result,
+                                    0.0,
+                                    True,
+                                    False,
+                                    middleware_trace,
+                                )
+                        break
                    futures.append(f)

                # Wait for all to complete with periodic heartbeats so the
@@ -737,7 +797,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe

        if not blocked and agent.tool_complete_callback:
            try:
-                agent.tool_complete_callback(tc.id, name, args, function_result)
+                display_args = _redact_tool_args_for_display(name, args) or args
+                agent.tool_complete_callback(tc.id, name, display_args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

@@ -768,6 +829,11 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
        # String results pass through unchanged.
        _tool_content = agent._tool_result_content_for_active_model(name, function_result)
        messages.append(make_tool_result_message(name, _tool_content, tc.id))
+        _flush_session_db_after_tool_progress(
+            agent,
+            messages,
+            stage=f"tool result {name}",
+        )

        # ── Per-tool /steer drain ───────────────────────────────────
        # Same as the sequential path: drain between each collected
@@ -803,13 +869,16 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
            for skipped_tc in remaining_calls:
                skipped_name = skipped_tc.function.name
-                skip_msg = {
-                    "role": "tool",
-                    "name": skipped_name,
-                    "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
-                    "tool_call_id": skipped_tc.id,
-                }
-                messages.append(skip_msg)
+                messages.append(make_tool_result_message(
+                    skipped_name,
+                    f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
+                    skipped_tc.id,
+                ))
+                _flush_session_db_after_tool_progress(
+                    agent,
+                    messages,
+                    stage=f"cancelled tool result {skipped_name}",
+                )
            break

        function_name = tool_call.function.name
@@ -891,10 +960,11 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            agent._iters_since_skill = 0

        if not agent.quiet_mode and getattr(agent, "tool_progress_mode", "all") != "off":
-            args_str = json.dumps(function_args, ensure_ascii=False)
+            display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+            args_str = json.dumps(display_args, ensure_ascii=False)
            if agent.verbose_logging:
-                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
-                print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False)))
+                print(f"  📞 Tool {i}: {function_name}({list(display_args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(display_args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
@@ -915,14 +985,16 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe

        if not _execution_blocked and agent.tool_progress_callback:
            try:
-                preview = _build_tool_preview(function_name, function_args)
-                agent.tool_progress_callback("tool.started", function_name, preview, function_args)
+                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+                preview = _build_tool_preview(function_name, display_args)
+                agent.tool_progress_callback("tool.started", function_name, preview, display_args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

        if not _execution_blocked and agent.tool_start_callback:
            try:
-                agent.tool_start_callback(tool_call.id, function_name, function_args)
+                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+                agent.tool_start_callback(tool_call.id, function_name, display_args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

@@ -1152,7 +1224,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                preview = _build_tool_preview(function_name, function_args) or function_name
+                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+                preview = _build_tool_label(function_name, display_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _ce_result = None
@@ -1185,7 +1258,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                preview = _build_tool_preview(function_name, function_args) or function_name
+                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+                preview = _build_tool_label(function_name, display_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _mem_result = None
@@ -1216,7 +1290,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                preview = _build_tool_preview(function_name, function_args) or function_name
+                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+                preview = _build_tool_label(function_name, display_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _spinner_result = None
@@ -1378,7 +1453,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe

        if not _execution_blocked and agent.tool_complete_callback:
            try:
-                agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result)
+                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
+                agent.tool_complete_callback(tool_call.id, function_name, display_args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

@@ -1402,6 +1478,11 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
        # (see parallel path for rationale). String results pass through.
        _tool_content = agent._tool_result_content_for_active_model(function_name, function_result)
        messages.append(make_tool_result_message(function_name, _tool_content, tool_call.id))
+        _flush_session_db_after_tool_progress(
+            agent,
+            messages,
+            stage=f"tool result {function_name}",
+        )

        # ── Per-tool /steer drain ───────────────────────────────────
        # Drain pending steer BETWEEN individual tool calls so the
@@ -1428,6 +1509,11 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                    f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
                    skipped_tc.id,
                ))
+                _flush_session_db_after_tool_progress(
+                    agent,
+                    messages,
+                    stage=f"skipped tool result {skipped_name}",
+                )
            break

        if agent.tool_delay > 0 and i < len(assistant_message.tool_calls):
--- a/agent/transports/codex.py
+++ b/agent/transports/codex.py
@@ -5,12 +5,47 @@ This transport owns format conversion and normalization — NOT client lifecycle
 streaming, or the _run_codex_stream() call path.
 """

+import hashlib
+import json
 from typing import Any, Dict, List, Optional

 from agent.transports.base import ProviderTransport
 from agent.transports.types import NormalizedResponse, ToolCall


+def _content_cache_key(instructions: str, tools: Optional[List[Dict[str, Any]]]) -> Optional[str]:
+    """Derive a prompt cache key by hashing the static request prefix.
+
+    The key has the form ``pck_<first 24 hex chars of sha256>`` and is computed
+    over the instructions followed by the name-sorted tool schemas. ``None``
+    is returned when both inputs are empty, i.e. there is nothing to hash.
+
+    Why hash content instead of reusing session_id: recurring cron jobs build
+    session_id as ``cron_<id>_<timestamp>``, so keying on it made every fire
+    cache-cold even though the static prefix (identity + tools) never changed.
+    The key is a routing hint only — never a correctness boundary — so
+    requests sharing a system prompt and tool set intentionally land in the
+    same warm prefix bucket. Tools are sorted by name so the hash does not
+    depend on the order in which they were registered.
+    """
+    if not (instructions or tools):
+        return None
+    # Non-dict entries are dropped; the sort key falls back to "type", then "".
+    ordered = sorted(
+        (tool for tool in tools or () if isinstance(tool, dict)),
+        key=lambda tool: str(tool.get("name") or tool.get("type") or ""),
+    )
+    serialized = (
+        json.dumps(ordered, sort_keys=True, ensure_ascii=False, separators=(",", ":"))
+        if tools
+        else ""
+    )
+    # The \x00 separator keeps "instructions ending in the tool JSON" distinct
+    # from "instructions containing that JSON with no tools at all".
+    payload = f"{instructions or ''}\x00{serialized}".encode("utf-8", errors="replace")
+    return f"pck_{hashlib.sha256(payload).hexdigest()[:24]}"
+
+
 class ResponsesApiTransport(ProviderTransport):
    """Transport for api_mode='codex_responses'.

@@ -71,7 +106,10 @@ class ResponsesApiTransport(ProviderTransport):
        params:
            instructions: str — system prompt (extracted from messages[0] if not given)
            reasoning_config: dict | None — {effort, enabled}
-            session_id: str | None — used for prompt_cache_key + xAI conv header
+            session_id: str | None — transcript/session id; drives the xAI
+                x-grok-conv-id header and the Codex cache-scope headers, and is
+                the fallback prompt_cache_key when there is no static prefix to
+                content-address
            max_tokens: int | None — max_output_tokens
            timeout: float | None — per-request timeout forwarded to the SDK
            request_overrides: dict | None — extra kwargs merged in
@@ -212,10 +250,17 @@ class ResponsesApiTransport(ProviderTransport):
            kwargs["parallel_tool_calls"] = True

        session_id = params.get("session_id")
+        # prompt_cache_key is content-addressed from the static prefix
+        # (instructions + tools), NOT session_id — recurring cron jobs carry a
+        # per-fire timestamp in session_id (cron_<id>_<ts>) that made every run
+        # cache-cold. session_id is left untouched for transcript isolation and
+        # the cache-scope routing headers below. Falls back to session_id when
+        # there is no static content to hash.
+        cache_key = _content_cache_key(instructions, response_tools) or session_id
        # xAI Responses takes prompt_cache_key in extra_body (set further
        # down); GitHub Models opts out of cache-key routing entirely.
-        if not is_github_responses and not is_xai_responses and session_id:
-            kwargs["prompt_cache_key"] = session_id
+        if not is_github_responses and not is_xai_responses and cache_key:
+            kwargs["prompt_cache_key"] = cache_key

        if reasoning_enabled and is_xai_responses:
            from agent.model_metadata import grok_supports_reasoning_effort
@@ -326,7 +371,7 @@ class ResponsesApiTransport(ProviderTransport):
            merged_extra_body: Dict[str, Any] = {}
            if isinstance(existing_extra_body, dict):
                merged_extra_body.update(existing_extra_body)
-            merged_extra_body.setdefault("prompt_cache_key", session_id)
+            merged_extra_body.setdefault("prompt_cache_key", cache_key)
            kwargs["extra_body"] = merged_extra_body

        return kwargs
--- a/agent/turn_context.py
+++ b/agent/turn_context.py
@@ -28,8 +28,12 @@ import uuid
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional

+from agent.conversation_compression import conversation_history_after_compression
 from agent.iteration_budget import IterationBudget
-from agent.model_metadata import estimate_request_tokens_rough
+from agent.model_metadata import (
+    estimate_messages_tokens_rough,
+    estimate_request_tokens_rough,
+)

 logger = logging.getLogger(__name__)

@@ -57,6 +61,34 @@ def _compression_made_progress(
    return orig_tokens > 0 and new_tokens < orig_tokens * 0.95


+def _should_run_preflight_estimate(
+    messages: List[Dict[str, Any]],
+    protect_first_n: int,
+    protect_last_n: int,
+    threshold_tokens: int,
+) -> bool:
+    """Decide whether the costly full preflight token estimate is worth running.
+
+    The answer is ``True`` if either condition holds:
+      (a) there are more messages than the protected head/tail ranges cover
+          (the original, count-only gate); or
+      (b) a cheap char-based estimate of ``messages`` already reaches
+          ``threshold_tokens`` — the few-but-huge conversation from issue
+          #27405, which the count-only gate let through uncompressed until
+          the turn failed with a hard context-overflow error.
+
+    For (b) the shared ``estimate_messages_tokens_rough`` estimator is used
+    so a single large base64 image isn't mistaken for ~250K tokens. It
+    deliberately undercounts relative to the full request estimate (no
+    system prompt, no tool schemas): it is only a *hint* for whether to pay
+    for the authoritative ``estimate_request_tokens_rough``, which together
+    with ``should_compress`` makes the real decision.
+    """
+    protected_span = protect_first_n + protect_last_n + 1
+    exceeds_protected = len(messages) > protected_span
+    return exceeds_protected or estimate_messages_tokens_rough(messages) >= threshold_tokens
+
+
@dataclass
 class TurnContext:
    """Values produced by the turn prologue and consumed by the turn loop."""
@@ -111,7 +143,13 @@ def build_turn_context(
    # Guard stdio against OSError from broken pipes (systemd/headless/daemon).
    install_safe_stdio()

-    agent._ensure_db_session()
+    # NOTE: the DB session row is created later, AFTER the system prompt is
+    # restored/built (see _ensure_db_session() below the system-prompt block).
+    # Creating it here — before _cached_system_prompt is populated — inserts a
+    # row with system_prompt=NULL on a fresh API/gateway agent that carries
+    # client-managed history, which then trips the "stored system prompt is
+    # null; rebuilding from scratch" warning and a needless first-turn prefix
+    # cache miss. (Issue #45499.)

    # Tell auxiliary_client what the live main provider/model are for this turn.
    try:
@@ -278,6 +316,11 @@ def build_turn_context(

    active_system_prompt = agent._cached_system_prompt

+    # Create the DB session row now that _cached_system_prompt is populated, so
+    # the persisted snapshot is written non-NULL on the first turn (Issue
+    # #45499). Idempotent: _ensure_db_session() no-ops once the row exists.
+    agent._ensure_db_session()
+
    # Crash-resilience: persist the inbound user turn as soon as the session row exists.
    try:
        agent._persist_session(messages, conversation_history)
@@ -289,10 +332,14 @@ def build_turn_context(
        )

    # ── Preflight context compression ──
-    if (
-        agent.compression_enabled
-        and len(messages) > agent.context_compressor.protect_first_n
-                            + agent.context_compressor.protect_last_n + 1
+    # Gate the (expensive) full token estimate behind a cheap pre-check.
+    # See ``_should_run_preflight_estimate`` for the OR semantics that fix
+    # issue #27405 (a few very large messages slipping past the count gate).
+    if agent.compression_enabled and _should_run_preflight_estimate(
+        messages,
+        agent.context_compressor.protect_first_n,
+        agent.context_compressor.protect_last_n,
+        agent.context_compressor.threshold_tokens,
    ):
        _preflight_tokens = estimate_request_tokens_rough(
            messages,
@@ -354,7 +401,9 @@ def build_turn_context(
                    _orig_len, len(messages), _orig_tokens, _preflight_tokens
                ):
                    break  # Cannot compress further: neither rows nor tokens moved
-                conversation_history = None
+                conversation_history = conversation_history_after_compression(
+                    agent, messages
+                )
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
                agent._last_content_with_tools = None
@@ -392,6 +441,9 @@ def build_turn_context(

    # Per-turn file-mutation verifier state.
    agent._turn_failed_file_mutations = {}
+    agent._turn_file_mutation_paths = set()
+    agent._verification_stop_nudges = 0
+    agent._pre_verify_nudges = 0

    # Record the execution thread so interrupt()/clear_interrupt() can scope
    # the tool-level interrupt signal to THIS agent's thread only.
--- a/agent/turn_finalizer.py
+++ b/agent/turn_finalizer.py
@@ -166,6 +166,25 @@ def finalize_turn(
    # same empty-response loop again.
    try:
        agent._drop_trailing_empty_response_scaffolding(messages)
+
+        # When the turn was interrupted and the last message is a tool
+        # result, append a synthetic assistant message to close the
+        # tool-call sequence. Without this, the session persists a
+        # ``tool → user`` alternation that strict providers (Gemini,
+        # Claude) reject, causing them to hallucinate a continuation of
+        # the user's message on the next turn (#48879).
+        #
+        # ``_drop_trailing_empty_response_scaffolding`` only rewinds the
+        # tool tail when an empty-response scaffolding flag is present; a
+        # clean ``/stop`` interrupt after a successful tool sets no such
+        # flag, so the tool result survives as the tail and we close it
+        # here instead. On an interrupt ``final_response`` is typically
+        # empty, so fall back to an explicit placeholder rather than
+        # persisting an empty-content assistant turn.
+        if interrupted:
+            from agent.message_sanitization import close_interrupted_tool_sequence
+            close_interrupted_tool_sequence(messages, final_response)
+
        agent._persist_session(messages, conversation_history)
    except Exception as _persist_err:
        _cleanup_errors.append(f"persist_session: {_persist_err}")
@@ -270,7 +289,14 @@ def finalize_turn(
                    and len(_stripped) <= 24
                    and _stripped[-1:] not in {".", "!", "?", "。", "！", "？", "`", ")"}
                )
-                if _is_empty_terminal or _is_partial_fragment:
+                _is_partial_stream_recovery = (
+                    str(_turn_exit_reason) == "partial_stream_recovery"
+                )
+                if (
+                    _is_empty_terminal
+                    or _is_partial_fragment
+                    or _is_partial_stream_recovery
+                ):
                    _explanation = agent._format_turn_completion_explanation(
                        _turn_exit_reason
                    )
--- a/agent/turn_retry_state.py
+++ b/agent/turn_retry_state.py
@@ -67,6 +67,11 @@ class TurnRetryState:
    # ── Restart signals (read by the outer loop after the attempt) ───────
    restart_with_compressed_messages: bool = False
    restart_with_length_continuation: bool = False
+    # Set when a content-filter stream stall (e.g. MiniMax "new_sensitive")
+    # has been escalated to the fallback chain: the partial-stream content
+    # was rolled back off ``messages`` and the loop should re-issue the API
+    # call against the newly-activated provider (#32421).
+    restart_with_rebuilt_messages: bool = False

    def __iter__(self):
        # Convenience for debugging / tests: iterate (name, value) pairs.
--- a/agent/verification_evidence.py
+++ b/agent/verification_evidence.py
@@ -0,0 +1,618 @@
+"""Coding verification evidence ledger.
+
+This module records what the agent actually proved while working in a code
+workspace. It is deliberately passive: it never decides to run a suite, never
+blocks completion, and never upgrades targeted checks into "repo green".
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import shlex
+import sqlite3
+import tempfile
+import threading
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+from hermes_constants import get_hermes_home
+
+
+_DB_LOCK = threading.Lock()
+_MAX_OUTPUT_SUMMARY_CHARS = 2000
+_MAX_EVIDENCE_AGE_DAYS = 30
+_MAX_EVENTS_PER_SESSION_ROOT = 100
+_MAX_TOTAL_UNREFERENCED_EVENTS = 10_000
+_AD_HOC_SCRIPT_NAME_PREFIXES = ("hermes-verify-", "hermes-ad-hoc-")
+_VERIFY_SCHEMA_VERSION = 1
+_SHELL_SPLIT_RE = re.compile(r"\s*(?:&&|\|\||;)\s*")
+
+
+@dataclass(frozen=True)
+class VerificationEvidence:
+    """A classified command result worth recording."""
+
+    command: str
+    canonical_command: str
+    kind: str
+    scope: str
+    status: str
+    exit_code: int
+    cwd: str
+    root: str
+    session_id: str
+    output_summary: str = ""
+
+
+def _utc_now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _retention_cutoff() -> str:
+    return (datetime.now(timezone.utc) - timedelta(days=_MAX_EVIDENCE_AGE_DAYS)).isoformat()
+
+
+def _db_path() -> Path:
+    return get_hermes_home() / "verification_evidence.db"
+
+
+def _connect() -> sqlite3.Connection:
+    path = _db_path()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(path)
+    conn.execute("PRAGMA journal_mode=WAL")
+    conn.execute("PRAGMA busy_timeout=5000")
+    conn.row_factory = sqlite3.Row
+    _ensure_schema(conn)
+    return conn
+
+
+def _ensure_schema(conn: sqlite3.Connection) -> None:
+    conn.execute(
+        """
+        CREATE TABLE IF NOT EXISTS meta (
+            key TEXT PRIMARY KEY,
+            value TEXT NOT NULL
+        )
+        """
+    )
+    conn.execute(
+        """
+        CREATE TABLE IF NOT EXISTS verification_events (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            created_at TEXT NOT NULL,
+            session_id TEXT NOT NULL,
+            cwd TEXT NOT NULL,
+            root TEXT NOT NULL,
+            command TEXT NOT NULL,
+            canonical_command TEXT NOT NULL,
+            kind TEXT NOT NULL,
+            scope TEXT NOT NULL,
+            status TEXT NOT NULL,
+            exit_code INTEGER NOT NULL,
+            output_summary TEXT NOT NULL
+        )
+        """
+    )
+    conn.execute(
+        """
+        CREATE TABLE IF NOT EXISTS verification_state (
+            session_id TEXT NOT NULL,
+            root TEXT NOT NULL,
+            last_event_id INTEGER,
+            last_edit_at TEXT,
+            changed_paths_json TEXT NOT NULL DEFAULT '[]',
+            PRIMARY KEY (session_id, root)
+        )
+        """
+    )
+    conn.execute(
+        """
+        CREATE INDEX IF NOT EXISTS idx_verification_events_session_root
+        ON verification_events(session_id, root, id DESC)
+        """
+    )
+    conn.execute(
+        "INSERT OR REPLACE INTO meta(key, value) VALUES ('schema_version', ?)",
+        (str(_VERIFY_SCHEMA_VERSION),),
+    )
+    conn.commit()
+
+
+def _split_segment_tokens(command: str) -> list[list[str]]:
+    """Split on && / || / ; and shlex-tokenize each part, skipping bad ones."""
+    parsed: list[list[str]] = []
+    for piece in _SHELL_SPLIT_RE.split(command.strip()):
+        try:
+            words = shlex.split(piece) if piece else []
+        except ValueError:
+            # Unbalanced quotes etc.: ignore this segment only.
+            words = []
+        if words:
+            parsed.append(words)
+    return parsed
+
+
+def _clean_token(token: str) -> str:
+    token = token.strip()
+    while token.startswith("./"):
+        token = token[2:]
+    return token
+
+
+def _canonical_tokens(canonical: str) -> list[str]:
+    try:
+        return [_clean_token(t) for t in shlex.split(canonical) if t]
+    except ValueError:
+        return []
+
+
+def _find_subsequence(tokens: list[str], needle: list[str]) -> Optional[int]:
+    # Index of the first window of cleaned ``tokens`` equal to ``needle``.
+    if not tokens or not needle or len(needle) > len(tokens):
+        return None
+    width = len(needle)
+    cleaned = [_clean_token(tok) for tok in tokens]
+    starts = range(len(cleaned) - width + 1)
+    return next((i for i in starts if cleaned[i:i + width] == needle), None)
+
+
+def _strip_command_prefix(tokens: list[str]) -> list[str]:
+    """Drop leading wrappers (env/command/time/noglob, VAR=value) in any order."""
+    remaining = list(tokens)
+    while remaining:
+        head = remaining[0]
+        is_assignment = "=" in head and not head.startswith("-")
+        if head not in {"env", "command", "time", "noglob"} and not is_assignment:
+            break  # first real command token reached
+        remaining = remaining[1:]
+    return remaining
+
+
+def _equivalent_needles(needle: list[str]) -> list[list[str]]:
+    """Return command spellings equivalent to the detected canonical command."""
+    candidates = [needle]
+    if len(needle) >= 3 and needle[1] == "run":
+        package_manager = needle[0]
+        script_name = needle[2]
+        if package_manager in {"npm", "pnpm", "yarn", "bun"}:
+            candidates.append([package_manager, script_name])
+    if len(needle) == 1 and "/" in needle[0]:
+        candidates.extend([["bash", needle[0]], ["sh", needle[0]]])
+    if needle == ["pytest"]:
+        candidates.extend(
+            [
+                ["python", "-m", "pytest"],
+                ["python3", "-m", "pytest"],
+                ["uv", "run", "pytest"],
+                ["poetry", "run", "pytest"],
+                ["pipenv", "run", "pytest"],
+            ]
+        )
+    return candidates
+
+
+def _find_canonical_match(command: str, canonical_commands: list[str]) -> Optional[tuple[str, list[str]]]:
+    """Return ``(canonical, trailing_args)`` for the first detected command."""
+
+    segments = _split_segment_tokens(command)
+    for canonical in canonical_commands:
+        needle = _canonical_tokens(canonical)
+        if not needle:
+            continue
+        for tokens in segments:
+            candidate_tokens = _strip_command_prefix(tokens)
+            for candidate in _equivalent_needles(needle):
+                if candidate_tokens[:len(candidate)] == candidate:
+                    return canonical, candidate_tokens[len(candidate):]
+    return None
+
+
+def _kind_for_command(canonical: str) -> str:
+    """Map a canonical command to a kind; "ty" only counts as a whole word."""
+    lowered = canonical.lower()
+    if any(word in lowered for word in ("lint", "eslint", "ruff")):
+        return "lint"
+    words = re.split(r"[^a-z0-9]+", lowered)  # "security" must not match "ty"
+    if "ty" in words or any(w in lowered for w in ("type", "tsc", "mypy", "pyright")):
+        return "typecheck"
+    if "build" in lowered:
+        return "build"
+    if "fmt" in lowered or "format" in lowered:
+        return "format"
+    return "check" if "check" in lowered and "test" not in lowered else "test"
+
+
+def _looks_like_target(arg: str) -> bool:
+    if not arg or arg.startswith("-") or "=" in arg:
+        return False
+    return (
+        "/" in arg
+        or "\\" in arg
+        or "::" in arg
+        or arg.endswith((".py", ".js", ".jsx", ".ts", ".tsx", ".rs", ".go", ".java"))
+        or arg.startswith(("test_", "tests", "spec", "__tests__"))
+    )
+
+
+def _scope_for_args(args: list[str]) -> str:
+    return "targeted" if any(_looks_like_target(arg) for arg in args) else "full"
+
+
+def _is_under_temp_dir(token: str) -> bool:
+    if not token or token.startswith("-"):
+        return False
+    try:
+        path = Path(token).expanduser()
+        if not path.is_absolute():
+            return False
+        resolved = path.resolve()
+        temp_root = Path(tempfile.gettempdir()).resolve()
+        return resolved == temp_root or temp_root in resolved.parents
+    except Exception:
+        return False
+
+
+def _is_under_root(token: str, root: str | Path | None) -> bool:
+    if not root:
+        return False
+    try:
+        path = Path(token).expanduser().resolve()
+        root_path = Path(root).expanduser().resolve()
+        return path == root_path or root_path in path.parents
+    except Exception:
+        return False
+
+
+def _is_temp_script_path(token: str, root: str | Path | None) -> bool:
+    try:
+        name = Path(token).expanduser().name
+    except Exception:
+        return False
+    return (
+        name.startswith(_AD_HOC_SCRIPT_NAME_PREFIXES)
+        and _is_under_temp_dir(token)
+        and not _is_under_root(token, root)
+    )
+
+
+def _ad_hoc_script_args(tokens: list[str], root: str | Path | None) -> Optional[list[str]]:
+    candidate_tokens = _strip_command_prefix(tokens)
+    if not candidate_tokens:
+        return None
+    command = candidate_tokens[0]
+    if _is_temp_script_path(command, root):
+        return candidate_tokens[1:]
+    if command in {"python", "python3", "node", "bash", "sh", "ruby", "perl"}:
+        for idx, token in enumerate(candidate_tokens[1:], start=1):
+            if token == "--":
+                continue
+            if _is_temp_script_path(token, root):
+                return candidate_tokens[idx + 1:]
+            if not token.startswith("-"):
+                return None
+    return None
+
+
+def _find_ad_hoc_match(command: str, root: str | Path | None) -> Optional[list[str]]:
+    for tokens in _split_segment_tokens(command):
+        trailing_args = _ad_hoc_script_args(tokens, root)
+        if trailing_args is not None:
+            return trailing_args
+    return None
+
+
+def _summarize_output(output: str) -> str:
+    # Short output is stored whole; long output keeps the first third and the
+    # remaining budget from the end, joined by an omitted-length marker.
+    stripped = (output or "").strip()
+    budget = _MAX_OUTPUT_SUMMARY_CHARS
+    overflow = len(stripped) - budget
+    if overflow <= 0:
+        return stripped
+    head_len = budget // 3
+    marker = f"\n... [{overflow} chars omitted] ...\n"
+    return stripped[:head_len] + marker + stripped[-(budget - head_len):]
+
+
+def _prune_old_events(conn: sqlite3.Connection, *, session_id: str, root: str) -> None:
+    """Bound ledger growth without deleting the current state pointer."""
+    cutoff = _retention_cutoff()
+    conn.execute(
+        """
+        DELETE FROM verification_events
+        WHERE session_id = ?
+          AND root = ?
+          AND id NOT IN (
+              SELECT id FROM verification_events
+              WHERE session_id = ? AND root = ?
+              ORDER BY id DESC
+              LIMIT ?
+          )
+        """,
+        (session_id, root, session_id, root, _MAX_EVENTS_PER_SESSION_ROOT),
+    )
+    conn.execute(
+        """
+        DELETE FROM verification_state
+        WHERE (
+            last_edit_at IS NOT NULL
+            AND last_edit_at < ?
+        )
+        OR (
+            last_edit_at IS NULL
+            AND last_event_id IN (
+                SELECT id FROM verification_events
+                WHERE created_at < ?
+            )
+        )
+        """,
+        (cutoff, cutoff),
+    )
+    conn.execute(
+        """
+        DELETE FROM verification_events
+        WHERE created_at < ?
+          AND id NOT IN (
+              SELECT last_event_id FROM verification_state
+              WHERE last_event_id IS NOT NULL
+          )
+        """,
+        (cutoff,),
+    )
+    conn.execute(
+        """
+        DELETE FROM verification_events
+        WHERE id NOT IN (
+            SELECT id FROM verification_events
+            ORDER BY id DESC
+            LIMIT ?
+        )
+          AND id NOT IN (
+              SELECT last_event_id FROM verification_state
+              WHERE last_event_id IS NOT NULL
+          )
+        """,
+        (_MAX_TOTAL_UNREFERENCED_EVENTS,),
+    )
+
+
+def classify_verification_command(
+    command: str,
+    *,
+    cwd: str | Path | None = None,
+    session_id: str | None = None,
+    exit_code: int = 0,
+    output: str = "",
+) -> Optional[VerificationEvidence]:
+    """Classify a terminal command as verification evidence, if applicable."""
+
+    if not command or not isinstance(command, str):
+        return None
+    try:
+        from agent.coding_context import project_facts_for
+
+        facts = project_facts_for(cwd)
+    except Exception:
+        facts = None
+    if not facts:
+        return None
+
+    verify_commands = list(facts.get("verifyCommands") or [])
+    match = _find_canonical_match(command, verify_commands)
+    is_ad_hoc = False
+    if match is None and not verify_commands:
+        ad_hoc_args = _find_ad_hoc_match(command, facts.get("root"))
+        if ad_hoc_args is not None:
+            match = ("ad-hoc verification script", ad_hoc_args)
+            is_ad_hoc = True
+    if match is None:
+        return None
+
+    canonical, trailing_args = match
+    return VerificationEvidence(
+        command=command,
+        canonical_command=canonical,
+        kind="ad_hoc" if is_ad_hoc else _kind_for_command(canonical),
+        scope="targeted" if is_ad_hoc else _scope_for_args(trailing_args),
+        status="passed" if int(exit_code) == 0 else "failed",
+        exit_code=int(exit_code),
+        cwd=str(Path(cwd or ".").resolve()),
+        root=str(facts.get("root") or Path(cwd or ".").resolve()),
+        session_id=str(session_id or "default"),
+        output_summary=_summarize_output(output),
+    )
+
+
+def record_terminal_result(
+    *,
+    command: str,
+    cwd: str | Path | None,
+    session_id: str | None,
+    exit_code: int,
+    output: str = "",
+) -> Optional[dict[str, Any]]:
+    """Record a foreground terminal result when it is verification evidence."""
+
+    evidence = classify_verification_command(
+        command, cwd=cwd, session_id=session_id, exit_code=exit_code, output=output
+    )
+    if evidence is None:
+        return None
+
+    created_at = _utc_now()
+    with _DB_LOCK:
+        # sqlite3's "with conn" only commits/rolls back, so close explicitly.
+        conn = _connect()
+        try:
+            cur = conn.execute(
+                """
+                INSERT INTO verification_events(
+                    created_at, session_id, cwd, root, command, canonical_command,
+                    kind, scope, status, exit_code, output_summary
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    created_at,
+                    evidence.session_id,
+                    evidence.cwd,
+                    evidence.root,
+                    evidence.command,
+                    evidence.canonical_command,
+                    evidence.kind,
+                    evidence.scope,
+                    evidence.status,
+                    evidence.exit_code,
+                    evidence.output_summary,
+                ),
+            )
+            if cur.lastrowid is None:
+                raise RuntimeError("verification event insert did not return an id")
+            event_id = int(cur.lastrowid)
+            conn.execute(
+                """
+                INSERT INTO verification_state(
+                    session_id, root, last_event_id, last_edit_at, changed_paths_json
+                ) VALUES (?, ?, ?, NULL, '[]')
+                ON CONFLICT(session_id, root) DO UPDATE SET
+                    last_event_id = excluded.last_event_id,
+                    last_edit_at = NULL,
+                    changed_paths_json = '[]'
+                """,
+                (evidence.session_id, evidence.root, event_id),
+            )
+            _prune_old_events(conn, session_id=evidence.session_id, root=evidence.root)
+            conn.commit()
+        finally:
+            conn.close()
+
+    return {"id": event_id, **evidence.__dict__, "created_at": created_at}
+
+
+def mark_workspace_edited(
+    *,
+    session_id: str | None,
+    cwd: str | Path | None,
+    paths: list[str] | tuple[str, ...] | None = None,
+) -> Optional[dict[str, Any]]:
+    """Mark verification evidence stale after a successful file edit."""
+    try:
+        from agent.coding_context import project_facts_for
+        facts = project_facts_for(cwd)
+    except Exception:
+        facts = None
+    if not facts:
+        return None
+
+    sid = str(session_id or "default")
+    root = str(facts.get("root") or Path(cwd or ".").resolve())
+    changed_paths = sorted({str(p) for p in (paths or []) if p})
+    edited_at = _utc_now()
+
+    with _DB_LOCK:
+        conn = _connect()  # sqlite3's "with conn" never closes; see finally
+        try:
+            row = conn.execute(
+                """
+                SELECT changed_paths_json FROM verification_state
+                WHERE session_id = ? AND root = ?
+                """,
+                (sid, root),
+            ).fetchone()
+            raw_paths = row["changed_paths_json"] if row is not None else None
+            try:
+                existing: set[str] = set(json.loads(raw_paths or "[]"))
+            except (TypeError, ValueError):
+                existing = set()
+            merged = sorted((existing | set(changed_paths)))[-200:]
+            conn.execute(
+                """
+                INSERT INTO verification_state(
+                    session_id, root, last_event_id, last_edit_at, changed_paths_json
+                ) VALUES (?, ?, NULL, ?, ?)
+                ON CONFLICT(session_id, root) DO UPDATE SET
+                    last_edit_at = excluded.last_edit_at,
+                    changed_paths_json = excluded.changed_paths_json
+                """,
+                (sid, root, edited_at, json.dumps(merged)),
+            )
+            conn.commit()
+        finally:
+            conn.close()
+
+    return {"session_id": sid, "root": root, "last_edit_at": edited_at, "changed_paths": changed_paths}
+
+
+def verification_status(
+    *,
+    session_id: str | None,
+    cwd: str | Path | None,
+) -> dict[str, Any]:
+    """Return the best known verification state for a session/workspace."""
+
+    try:
+        from agent.coding_context import project_facts_for
+
+        facts = project_facts_for(cwd)
+    except Exception:
+        facts = None
+    if not facts:
+        return {"status": "not_applicable", "evidence": None}
+
+    sid = str(session_id or "default")
+    root = str(facts.get("root") or Path(cwd or ".").resolve())
+    with _DB_LOCK:
+        # sqlite3's "with conn" only commits/rolls back, so close explicitly.
+        conn = _connect()
+        try:
+            state = conn.execute(
+                """
+                SELECT last_event_id, last_edit_at, changed_paths_json
+                FROM verification_state
+                WHERE session_id = ? AND root = ?
+                """,
+                (sid, root),
+            ).fetchone()
+            if state is None:
+                return {
+                    "status": "unverified",
+                    "evidence": None,
+                    "root": root,
+                    "session_id": sid,
+                    "changed_paths": [],
+                }
+            event = None
+            if state["last_event_id"] is not None:
+                event = conn.execute(
+                    "SELECT * FROM verification_events WHERE id = ?",
+                    (state["last_event_id"],),
+                ).fetchone()
+        finally:
+            conn.close()
+
+    try:
+        changed_paths: list[str] = json.loads(state["changed_paths_json"] or "[]")
+    except (TypeError, ValueError):
+        changed_paths = []
+
+    if event is None:
+        return {
+            "status": "unverified",
+            "evidence": None,
+            "root": root,
+            "session_id": sid,
+            "changed_paths": changed_paths,
+        }
+
+    evidence = dict(event)
+    stale = bool(state["last_edit_at"]) and state["last_edit_at"] > evidence["created_at"]
+    return {
+        "status": "stale" if stale else evidence["status"],
+        "evidence": evidence,
+        "root": root,
+        "session_id": sid,
+        "changed_paths": changed_paths,
+    }
--- a/agent/verification_stop.py
+++ b/agent/verification_stop.py
@@ -0,0 +1,314 @@
+"""Turn-end verification guard for coding edits.
+
+This module is intentionally policy-only. It never runs checks itself; it turns
+the passive verification ledger into a bounded follow-up when the model tries to
+finish immediately after editing code without fresh evidence.
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Iterable
+
+
+_MAX_CHANGED_PATHS_IN_NUDGE = 8
+
+# Non-code file extensions whose edits carry no verifiable runtime behavior:
+# documentation, prose, and data/markup that no test/build exercises. When a
+# turn touches ONLY these, verify-on-stop has nothing to check, so the nudge is
+# suppressed (this is fix "C" for the doc/markdown/skill false-positive — a
+# SKILL.md or README edit must never demand a /tmp verification script). A turn
+# that edits any non-listed path (a real source/code/config file) still nudges.
+_NON_CODE_VERIFY_EXTENSIONS = frozenset(
+    {
+        ".md",
+        ".markdown",
+        ".mdx",
+        ".rst",
+        ".txt",
+        ".text",
+        ".adoc",
+        ".asciidoc",
+        ".org",
+        ".log",
+        ".csv",
+        ".tsv",
+    }
+)
+
+# Extension-less filenames (matched case-insensitively) that are pure prose
+# even though they carry no recognized doc extension.
+_NON_CODE_VERIFY_FILENAMES = frozenset(
+    {
+        "license",
+        "licence",
+        "notice",
+        "authors",
+        "contributors",
+        "changelog",
+        "codeowners",
+    }
+)
+
+
+def _is_non_code_path(raw: str) -> bool:
+    """Return True when a changed path is documentation/prose with nothing to verify."""
+    try:
+        p = Path(str(raw))
+    except Exception:
+        return False
+    suffix = p.suffix.lower()
+    if suffix in _NON_CODE_VERIFY_EXTENSIONS:
+        return True
+    if not suffix and p.name.lower() in _NON_CODE_VERIFY_FILENAMES:
+        return True
+    return False
+
+
+def _filter_verifiable_paths(paths: Iterable[str]) -> list[str]:
+    """Drop documentation/prose paths; keep paths that could have verifiable behavior."""
+    return [p for p in paths if p and not _is_non_code_path(p)]
+
+
+# Session identities (platform or source) that are NOT human conversational
+# messaging surfaces: interactive coding surfaces (CLI, TUI, desktop, codex,
+# local, gateway) and programmatic callers (API server, webhooks, tools).
+# Verify-on-stop stays ON by default for these. Any other resolved gateway
+# platform is a conversational messaging surface (Telegram, Discord, WhatsApp,
+# Signal, Slack, etc.) where the verification narrative would reach a human as
+# chat noise, so it defaults OFF. Mirrors LOCAL_SESSION_SOURCE_IDS in
+# apps/desktop/src/lib/session-source.ts; keep roughly in sync when adding a
+# local or programmatic surface. Default-deny by design: an unrecognized
+# identity is treated as messaging (OFF) so a new chat platform never leaks the
+# verification receipt before this set is updated.
+_NON_MESSAGING_SESSION_SURFACES = frozenset(
+    {
+        "",
+        "cli",
+        "codex",
+        "desktop",
+        "gateway",
+        "local",
+        "tui",
+        "tool",
+        "api_server",
+        "webhook",
+        "msgraph_webhook",
+    }
+)
+
+
+def _session_is_messaging_surface() -> bool:
+    """Return whether this turn is delivered over a human messaging channel.
+
+    The gateway binds the platform value (e.g. ``telegram``) to
+    ``HERMES_SESSION_PLATFORM``; the CLI and TUI set ``HERMES_SESSION_SOURCE``
+    (e.g. ``cli``, ``tui``) instead. Both are consulted via the session-context
+    helper (with an ``os.environ`` fallback), alongside the ``HERMES_PLATFORM``
+    override, matching the sibling platform resolution in
+    ``agent/skill_commands.py`` and ``agent/prompt_builder.py``. A turn is a
+    messaging surface when a resolved identity is present and is not a known
+    non-messaging surface.
+    """
+    try:
+        from gateway.session_context import get_session_env
+
+        platform = (
+            os.getenv("HERMES_PLATFORM")
+            or get_session_env("HERMES_SESSION_PLATFORM", "")
+        )
+        source = get_session_env("HERMES_SESSION_SOURCE", "")
+    except Exception:
+        platform = os.getenv("HERMES_PLATFORM", "") or os.environ.get(
+            "HERMES_SESSION_PLATFORM", ""
+        )
+        source = os.environ.get("HERMES_SESSION_SOURCE", "")
+    for identity in (platform, source):
+        identity = str(identity or "").strip().lower()
+        if identity and identity not in _NON_MESSAGING_SESSION_SURFACES:
+            return True
+    return False
+
+
+def verify_on_stop_enabled(config: dict[str, Any] | None = None) -> bool:
+    """Return whether edit -> verify-before-finish behavior is enabled.
+
+    Precedence: an explicit ``HERMES_VERIFY_ON_STOP`` env var wins, then an
+    explicit ``agent.verify_on_stop`` config value. The config default is
+    ``False`` (see ``DEFAULT_CONFIG``) — verify-on-stop is OFF unless the user
+    opts in. The legacy ``"auto"`` sentinel is still honored for anyone who
+    sets it explicitly: it resolves to ON for interactive coding surfaces
+    (CLI, TUI, desktop) and programmatic callers, and OFF for conversational
+    messaging surfaces (Telegram, Discord, etc.). A missing/unknown value
+    falls back to OFF.
+    """
+    env = os.environ.get("HERMES_VERIFY_ON_STOP")
+    if env is not None:
+        return env.strip().lower() not in {"0", "false", "no", "off"}
+    if config is None:
+        try:
+            from hermes_cli.config import load_config
+
+            config = load_config()
+        except Exception:
+            config = {}
+    agent_cfg = (config or {}).get("agent") if isinstance(config, dict) else None
+    cfg_val = agent_cfg.get("verify_on_stop") if isinstance(agent_cfg, dict) else None
+    if isinstance(cfg_val, bool):
+        return cfg_val
+    if isinstance(cfg_val, str):
+        token = cfg_val.strip().lower()
+        if token in {"1", "true", "yes", "on"}:
+            return True
+        if token in {"0", "false", "no", "off"}:
+            return False
+        if token == "auto":
+            # Explicit opt-in to the legacy surface-aware behavior.
+            return not _session_is_messaging_surface()
+    # Missing or unknown value -> OFF (the new default).
+    return False
+
+
+def _candidate_cwds(paths: Iterable[str]) -> list[Path]:
+    candidates: list[Path] = []
+    seen: set[str] = set()
+    for raw in paths:
+        if not raw:
+            continue
+        try:
+            path = Path(raw).expanduser()
+            candidate = path if path.is_dir() else path.parent
+            resolved = str(candidate.resolve())
+        except Exception:
+            continue
+        if resolved not in seen:
+            seen.add(resolved)
+            candidates.append(Path(resolved))
+    return candidates
+
+
+def _verification_snapshot(
+    *,
+    session_id: str | None,
+    changed_paths: list[str],
+) -> tuple[dict[str, Any], dict[str, Any]] | None:
+    """Return ``(status, facts)`` for the first edited workspace needing proof."""
+    try:
+        from agent.coding_context import project_facts_for
+        from agent.verification_evidence import verification_status
+    except Exception:
+        return None
+
+    first_snapshot: tuple[dict[str, Any], dict[str, Any]] | None = None
+    for cwd in _candidate_cwds(changed_paths):
+        facts = project_facts_for(cwd)
+        if not facts:
+            continue
+        status = verification_status(session_id=session_id, cwd=cwd)
+        snapshot = (status, facts)
+        if first_snapshot is None:
+            first_snapshot = snapshot
+        if str(status.get("status") or "unverified") != "passed":
+            return snapshot
+    return first_snapshot
+
+
+def _format_changed_paths(paths: list[str]) -> str:
+    shown = paths[:_MAX_CHANGED_PATHS_IN_NUDGE]
+    lines = [f"- `{path}`" for path in shown]
+    remaining = len(paths) - len(shown)
+    if remaining > 0:
+        lines.append(f"- ... and {remaining} more")
+    return "\n".join(lines)
+
+
+def _status_detail(status: dict[str, Any]) -> str:
+    state = str(status.get("status") or "unverified")
+    evidence = status.get("evidence") if isinstance(status.get("evidence"), dict) else None
+    if not evidence:
+        return state
+
+    command = evidence.get("canonical_command") or evidence.get("command")
+    summary = str(evidence.get("output_summary") or "").strip()
+    parts = [state]
+    if command:
+        parts.append(f"last command `{command}`")
+    if summary:
+        max_summary = 1200
+        if len(summary) > max_summary:
+            summary = summary[:max_summary].rstrip() + "\n... [truncated]"
+        parts.append(f"last output:\n{summary}")
+    return "\n".join(parts)
+
+
+def build_verify_on_stop_nudge(
+    *,
+    session_id: str | None,
+    changed_paths: Iterable[str],
+    attempts: int = 0,
+    max_attempts: int = 2,
+) -> str | None:
+    """Return a synthetic follow-up when edited code lacks fresh verification."""
+    # Drop documentation/prose paths (markdown, skills, README, LICENSE, ...) —
+    # they carry no verifiable behavior, so a turn that touched only those has
+    # nothing to verify and must not nudge.
+    paths = sorted({str(p) for p in _filter_verifiable_paths(changed_paths)})
+    if not paths or attempts >= max_attempts:
+        return None
+
+    snapshot = _verification_snapshot(session_id=session_id, changed_paths=paths)
+    if snapshot is None:
+        return None
+    status, facts = snapshot
+
+    verify_commands = [
+        str(cmd).strip()
+        for cmd in (facts.get("verifyCommands") or [])
+        if str(cmd).strip()
+    ]
+
+    state = str(status.get("status") or "unverified")
+    if state == "passed":
+        return None
+
+    # Optional shipped coding guidance, only paid when this evidence gate fires.
+    try:
+        from agent.verify_hooks import coding_verify_guidance
+
+        guidance = coding_verify_guidance()
+    except Exception:
+        guidance = None
+    addendum = f"\n\n{guidance}" if guidance else ""
+
+    if verify_commands:
+        command_instruction = (
+            "Run the relevant verification command now ("
+            + ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
+            + (", ..." if len(verify_commands) > 3 else "")
+            + "), read any failure, repair the code, and summarize what passed."
+        )
+    else:
+        temp_dir = tempfile.gettempdir()
+        command_instruction = (
+            "No canonical test/lint/build command was detected. Create a focused "
+            f"temporary verification script under `{temp_dir}` using an OS-safe "
+            "`tempfile` path with a `hermes-verify-` filename prefix, run it "
+            "against the changed behavior, clean it up when possible, and "
+            "summarize it explicitly as ad-hoc verification rather than suite "
+            "green."
+        )
+
+    return (
+        "[System: You edited code in this turn, but the workspace does not have "
+        "fresh passing verification evidence yet.\n\n"
+        f"Verification status: {_status_detail(status)}\n\n"
+        f"Changed paths:\n{_format_changed_paths(paths)}\n\n"
+        f"{command_instruction} If verification is not possible, explain the "
+        "concrete blocker instead of claiming the work is fully verified."
+        f"{addendum}]"
+    )
+
+
+__all__ = ["build_verify_on_stop_nudge", "verify_on_stop_enabled"]
--- a/agent/verify_hooks.py
+++ b/agent/verify_hooks.py
@@ -0,0 +1,69 @@
+"""Verification-loop helpers for the ``pre_verify`` round-end gate.
+
+When the agent has edited code and is about to verify/finish, the loop fires the
+``pre_verify`` hook (user directives resolved by
+:func:`hermes_cli.plugins.get_pre_verify_continue_message`). A directive keeps
+the agent going one more turn — run a check, defer it, tidy the diff — instead of
+stopping immediately.
+
+The shipped coding guidance lives on the evidence-based verification-stop nudge
+(``agent/verification_stop.py``), not as a second default stop gate. That keeps
+the default token cost tied to the existing "missing verification evidence"
+decision while preserving ``pre_verify`` for user/plugin policy.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from utils import is_truthy_value
+
+DEFAULT_MAX_VERIFY_NUDGES = 3
+
+# Shipped guidance appended to the verification-stop nudge when code lacks fresh
+# verification evidence. Wording mirrors the user-facing "clean your work"
+# workflow, but does not create its own extra model turn.
+CODING_VERIFY_GUIDANCE = (
+    "[Coding] Before you run tests/linters or call this done: if this is "
+    "creative UI/visual work, hold off on tests and linters until the user says "
+    "they like the result or you're about to commit. And before every commit, "
+    "clean your work: keep it KISS/DRY, match the surrounding code style, and be "
+    "elitist, shorthand, clever, concise, efficient, and elegant."
+)
+
+
+def max_verify_nudges(config: Optional[dict[str, Any]] = None) -> int:
+    """Bound on consecutive ``pre_verify`` continue directives per turn (>= 0); bad values use the default."""
+    agent_cfg = _agent_cfg(config)
+    raw = agent_cfg.get("max_verify_nudges")
+    try:
+        return max(0, int(raw))
+    except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf")), e.g. YAML `.inf`
+        return DEFAULT_MAX_VERIFY_NUDGES
+
+
+def coding_verify_guidance(config: Optional[dict[str, Any]] = None) -> Optional[str]:
+    """Return the optional guidance appended to verification-stop nudges."""
+    if not is_truthy_value(_agent_cfg(config).get("verify_guidance", True), default=True):
+        return None
+    return CODING_VERIFY_GUIDANCE
+
+
+def _agent_cfg(config: Optional[dict[str, Any]]) -> dict[str, Any]:
+    if config is None:
+        try:
+            from hermes_cli.config import load_config
+
+            config = load_config()
+        except Exception:
+            config = {}
+    agent_cfg = (config or {}).get("agent") if isinstance(config, dict) else None
+    return agent_cfg if isinstance(agent_cfg, dict) else {}
+
+
+__all__ = [
+    "CODING_VERIFY_GUIDANCE",
+    "DEFAULT_MAX_VERIFY_NUDGES",
+    "coding_verify_guidance",
+    "max_verify_nudges",
+]
--- a/apps/desktop/README.md
+++ b/apps/desktop/README.md
@@ -85,7 +85,7 @@ Installers are built and uploaded to GitHub Releases manually. macOS/Windows sig

 ### How it works

-The packaged app ships the Electron shell and a native React chat surface. On first launch it can install the Hermes Agent runtime into `HERMES_HOME` (`~/.hermes`, or `%LOCALAPPDATA%\hermes` on Windows) — the **same layout a CLI install uses**, so the two are interchangeable. Backend resolution first honours `HERMES_DESKTOP_HERMES_ROOT`, then a completed managed install, then a probed `hermes` on `PATH` (unless `HERMES_DESKTOP_IGNORE_EXISTING=1` is set), and finally an explicit `HERMES_DESKTOP_HERMES` command override for packagers/troubleshooting. The renderer (React, in `src/`) talks to a `hermes dashboard` backend over the `tui_gateway`/dashboard APIs and reuses the agent runtime rather than embedding `hermes --tui`. The install, backend-resolution, and self-update logic all live in `electron/main.cjs`.
+The packaged app ships the Electron shell and a native React chat surface. On first launch it can install the Hermes Agent runtime into `HERMES_HOME` (`~/.hermes`, or `%LOCALAPPDATA%\hermes` on Windows) — the **same layout a CLI install uses**, so the two are interchangeable. Backend resolution first honours `HERMES_DESKTOP_HERMES_ROOT`, then a completed managed install, then a probed `hermes` on `PATH` (unless `HERMES_DESKTOP_IGNORE_EXISTING=1` is set), and finally an explicit `HERMES_DESKTOP_HERMES` command override for packagers/troubleshooting. The renderer (React, in `src/`) talks to a headless backend the app launches for you — a `hermes serve` process that serves the `tui_gateway` JSON-RPC/WebSocket API — through the framework-agnostic client in [`apps/shared`](../shared/) (the same client the web dashboard consumes), and reuses the agent runtime rather than embedding `hermes --tui`. The app is **self-contained**: it runs its own `hermes serve` backend and never opens or requires the web dashboard UI. (For backward compatibility, a runtime that predates the `serve` command automatically falls back to a headless `dashboard --no-open` — see `electron/backend-command.cjs` — so mid-upgrade installs never break.) The install, backend-resolution, and self-update logic all live in `electron/main.cjs`.

 ### Verification

--- a/apps/desktop/components.json
+++ b/apps/desktop/components.json
@@ -17,5 +17,5 @@
    "lib": "@/lib",
    "hooks": "@/hooks"
  },
-  "iconLibrary": "lucide"
+  "iconLibrary": "tabler"
 }
--- a/apps/desktop/electron/backend-command.cjs
+++ b/apps/desktop/electron/backend-command.cjs
@@ -0,0 +1,51 @@
+'use strict'
+
+// Backend subcommand routing for the desktop-managed Hermes process.
+//
+// The desktop app launches its own headless backend via `hermes serve` — it
+// must NEVER depend on or launch the browser `dashboard`. But `serve` is a
+// newer subcommand: a runtime that predates it (an older managed install the
+// app hasn't updated yet, or an older `hermes` resolved from PATH) only knows
+// `dashboard --no-open`. To avoid bricking those users mid-upgrade we detect
+// whether the resolved runtime understands `serve` and, only when it does not,
+// fall back to the legacy `dashboard --no-open` invocation. Both produce the
+// exact same headless gateway; `serve` is just the decoupled name.
+//
+// These helpers are pure so they can be unit-tested without Electron.
+
+/**
+ * Build the canonical headless backend argv (always `serve`).
+ * @param {string} [profile] optional Hermes profile to pin via `--profile`.
+ */
+function serveBackendArgs(profile) {
+  const head = profile ? ['--profile', profile] : []
+  return [...head, 'serve', '--host', '127.0.0.1', '--port', '0']
+}
+
+/**
+ * Rewrite a resolved backend argv from `serve` to the legacy
+ * `dashboard --no-open` form, preserving every other argument (incl. a leading
+ * `-m hermes_cli.main` and any `--profile <name>`, even a profile named `serve`).
+ * Returns a copy; if there is no `serve` subcommand the argv is returned unchanged.
+ */
+function dashboardFallbackArgs(args) {
+  const i = args.findIndex((arg, at) => arg === 'serve' && args[at - 1] !== '--profile')
+  if (i === -1) return args.slice()
+  return [...args.slice(0, i), 'dashboard', '--no-open', ...args.slice(i + 1)]
+}
+
+/**
+ * True when a runtime's `hermes_cli/subcommands/dashboard.py` source registers
+ * the `serve` subcommand. Matches `add_parser("serve"` / `add_parser('serve'`
+ * specifically so the substring "server" (e.g. "start_server", "web server")
+ * never produces a false positive.
+ */
+function sourceDeclaresServe(dashboardPySource) {
+  return /add_parser\(\s*["']serve["']/.test(String(dashboardPySource || ''))
+}
+
+module.exports = {
+  serveBackendArgs,
+  dashboardFallbackArgs,
+  sourceDeclaresServe,
+}
--- a/apps/desktop/electron/backend-command.test.cjs
+++ b/apps/desktop/electron/backend-command.test.cjs
@@ -0,0 +1,83 @@
+'use strict'
+
+const test = require('node:test')
+const assert = require('node:assert/strict')
+
+const {
+  serveBackendArgs,
+  dashboardFallbackArgs,
+  sourceDeclaresServe,
+} = require('./backend-command.cjs')
+
+test('serveBackendArgs builds a headless serve invocation', () => {
+  assert.deepEqual(serveBackendArgs(), [
+    'serve',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('serveBackendArgs pins a profile when provided', () => {
+  assert.deepEqual(serveBackendArgs('worker'), [
+    '--profile',
+    'worker',
+    'serve',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('dashboardFallbackArgs rewrites serve -> dashboard --no-open, keeping the -m prefix', () => {
+  const serve = ['-m', 'hermes_cli.main', 'serve', '--host', '127.0.0.1', '--port', '0']
+  assert.deepEqual(dashboardFallbackArgs(serve), [
+    '-m',
+    'hermes_cli.main',
+    'dashboard',
+    '--no-open',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('dashboardFallbackArgs preserves a --profile flag ahead of serve', () => {
+  const serve = ['-m', 'hermes_cli.main', '--profile', 'worker', 'serve', '--host', '127.0.0.1', '--port', '0']
+  assert.deepEqual(dashboardFallbackArgs(serve), [
+    '-m',
+    'hermes_cli.main',
+    '--profile',
+    'worker',
+    'dashboard',
+    '--no-open',
+    '--host',
+    '127.0.0.1',
+    '--port',
+    '0',
+  ])
+})
+
+test('dashboardFallbackArgs is a no-op (copy) when there is no serve token', () => {
+  const args = ['-m', 'hermes_cli.main', 'dashboard', '--no-open']
+  const out = dashboardFallbackArgs(args)
+  assert.deepEqual(out, args)
+  assert.notEqual(out, args, 'should return a copy, not the same reference')
+})
+
+test('sourceDeclaresServe detects the serve subparser registration', () => {
+  assert.equal(sourceDeclaresServe('subparsers.add_parser("serve", help="...")'), true)
+  assert.equal(sourceDeclaresServe("subparsers.add_parser('serve')"), true)
+  assert.equal(sourceDeclaresServe('subparsers.add_parser(\n        "serve",\n)'), true)
+})
+
+test('sourceDeclaresServe does not false-positive on the substring "server"', () => {
+  const oldSource = `
+    dashboard_parser = subparsers.add_parser("dashboard", help="Start the web UI dashboard")
+    from hermes_cli.web_server import start_server  # web server
+  `
+  assert.equal(sourceDeclaresServe(oldSource), false)
+})
--- a/apps/desktop/electron/backend-env.cjs
+++ b/apps/desktop/electron/backend-env.cjs
@@ -61,10 +61,7 @@ function buildDesktopBackendPath({
  const venvBin = venvRoot ? pathModule.join(venvRoot, platform === 'win32' ? 'Scripts' : 'bin') : null
  const saneEntries = platform === 'win32' ? [] : POSIX_SANE_PATH_ENTRIES

-  return appendUniquePathEntries(
-    [hermesNodeBin, venvBin, currentPath, saneEntries],
-    { delimiter }
-  )
+  return appendUniquePathEntries([hermesNodeBin, venvBin, currentPath, saneEntries], { delimiter })
 }

 function normalizeHermesHomeRoot(hermesHome, { pathModule = pathModuleForPlatform(process.platform) } = {}) {
--- a/apps/desktop/electron/backend-env.test.cjs
+++ b/apps/desktop/electron/backend-env.test.cjs
@@ -76,10 +76,7 @@ test('normalizeHermesHomeRoot maps profile homes back to the global Hermes root'
    normalizeHermesHomeRoot('C:\\Users\\test\\AppData\\Local\\hermes\\profiles\\oracle', { pathModule: path.win32 }),
    'C:\\Users\\test\\AppData\\Local\\hermes'
  )
-  assert.equal(
-    normalizeHermesHomeRoot('/Users/test/.hermes', { pathModule: path.posix }),
-    '/Users/test/.hermes'
-  )
+  assert.equal(normalizeHermesHomeRoot('/Users/test/.hermes', { pathModule: path.posix }), '/Users/test/.hermes')
 })

 test('Windows PATH casing and delimiter are preserved without POSIX sane entries', () => {
@@ -104,8 +101,5 @@ test('Windows PATH casing and delimiter are preserved without POSIX sane entries
 })

 test('appendUniquePathEntries drops empty entries and keeps first occurrence', () => {
-  assert.equal(
-    appendUniquePathEntries([':/a::/b', ['/a', '/c']], { delimiter: ':' }),
-    '/a:/b:/c'
-  )
+  assert.equal(appendUniquePathEntries([':/a::/b', ['/a', '/c']], { delimiter: ':' }), '/a:/b:/c')
 })
--- a/apps/desktop/electron/backend-probes.cjs
+++ b/apps/desktop/electron/backend-probes.cjs
@@ -37,7 +37,18 @@ const { execFileSync } = require('node:child_process')
 const PROBE_TIMEOUT_MS = 5000

 /**
- * Return true iff `python -c "import hermes_cli"` exits 0.
+ * Return the Python snippet used to verify Hermes can import far enough to
+ * launch the CLI. Kept exported for tests so dependency regressions are
+ * caught without needing a real broken venv fixture.
+ *
+ * @returns {string}
+ */
+function hermesRuntimeImportProbe() {
+  return 'import yaml; import hermes_cli.config'
+}
+
+/**
+ * Return true iff the Hermes runtime import probe exits 0.
 *
 * Used to gate the "fallback to system Python with hermes_cli installed"
 * rung of resolveHermesBackend. Without this, a system Python 3.11-3.13
@@ -46,13 +57,20 @@ const PROBE_TIMEOUT_MS = 5000
 * site-packages -- and the resolver returns a backend that immediately
 * dies on spawn.
 *
+ * The probe intentionally imports hermes_cli.config, not just the top-level
+ * package: a broken/empty Windows launcher venv can still see the source tree
+ * through PYTHONPATH but lack PyYAML, then die on the first real CLI import.
+ *
 * @param {string} pythonPath - Absolute path to a python.exe / python.
+ * @param {object} [opts]
+ * @param {object} [opts.env] - Additional environment for the probe.
 * @returns {boolean}
 */
-function canImportHermesCli(pythonPath) {
+function canImportHermesCli(pythonPath, opts = {}) {
  if (!pythonPath) return false
  try {
-    execFileSync(pythonPath, ['-c', 'import hermes_cli'], {
+    execFileSync(pythonPath, ['-c', hermesRuntimeImportProbe()], {
+      env: { ...process.env, ...(opts.env || {}) },
      stdio: 'ignore',
      timeout: PROBE_TIMEOUT_MS,
      windowsHide: true
@@ -101,6 +119,7 @@ function verifyHermesCli(hermesCommand, opts = {}) {

 module.exports = {
  canImportHermesCli,
+  hermesRuntimeImportProbe,
  verifyHermesCli,
  PROBE_TIMEOUT_MS
 }
--- a/apps/desktop/electron/backend-probes.test.cjs
+++ b/apps/desktop/electron/backend-probes.test.cjs
@@ -11,7 +11,7 @@ const fs = require('node:fs')
 const os = require('node:os')
 const path = require('node:path')

-const { canImportHermesCli, verifyHermesCli } = require('./backend-probes.cjs')
+const { canImportHermesCli, hermesRuntimeImportProbe, verifyHermesCli } = require('./backend-probes.cjs')

 // Resolve the host's own Node binary -- guaranteed to be on disk and
 // runnable. We use it as both a stand-in for "a python that doesn't
@@ -40,6 +40,12 @@ test('canImportHermesCli returns false when binary does not exist', () => {
  assert.equal(canImportHermesCli(ghost), false)
 })

+test('hermes runtime import probe checks config dependencies', () => {
+  const probe = hermesRuntimeImportProbe()
+  assert.match(probe, /\bimport yaml\b/)
+  assert.match(probe, /\bimport hermes_cli\.config\b/)
+})
+
 test('verifyHermesCli returns false when command is falsy', () => {
  assert.equal(verifyHermesCli(''), false)
  assert.equal(verifyHermesCli(null), false)
--- a/apps/desktop/electron/backend-ready.cjs
+++ b/apps/desktop/electron/backend-ready.cjs
@@ -1,3 +1,5 @@
+const fs = require('node:fs')
+
 const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m

 // The announcement clock starts the instant the backend process is spawned —
@@ -94,9 +96,76 @@ function waitForDashboardPort(child, timeoutMs = resolvePortAnnounceTimeoutMs())
  })
 }

+function readDashboardReadyFile(readyFile) {
+  if (!readyFile) return null
+  try {
+    // Ready file is JSON carrying a `port` field; accept only a valid TCP port (1-65535).
+    const port = Number(JSON.parse(fs.readFileSync(readyFile, 'utf8'))?.port)
+    return Number.isInteger(port) && port > 0 && port <= 65535 ? port : null
+  } catch {
+    return null // missing, unreadable, or malformed file: treated as "no port yet"
+  }
+}
+
+function waitForDashboardReadyFile(readyFile, child, timeoutMs = resolvePortAnnounceTimeoutMs()) {
+  return new Promise((resolve, reject) => {
+    let done = false
+    let interval = null
+
+    function cleanup() {
+      if (done) return
+      done = true
+      clearTimeout(timer)
+      if (interval) clearInterval(interval)
+      child.off('exit', onExit)
+      child.off('error', onError)
+    }
+
+    function check() {
+      const port = readDashboardReadyFile(readyFile)
+      if (port) {
+        cleanup()
+        resolve(port)
+      }
+    }
+
+    function onExit(code, signal) {
+      cleanup()
+      reject(new Error(`Hermes backend: exited before port announcement (${signal || code})`))
+    }
+
+    function onError(err) {
+      cleanup()
+      reject(err)
+    }
+
+    const timer = setTimeout(() => {
+      cleanup()
+      reject(new Error(`Timed out waiting for Hermes backend port announcement (${timeoutMs}ms)`))
+    }, timeoutMs)
+
+    child.on('exit', onExit)
+    child.on('error', onError)
+    interval = setInterval(check, 50)
+    if (typeof interval.unref === 'function') interval.unref()
+    check()
+  })
+}
+
+function waitForDashboardPortAnnouncement(child, options = {}) {
+  const timeoutMs = options.timeoutMs ?? resolvePortAnnounceTimeoutMs()
+  if (options.readyFile) {
+    return waitForDashboardReadyFile(options.readyFile, child, timeoutMs)
+  }
+  return waitForDashboardPort(child, timeoutMs)
+}
+
 module.exports = {
  waitForDashboardPort,
+  waitForDashboardPortAnnouncement,
+  waitForDashboardReadyFile,
+  readDashboardReadyFile,
  resolvePortAnnounceTimeoutMs,
  DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
-  MIN_PORT_ANNOUNCE_TIMEOUT_MS,
+  MIN_PORT_ANNOUNCE_TIMEOUT_MS
 }
--- a/apps/desktop/electron/backend-ready.test.cjs
+++ b/apps/desktop/electron/backend-ready.test.cjs
@@ -14,12 +14,18 @@
 const test = require('node:test')
 const assert = require('node:assert/strict')
 const { EventEmitter } = require('node:events')
+const fs = require('node:fs')
+const os = require('node:os')
+const path = require('node:path')

 const {
+  readDashboardReadyFile,
  waitForDashboardPort,
+  waitForDashboardPortAnnouncement,
+  waitForDashboardReadyFile,
  resolvePortAnnounceTimeoutMs,
  DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
-  MIN_PORT_ANNOUNCE_TIMEOUT_MS,
+  MIN_PORT_ANNOUNCE_TIMEOUT_MS
 } = require('./backend-ready.cjs')

 // A minimal stand-in for a spawned child process: an EventEmitter with a
@@ -119,3 +125,75 @@ test('a late announcement after timeout does not throw (listeners torn down)', a
    child.stdout.emit('data', 'HERMES_DASHBOARD_READY port=9999\n')
  })
 })
+
+// ---------------------------------------------------------------------------
+// ready-file port announcement
+// ---------------------------------------------------------------------------
+
+function mkTmpReadyFile() {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-ready-test-'))
+  return {
+    dir,
+    file: path.join(dir, 'ready.json'),
+    cleanup: () => fs.rmSync(dir, { recursive: true, force: true })
+  }
+}
+
+test('readDashboardReadyFile returns a valid port from JSON', () => {
+  const tmp = mkTmpReadyFile()
+  try {
+    fs.writeFileSync(tmp.file, JSON.stringify({ port: 4567 }))
+    assert.equal(readDashboardReadyFile(tmp.file), 4567)
+  } finally {
+    tmp.cleanup()
+  }
+})
+
+test('readDashboardReadyFile ignores missing, malformed, or invalid files', () => {
+  const tmp = mkTmpReadyFile()
+  try {
+    assert.equal(readDashboardReadyFile(tmp.file), null)
+    fs.writeFileSync(tmp.file, '{')
+    assert.equal(readDashboardReadyFile(tmp.file), null)
+    fs.writeFileSync(tmp.file, JSON.stringify({ port: 0 }))
+    assert.equal(readDashboardReadyFile(tmp.file), null)
+  } finally {
+    tmp.cleanup()
+  }
+})
+
+test('waitForDashboardReadyFile resolves when the ready file appears', async () => {
+  const tmp = mkTmpReadyFile()
+  const child = makeFakeChild()
+  try {
+    const p = waitForDashboardReadyFile(tmp.file, child, 1000)
+    setTimeout(() => fs.writeFileSync(tmp.file, JSON.stringify({ port: 8765 })), 20)
+    assert.equal(await p, 8765)
+  } finally {
+    tmp.cleanup()
+  }
+})
+
+test('waitForDashboardPortAnnouncement uses ready file when provided', async () => {
+  const tmp = mkTmpReadyFile()
+  const child = makeFakeChild()
+  try {
+    const p = waitForDashboardPortAnnouncement(child, { readyFile: tmp.file, timeoutMs: 1000 })
+    setTimeout(() => fs.writeFileSync(tmp.file, JSON.stringify({ port: 9876 })), 20)
+    assert.equal(await p, 9876)
+  } finally {
+    tmp.cleanup()
+  }
+})
+
+test('waitForDashboardReadyFile rejects when the child exits before file readiness', async () => {
+  const tmp = mkTmpReadyFile()
+  const child = makeFakeChild()
+  try {
+    const p = waitForDashboardReadyFile(tmp.file, child, 1000)
+    child.emit('exit', 1, null)
+    await assert.rejects(p, /exited before port announcement/)
+  } finally {
+    tmp.cleanup()
+  }
+})
--- a/apps/desktop/electron/bootstrap-runner.cjs
+++ b/apps/desktop/electron/bootstrap-runner.cjs
@@ -179,7 +179,13 @@ function downloadInstallScript(commit, destPath) {
  })
 }

-async function resolveInstallScript({ installStamp, sourceRepoRoot, hermesHome, emit, _download = downloadInstallScript }) {
+async function resolveInstallScript({
+  installStamp,
+  sourceRepoRoot,
+  hermesHome,
+  emit,
+  _download = downloadInstallScript
+}) {
  // 1. Dev shortcut: prefer a local checkout's installer so we can iterate
  //    without pushing. SOURCE_REPO_ROOT comes from main.cjs (path.resolve
  //    of APP_ROOT/../..).
@@ -293,15 +299,19 @@ function spawnPowerShell(scriptPath, args, { emit, stageName, abortSignal, herme
    const ps = process.platform === 'win32' ? resolveWindowsPowerShell() : 'pwsh'
    const fullArgs = ['-NoProfile', '-ExecutionPolicy', 'Bypass', '-File', scriptPath, ...args]

-    const child = spawn(ps, fullArgs, hiddenWindowsChildOptions({
-      stdio: ['ignore', 'pipe', 'pipe'],
-      env: {
-        ...process.env,
-        // Pass HERMES_HOME through so install.ps1 respects the caller's
-        // choice rather than re-computing the default.
-        HERMES_HOME: hermesHome || process.env.HERMES_HOME || ''
-      }
-    }))
+    const child = spawn(
+      ps,
+      fullArgs,
+      hiddenWindowsChildOptions({
+        stdio: ['ignore', 'pipe', 'pipe'],
+        env: {
+          ...process.env,
+          // Pass HERMES_HOME through so install.ps1 respects the caller's
+          // choice rather than re-computing the default.
+          HERMES_HOME: hermesHome || process.env.HERMES_HOME || ''
+        }
+      })
+    )

    let stdout = ''
    let stderr = ''
--- a/apps/desktop/electron/connection-config.cjs
+++ b/apps/desktop/electron/connection-config.cjs
@@ -261,12 +261,7 @@ function cookiesHaveSession(cookies) {
 */
 function cookiesHaveLiveSession(cookies) {
  if (!Array.isArray(cookies)) return false
-  return cookies.some(
-    c =>
-      c &&
-      c.value &&
-      (AT_COOKIE_VARIANTS.includes(c.name) || RT_COOKIE_VARIANTS.includes(c.name))
-  )
+  return cookies.some(c => c && c.value && (AT_COOKIE_VARIANTS.includes(c.name) || RT_COOKIE_VARIANTS.includes(c.name)))
 }

 module.exports = {
--- a/apps/desktop/electron/desktop-uninstall.cjs
+++ b/apps/desktop/electron/desktop-uninstall.cjs
@@ -138,10 +138,7 @@ function buildPosixCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot,
  if (pythonPath) {
    lines.push(`export PYTHONPATH=${q(pythonPath)}\${PYTHONPATH:+:$PYTHONPATH}`)
  }
-  lines.push(
-    `cd ${q(agentRoot)} 2>/dev/null || true`,
-    `${q(pythonExe)} ${uninstallArgs.map(q).join(' ')} || true`
-  )
+  lines.push(`cd ${q(agentRoot)} 2>/dev/null || true`, `${q(pythonExe)} ${uninstallArgs.map(q).join(' ')} || true`)
  if (appPath) {
    lines.push(`rm -rf ${q(appPath)} || true`)
  }
@@ -169,7 +166,15 @@ function buildPosixCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot,
 * Removal: even after the desktop PID is gone, Windows releases directory
 * handles lazily, so a single `rmdir /s /q` can half-fail — retry up to 10x.
 */
-function buildWindowsCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot, uninstallArgs, appPath, hermesHome }) {
+function buildWindowsCleanupScript({
+  desktopPid,
+  pythonExe,
+  pythonPath,
+  agentRoot,
+  uninstallArgs,
+  appPath,
+  hermesHome
+}) {
  const pid = Number(desktopPid) || 0
  // cmd.exe has no string escaping inside quotes; strip embedded quotes (paths
  // under %LOCALAPPDATA% never contain them). `&`/`^` in a path would still be
--- a/apps/desktop/electron/desktop-uninstall.test.cjs
+++ b/apps/desktop/electron/desktop-uninstall.test.cjs
@@ -101,10 +101,7 @@ test('resolveRemovableAppPath uses APPIMAGE on Linux when set', () => {
 })

 test('resolveRemovableAppPath finds the unpacked dir on Linux', () => {
-  assert.equal(
-    resolveRemovableAppPath('/opt/hermes/linux-unpacked/hermes', 'linux', {}),
-    '/opt/hermes/linux-unpacked'
-  )
+  assert.equal(resolveRemovableAppPath('/opt/hermes/linux-unpacked/hermes', 'linux', {}), '/opt/hermes/linux-unpacked')
  // A system-package install (/usr/bin) → null, left to apt/dnf.
  assert.equal(resolveRemovableAppPath('/usr/bin/hermes', 'linux', {}), null)
 })
--- a/apps/desktop/electron/embed-referer.cjs
+++ b/apps/desktop/electron/embed-referer.cjs
@@ -0,0 +1,48 @@
+'use strict'
+
+const { session } = require('electron')
+
+const EMBED_SESSION_PARTITION = 'persist:hermes-embed'
+const EMBED_REFERER = 'https://www.youtube.com/'
+const YOUTUBE_REFERER_HOST_RE =
+  /(^|\.)(youtube\.com|youtube-nocookie\.com|googlevideo\.com|ytimg\.com|youtubei\.googleapis\.com)$/i
+
+function installEmbedRefererForSession(embedSession) {
+  // Adds a default Referer to YouTube-family requests made through `embedSession`.
+  if (!embedSession) {
+    return
+  }
+
+  embedSession.webRequest.onBeforeSendHeaders((details, callback) => {
+    let host = ''
+
+    try {
+      host = new URL(details.url).hostname
+    } catch {
+      host = '' // unparsable URL: handled as a non-YouTube host below
+    }
+
+    if (!YOUTUBE_REFERER_HOST_RE.test(host)) {
+      callback({ requestHeaders: details.requestHeaders })
+      return
+    }
+
+    // Header names are case-insensitive, so honour any existing non-empty Referer spelling.
+    const headers = { ...details.requestHeaders }
+    if (!Object.keys(headers).some(name => name.toLowerCase() === 'referer' && headers[name])) {
+      headers.Referer = EMBED_REFERER
+    }
+    callback({ requestHeaders: headers })
+  })
+}
+
+/** Best-effort: stamp Referer on YouTube requests in the embed webview partition only; never throws. */
+function installEmbedReferer() {
+  try {
+    installEmbedRefererForSession(session.fromPartition(EMBED_SESSION_PARTITION))
+  } catch {
+    // Non-fatal: embeds still render; YouTube may show referer errors.
+  }
+}
+
+module.exports = { installEmbedReferer }
--- a/apps/desktop/electron/fs-ipc.cjs
+++ b/apps/desktop/electron/fs-ipc.cjs
@@ -0,0 +1,105 @@
+'use strict'
+
+const { shell } = require('electron')
+const fs = require('fs')
+const path = require('path')
+
+const { readDirForIpc } = require('./fs-read-dir.cjs')
+const { gitRootForIpc } = require('./git-root.cjs')
+const { resolveRequestedPathForIpc } = require('./hardening.cjs')
+
+// Filesystem IPC: read-dir, git-root, reveal, rename, write-text, trash. Path
+// hardening is imported from ./hardening.cjs; `~` expansion + dir-existence checks
+// live in the main process and are injected so this module stays side-effect free.
+function registerFsIpc({ directoryExists, expandUserPath, ipcMain }) {
+  ipcMain.handle('hermes:fs:readDir', async (_event, dirPath) => readDirForIpc(dirPath))
+
+  ipcMain.handle('hermes:fs:gitRoot', async (_event, startPath) => gitRootForIpc(startPath))
+
+  // Reveal a path in the OS file manager (Finder / Explorer / Files).
+  ipcMain.handle('hermes:fs:reveal', async (_event, targetPath) => {
+    const target = String(targetPath || '').trim()
+
+    if (!target) {
+      return false
+    }
+
+    try {
+      shell.showItemInFolder(target)
+
+      return true
+    } catch {
+      return false
+    }
+  })
+
+  // Rename a file/folder in place. The renderer passes the existing path + a new
+  // base name; the destination is resolved in the SAME parent dir so a rename can
+  // never move the item elsewhere or traverse out. Rejects on a name collision.
+  ipcMain.handle('hermes:fs:rename', async (_event, targetPath, newName) => {
+    const src = String(targetPath || '').trim()
+    const name = String(newName || '').trim()
+    // The new name must be a single plain segment: no separators, and not `.` or `..`.
+    if (!src || !name || name === '.' || name === '..' || name.includes('/') || name.includes('\\')) {
+      throw new Error('Invalid rename')
+    }
+    // NOTE(review): src is used as given — it is not run through resolveRequestedPathForIpc.
+    const dst = path.join(path.dirname(src), name)
+
+    if (dst === src) {
+      return { path: dst }
+    }
+    // Pre-check only: dst could still be created between this test and the rename below.
+    if (fs.existsSync(dst)) {
+      throw new Error(`"${name}" already exists`)
+    }
+
+    await fs.promises.rename(src, dst)
+
+    return { path: dst }
+  })
+
+  // Write a small UTF-8 text file (e.g. a project's IDEA.md at creation). The path
+  // is hardened (resolveRequestedPathForIpc) and the parent must already exist —
+  // this never creates directory trees or escapes the allowed roots, and content
+  // is size-capped so it can't be abused as a bulk-write primitive.
+  ipcMain.handle('hermes:fs:writeText', async (_event, filePath, content) => {
+    const raw = String(filePath || '').trim()
+
+    if (!raw) {
+      throw new Error('Invalid path')
+    }
+
+    const text = String(content ?? '')
+    // The cap counts UTF-16 code units (String#length), not encoded UTF-8 bytes.
+    if (text.length > 1_000_000) {
+      throw new Error('Content too large')
+    }
+
+    const resolved = resolveRequestedPathForIpc(expandUserPath(raw), { purpose: 'Write text file' })
+
+    if (!directoryExists(path.dirname(resolved))) {
+      throw new Error('Parent directory does not exist')
+    }
+
+    await fs.promises.writeFile(resolved, text, 'utf8')
+
+    return { path: resolved }
+  })
+
+  // Move a file/folder to the OS trash (recoverable) — the VS Code "Delete"
+  // default. `shell.trashItem` routes to Finder/Explorer/Files trash per platform.
+  ipcMain.handle('hermes:fs:trash', async (_event, targetPath) => {
+    const target = String(targetPath || '').trim()
+
+    if (!target) {
+      throw new Error('Invalid delete')
+    }
+
+    await shell.trashItem(target)
+
+    return true
+  })
+}
+
+module.exports = { registerFsIpc }
--- a/apps/desktop/electron/fs-ipc.test.cjs
+++ b/apps/desktop/electron/fs-ipc.test.cjs
@@ -0,0 +1,49 @@
+'use strict'
+
+const assert = require('node:assert/strict')
+const test = require('node:test')
+
+const { registerFsIpc } = require('./fs-ipc.cjs')
+
+function fakeIpcMain() {
+  // Minimal ipcMain stand-in: stores handlers by channel and fails on re-registration.
+  const registry = new Map()
+
+  const handle = (channel, handler) => {
+    assert.ok(!registry.has(channel), `duplicate registration for ${channel}`)
+    registry.set(channel, handler)
+  }
+
+  return { handlers: registry, handle }
+}
+
+test('registerFsIpc wires only hermes:fs:* channels, each to a handler fn', () => {
+  const ipcMain = fakeIpcMain()
+  const { handlers } = ipcMain
+  registerFsIpc({ ipcMain, directoryExists: () => true, expandUserPath: p => p })
+  // Invariant rather than snapshot: a size floor plus a shape check on every entry.
+  assert.ok(handlers.size >= 6, `expected the full fs surface, got ${handlers.size}`)
+  handlers.forEach((handler, channel) => {
+    assert.match(channel, /^hermes:fs:/, `${channel} is not an fs channel`)
+    assert.equal(typeof handler, 'function', `${channel} should register a handler`)
+  })
+
+  // Spot-check a few channels by exact name.
+  for (const expected of ['hermes:fs:readDir', 'hermes:fs:rename', 'hermes:fs:trash']) {
+    assert.ok(handlers.has(expected), `missing ${expected}`)
+  }
+})
+
+test('rename rejects names that traverse out of the parent dir', async () => {
+  const ipcMain = fakeIpcMain()
+  registerFsIpc({ ipcMain, directoryExists: () => true, expandUserPath: p => p })
+  const rename = ipcMain.handlers.get('hermes:fs:rename')
+  // Every name below must be refused with the handler's validation error.
+  for (const badName of ['..', '.', 'a/b', 'a\\b']) {
+    await assert.rejects(
+      () => rename({}, '/tmp/x', badName),
+      /Invalid rename/,
+      `"${badName}" should be rejected`
+    )
+  }
+})
--- a/apps/desktop/electron/fs-read-dir.cjs
+++ b/apps/desktop/electron/fs-read-dir.cjs
@@ -92,9 +92,7 @@ async function readDirForIpc(dirPath, options = {}) {
  try {
    const dirents = await fsImpl.promises.readdir(resolved, { withFileTypes: true })
    const visibleDirents = dirents.filter(dirent => !FS_READDIR_HIDDEN.has(dirent.name))
-    const entries = await mapWithStatConcurrency(visibleDirents, dirent =>
-      entryForDirent(dirent, resolved, fsImpl)
-    )
+    const entries = await mapWithStatConcurrency(visibleDirents, dirent => entryForDirent(dirent, resolved, fsImpl))

    entries.sort((a, b) => Number(b.isDirectory) - Number(a.isDirectory) || a.name.localeCompare(b.name))

--- a/apps/desktop/electron/fs-read-dir.test.cjs
+++ b/apps/desktop/electron/fs-read-dir.test.cjs
@@ -349,7 +349,10 @@ test('readDirForIpc bounds concurrent stats while preserving complete sorted out
  assert.equal(result.error, undefined)
  assert.equal(result.entries.length, names.length)
  assert.equal(statCalls.length, names.length)
-  assert.equal(statCalls.some(fullPath => fullPath.endsWith(`${path.sep}node_modules`)), false)
+  assert.equal(
+    statCalls.some(fullPath => fullPath.endsWith(`${path.sep}node_modules`)),
+    false
+  )
  assert.ok(peak > 1, `expected concurrent stats, observed peak ${peak}`)
  assert.ok(peak <= 16, `expected at most 16 concurrent stats, observed peak ${peak}`)
  assert.deepEqual(
@@ -357,8 +360,5 @@ test('readDirForIpc bounds concurrent stats while preserving complete sorted out
    expectedNames
  )
  assert.equal(result.entries.find(entry => entry.name === failedName)?.isDirectory, false)
-  assert.equal(
-    result.entries.filter(entry => entry.isDirectory).length,
-    successfulDirectoryNames.size
-  )
+  assert.equal(result.entries.filter(entry => entry.isDirectory).length, successfulDirectoryNames.size)
 })
--- a/apps/desktop/electron/git-ipc.cjs
+++ b/apps/desktop/electron/git-ipc.cjs
@@ -0,0 +1,96 @@
+'use strict'
+
+const { scanGitRepos } = require('./git-repo-scan.cjs')
+const {
+  fileDiffVsHead,
+  repoStatus,
+  reviewCommit,
+  reviewCommitContext,
+  reviewCreatePr,
+  reviewDiff,
+  reviewList,
+  reviewPush,
+  reviewRevParse,
+  reviewRevert,
+  reviewShipInfo,
+  reviewStage,
+  reviewUnstage
+} = require('./git-review-ops.cjs')
+const { addWorktree, listBranches, listWorktrees, removeWorktree, switchBranch } = require('./git-worktree-ops.cjs')
+
+// Register the git/worktree/review IPC handlers. Thin delegators to the
+// git-*-ops modules; git/gh resolution (Windows PATH discovery) stays in the main
+// process and is injected — and called on every request — so this module stays pure.
+function registerGitIpc({ ipcMain, resolveGitBinary, resolveGhBinary }) {
+  // Git-driven worktree management ("Start work" flow). Errors surface to the
+  // renderer as rejected promises so it can toast a friendly message.
+  ipcMain.handle('hermes:git:worktreeList', async (_event, repoPath) => listWorktrees(repoPath, resolveGitBinary()))
+
+  ipcMain.handle('hermes:git:worktreeAdd', async (_event, repoPath, options) =>
+    addWorktree(repoPath, options || {}, resolveGitBinary())
+  )
+
+  ipcMain.handle('hermes:git:worktreeRemove', async (_event, repoPath, worktreePath, options) =>
+    removeWorktree(repoPath, worktreePath, options || {}, resolveGitBinary())
+  )
+
+  ipcMain.handle('hermes:git:branchSwitch', async (_event, repoPath, branch) =>
+    switchBranch(repoPath, branch, resolveGitBinary())
+  )
+
+  ipcMain.handle('hermes:git:branchList', async (_event, repoPath) => listBranches(repoPath, resolveGitBinary()))
+
+  // Compact repo status (branch, ahead/behind, change counts + files) for the
+  // composer coding rail. Returns null on a non-repo / remote backend so the rail
+  // hides cleanly rather than erroring.
+  ipcMain.handle('hermes:git:repoStatus', async (_event, repoPath) => repoStatus(repoPath, resolveGitBinary()))
+
+  // Codex-style review pane: list changed files for a scope, fetch one file's
+  // unified diff, and stage / unstage / revert. Reads return empty on failure;
+  // mutations reject so the renderer can toast.
+  ipcMain.handle('hermes:git:review:list', async (_event, repoPath, scope, baseRef) =>
+    reviewList(repoPath, scope, baseRef, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:diff', async (_event, repoPath, filePath, scope, baseRef, staged) =>
+    reviewDiff(repoPath, filePath, scope, baseRef, staged, resolveGitBinary())
+  )
+  // Working-tree-vs-HEAD diff for one file (the preview's "show the diff" view).
+  ipcMain.handle('hermes:git:fileDiff', async (_event, repoPath, filePath) =>
+    fileDiffVsHead(repoPath, filePath, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:stage', async (_event, repoPath, filePath) =>
+    reviewStage(repoPath, filePath ?? null, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:unstage', async (_event, repoPath, filePath) =>
+    reviewUnstage(repoPath, filePath ?? null, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:revert', async (_event, repoPath, filePath) =>
+    reviewRevert(repoPath, filePath ?? null, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:revParse', async (_event, repoPath, ref) =>
+    reviewRevParse(repoPath, ref, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:commit', async (_event, repoPath, message, push) =>
+    reviewCommit(repoPath, message, Boolean(push), resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:commitContext', async (_event, repoPath) =>
+    reviewCommitContext(repoPath, resolveGitBinary())
+  )
+  ipcMain.handle('hermes:git:review:push', async (_event, repoPath) => reviewPush(repoPath, resolveGitBinary()))
+  ipcMain.handle('hermes:git:review:shipInfo', async (_event, repoPath) => reviewShipInfo(repoPath, resolveGhBinary()))
+  ipcMain.handle('hermes:git:review:createPr', async (_event, repoPath) =>
+    reviewCreatePr(repoPath, resolveGitBinary(), resolveGhBinary())
+  )
+
+  // Repo-first project discovery: scan bounded roots for git repos (pure fs walk,
+  // no native addon). Never throws to the renderer — failures yield an empty list.
+  ipcMain.handle('hermes:git:scanRepos', async (_event, roots, options) => {
+    try {
+      return await scanGitRepos(roots || [], options || {})
+    } catch {
+      return []
+    }
+  })
+}
+
+module.exports = { registerGitIpc }
--- a/apps/desktop/electron/git-ipc.test.cjs
+++ b/apps/desktop/electron/git-ipc.test.cjs
@@ -0,0 +1,61 @@
+'use strict'
+
+const assert = require('node:assert/strict')
+const test = require('node:test')
+
+const { registerGitIpc } = require('./git-ipc.cjs')
+
+function fakeIpcMain() {
+  // Minimal ipcMain stand-in: stores handlers by channel and fails on re-registration.
+  const registry = new Map()
+
+  const handle = (channel, handler) => {
+    assert.ok(!registry.has(channel), `duplicate registration for ${channel}`)
+    registry.set(channel, handler)
+  }
+
+  return { handlers: registry, handle }
+}
+
+test('registerGitIpc wires only hermes:git:* channels, each to a handler fn', () => {
+  const ipcMain = fakeIpcMain()
+  const { handlers } = ipcMain
+  registerGitIpc({ ipcMain, resolveGitBinary: () => 'git', resolveGhBinary: () => 'gh' })
+
+  // Invariant rather than snapshot: a size floor plus a shape check on every entry.
+  assert.ok(handlers.size >= 19, `expected the full git surface, got ${handlers.size}`)
+  handlers.forEach((handler, channel) => {
+    assert.match(channel, /^hermes:git:/, `${channel} is not a git channel`)
+    assert.equal(typeof handler, 'function', `${channel} should register a handler`)
+  })
+
+  // Spot-check the load-bearing channels across the worktree / review / scan groups.
+  for (const expected of ['hermes:git:worktreeList', 'hermes:git:review:commit', 'hermes:git:scanRepos']) {
+    assert.ok(handlers.has(expected), `missing ${expected}`)
+  }
+})
+
+test('handlers thread the injected resolver into the ops layer', async () => {
+  const ipcMain = fakeIpcMain()
+  const calls = []
+
+  // Counting stand-in for the injected git resolver.
+  const resolveGitBinary = () => {
+    calls.push('git')
+
+    return 'git'
+  }
+
+  registerGitIpc({ ipcMain, resolveGitBinary, resolveGhBinary: () => 'gh' })
+  const worktreeList = ipcMain.handlers.get('hermes:git:worktreeList')
+
+  // The resolver is consulted synchronously to build the ops call; whatever the
+  // ops layer does with a non-repo path is irrelevant to the wiring.
+  try {
+    await worktreeList({}, '/definitely/not/a/repo')
+  } catch {
+    // ops layer may reject on a bad path — not what this test asserts.
+  }
+
+  assert.deepEqual(calls, ['git'])
+})
--- a/apps/desktop/electron/git-repo-scan.cjs
+++ b/apps/desktop/electron/git-repo-scan.cjs
@@ -0,0 +1,96 @@
+'use strict'
+
+// Repo-first discovery: walk bounded roots for git repos using only Node's `fs`
+// — no native addon, so it just works for anyone who pulls main (no
+// electron-rebuild). Mirrors how GitHub Desktop scans: stop at the first `.git`
+// (don't descend into a repo), cap depth, and skip heavy non-repo trees so the
+// first scan stays fast. Results are cached by the backend after the first run.
+
+const fs = require('node:fs')
+const os = require('node:os')
+const path = require('node:path')
+
+const fsp = fs.promises
+
+// Shallow on purpose: real projects live a few levels under home
+// (`~/www/repo`, `~/code/org/repo`); deeper `.git` dirs are almost always
+// fixtures/vendored/eval checkouts (e.g. `~/www/ha-evals/tasks/*/repo`). Repos
+// you actually use but keep deeper still surface via session-derived discovery,
+// so this only prunes noise, never repos with history.
+const DEFAULT_MAX_DEPTH = 3
+const MAX_CONCURRENCY = 32
+
+// Big trees that are never themselves repos and would waste the walk. Anything
+// hidden (dotdirs like .cache/.Trash/.npm) is skipped wholesale below, so this
+// only needs the non-hidden heavyweights.
+const JUNK_DIRS = new Set(['Applications', 'Library', 'node_modules', 'site-packages', 'vendor', 'venv'])
+
+async function mapLimit(items, limit, fn) {
+  // At most `limit` workers drain one shared cursor; resolves once all workers finish.
+  let next = 0
+
+  const drain = async () => {
+    while (next < items.length) {
+      await fn(items[next++])
+    }
+  }
+
+  const workers = Array.from({ length: Math.min(limit, items.length) }, () => drain())
+  await Promise.all(workers)
+}
+
+/**
+ * Scan `roots` (default: the home dir) for git repositories. Returns deduped
+ * `{ root, label }` entries. `options.maxDepth` caps recursion (default 3; an explicit 0 checks only the roots).
+ */
+async function scanGitRepos(roots, options = {}) {
+  const maxDepth = options.maxDepth === 0 ? 0 : Number(options.maxDepth) || DEFAULT_MAX_DEPTH
+  const searchRoots = Array.isArray(roots) && roots.length > 0 ? roots : [os.homedir()]
+  const found = new Map()
+
+  async function walk(dir, depth) {
+    if (depth > maxDepth) {
+      return
+    }
+
+    let entries
+    try {
+      entries = await fsp.readdir(dir, { withFileTypes: true })
+    } catch {
+      return // unreadable / permission denied
+    }
+
+    // A `.git` DIRECTORY marks a real repo root (a main checkout). A `.git`
+    // FILE is a linked worktree or submodule — those belong to their parent
+    // repo as lanes, not as separate projects, so we don't list them (and we
+    // keep descending in case a real repo sits deeper). This is what kills the
+    // worktree/eval-repo duplicate explosion.
+    if (entries.some(entry => entry.name === '.git' && entry.isDirectory())) {
+      const root = dir.replace(/[/\\]+$/, '') || dir // "/" strips to '' — keep the original instead
+      found.set(root, path.basename(root) || root)
+
+      return
+    }
+
+    const subdirs = []
+    for (const entry of entries) {
+      // Real directories only (skip symlinks to avoid loops), no hidden dirs, no
+      // known heavy trees.
+      if (!entry.isDirectory() || entry.name.startsWith('.') || JUNK_DIRS.has(entry.name)) {
+        continue
+      }
+
+      subdirs.push(path.join(dir, entry.name))
+    }
+
+    await mapLimit(subdirs, MAX_CONCURRENCY, sub => walk(sub, depth + 1))
+  }
+
+  await mapLimit(searchRoots.map(root => String(root || '').trim()).filter(Boolean), MAX_CONCURRENCY, root =>
+    walk(root, 0)
+  )
+
+  return [...found.entries()].map(([root, label]) => ({ label, root }))
+}
+
+module.exports = { scanGitRepos }
--- a/apps/desktop/electron/git-review-ops.cjs
+++ b/apps/desktop/electron/git-review-ops.cjs
@@ -0,0 +1,703 @@
+'use strict'
+
+// Git ops backing the coding rail + Codex-style review pane. Built on `simple-git`
+// (a maintained wrapper around the system git binary — same git the rest of the
+// app shells to, no native build) so we read structured status()/diffSummary()
+// results instead of hand-parsing porcelain. Reads degrade to null/empty on a
+// non-repo / remote backend; mutations reject so the renderer can toast.
+
+const { execFile } = require('node:child_process')
+const fs = require('node:fs/promises')
+const path = require('node:path')
+
+// `simple-git` is a pure-JS runtime dep that workspace dedup hoists into the
+// repo-root node_modules.  Packaged builds set `files:` in package.json, which
+// excludes node_modules from the asar, so the normal require() fails at launch
+// (issue #52735: "Cannot find module 'simple-git'").  We ship the dep's
+// closure under resources/native-deps/vendor/node_modules/ via extraResources
+// + scripts/stage-native-deps.cjs, and resolve from there when the hoisted
+// require() isn't reachable.  The `vendor/` nesting matters: electron-builder
+// drops a node_modules dir at the root of an extraResources copy but keeps a
+// nested one.  Dev mode never hits the fallback -- Node's normal lookup finds
+// the hoisted copy.
+let simpleGit
+try {
+  simpleGit = require('simple-git')
+} catch { // bare catch: any failure of the plain require falls through to the packaged copy
+  const resourcesPath = process.resourcesPath
+  if (!resourcesPath) {
+    throw new Error("git-review IPC: 'simple-git' not found and no resourcesPath to fall back to")
+  }
+  simpleGit = require(path.join(resourcesPath, 'native-deps', 'vendor', 'node_modules', 'simple-git'))
+}
+
+const { resolveRequestedPathForIpc } = require('./hardening.cjs')
+
+const COMMIT_CONTEXT_DIFF_MAX_CHARS = 120_000
+const COMMIT_CONTEXT_UNTRACKED_MAX = 80
+const UNTRACKED_LINE_COUNT_CONCURRENCY = 16
+const UNTRACKED_LINE_COUNT_MAX_BYTES = 1024 * 1024
+
+// GUI-launched Electron apps on macOS inherit only a minimal PATH (no
+// /opt/homebrew/bin or /usr/local/bin), so `gh` — and the `git` gh shells out
+// to — aren't found. Returns a copy of process.env whose PATH is prefixed with
+// the resolved gh dir + the common package-manager bins.
+function ghEnv(ghBin) {
+  const ghDir = ghBin ? path.dirname(ghBin) : ''
+  const extraDirs = [ghDir, '/opt/homebrew/bin', '/usr/local/bin', '/usr/bin'].filter(dir => dir && dir !== '.')
+  const pathParts = [...extraDirs, process.env.PATH].filter(Boolean)
+
+  return { ...process.env, PATH: pathParts.join(path.delimiter) }
+}
+
+// Run the `gh` CLI in a repo. Resolves { ok, stdout } from the exit callback so
+// callers branch on availability/auth without a throw.
+// gh missing/unauthed → ok:false.
+function runGh(args, cwd, ghBin) {
+  return new Promise(resolve => {
+    const options = { cwd, env: ghEnv(ghBin), windowsHide: true, timeout: 30_000, maxBuffer: 8 * 1024 * 1024 }
+    const settle = (err, stdout) => resolve({ ok: !err, stdout: String(stdout || '') })
+
+    // With no injected binary, the bare command name `gh` is executed.
+    execFile(ghBin || 'gh', args, options, settle)
+  })
+}
+
+function gitFor(cwd, gitBin) { // simple-git client rooted at `cwd`; binary defaults to 'git'
+  return simpleGit({ baseDir: cwd, binary: gitBin || 'git', maxConcurrentProcesses: 4, trimmed: false })
+}
+
+// simple-git reports renames as `old => new` (and `dir/{old => new}/f`); map
+// either form to the NEW path so the row addresses the real file for diff/stage.
+function resolveRenamePath(raw) {
+  const ARROW = ' => '
+  // Named `entry` rather than `path` so the `path` module is not shadowed here.
+  const entry = String(raw || '').trim()
+  const braced = entry.includes(ARROW) ? entry.match(/^(.*)\{(.*) => (.*)\}(.*)$/) : null
+
+  if (braced) {
+    // prefix + new segment + suffix; doubled slashes are collapsed afterwards.
+    return `${braced[1]}${braced[3]}${braced[4]}`.replace(/\/{2,}/g, '/')
+  }
+
+  if (!entry.includes(ARROW)) {
+    return entry
+  }
+
+  return entry.split(ARROW).pop().trim()
+}
+
+// DiffResult.files → Map<path, {added, removed}>; binary files carry no line
+// delta, so both of their counts are stored as 0.
+function countsByPath(summary) {
+  const counts = new Map()
+
+  for (const { file, binary, insertions, deletions } of summary.files) {
+    const added = binary ? 0 : insertions
+    const removed = binary ? 0 : deletions
+
+    counts.set(resolveRenamePath(file), { added, removed })
+  }
+
+  return counts
+}
+
+// Untracked files don't appear in diffSummary(); count insertions from disk so
+// the review tree can show +N for new files (matches an all-add diff view).
+// Insertions = newline bytes, plus one when the last line is unterminated.
+// Binary (NUL byte), oversized, non-file, or unreadable → 0.
+async function untrackedInsertions(cwd, relPath) {
+  const NEWLINE = 10
+  try {
+    const target = path.join(cwd, relPath)
+    const info = await fs.stat(target)
+
+    if (!info.isFile() || info.size > UNTRACKED_LINE_COUNT_MAX_BYTES) {
+      return 0
+    }
+
+    const bytes = await fs.readFile(target)
+
+    if (bytes.includes(0)) {
+      return 0
+    }
+
+    let newlines = 0
+
+    for (let i = 0; i < bytes.length; i += 1) {
+      newlines += bytes[i] === NEWLINE ? 1 : 0
+    }
+
+    const endsMidLine = bytes.length > 0 && bytes[bytes.length - 1] !== NEWLINE
+    return endsMidLine ? newlines + 1 : newlines
+  } catch {
+    return 0
+  }
+}
+
+function capText(text, maxChars, label = 'truncated') {
+  // Text that fits within `maxChars` is returned unchanged. Longer text is cut
+  // to its first `maxChars` characters and followed by a one-line
+  // `# <label>: N chars omitted` marker.
+  const full = String(text || '')
+  const fits = full.length <= maxChars
+
+  return fits ? full : `${full.slice(0, maxChars)}\n# ${label}: ${full.length - maxChars} chars omitted\n`
+}
+
+async function fillUntrackedCounts(cwd, files) {
+  // Mutates rows in place: untracked rows still at +0/-0 get `added` read from disk, in batches.
+  const uncounted = files.filter(({ status, added, removed }) => status === '?' && added === 0 && removed === 0)
+  const fill = async row => {
+    row.added = await untrackedInsertions(cwd, row.path)
+  }
+
+  for (let start = 0; start < uncounted.length; start += UNTRACKED_LINE_COUNT_CONCURRENCY) {
+    await Promise.all(uncounted.slice(start, start + UNTRACKED_LINE_COUNT_CONCURRENCY).map(fill))
+  }
+}
+
+// Resolve the base for "all branch changes": the merge-base of HEAD with the
+// remote default branch (origin/HEAD), else with the first usable trunk name.
+async function branchBase(git) {
+  const TRUNKS = ['origin/main', 'origin/master', 'main', 'master']
+  let remoteHead = ''
+
+  try {
+    remoteHead = (await git.revparse(['--abbrev-ref', 'origin/HEAD'])).trim()
+  } catch {
+    // No origin/HEAD configured.
+  }
+
+  // An empty remoteHead is skipped; the trunk fallbacks always follow it.
+  const refs = remoteHead ? [remoteHead, ...TRUNKS] : TRUNKS
+
+  for (const candidate of refs) {
+    let mergeBase = ''
+
+    try {
+      mergeBase = (await git.raw(['merge-base', 'HEAD', candidate])).trim()
+    } catch {
+      // Ref doesn't exist; try the next candidate.
+    }
+
+    if (mergeBase) {
+      return mergeBase
+    }
+  }
+
+  return null
+}
+
+// Resolve the repo's default branch NAME ("main" / "master" / …), preferring
+// the remote's HEAD, then common local trunk names. Null when none is found
+// (e.g. a fresh repo with only a feature branch). Used to offer "branch off the
+// trunk" regardless of which branch you're currently on.
+async function defaultBranchName(git) {
+  try {
+    const head = (await git.revparse(['--abbrev-ref', 'origin/HEAD'])).trim()
+
+    // "origin/main" → "main"; skip the bare "origin/HEAD" placeholder.
+    if (head && head !== 'origin/HEAD') {
+      return head.replace(/^origin\//, '')
+    }
+  } catch {
+    // No origin/HEAD configured.
+  }
+
+  // Prefer a local trunk, then a remote-only one (returns the clean name either
+  // way) so "branch off main" works even before main is checked out locally.
+  for (const namespace of ['refs/heads/', 'refs/remotes/origin/']) {
+    for (const trunk of ['main', 'master']) {
+      try {
+        const sha = (await git.raw(['rev-parse', '--verify', '--quiet', `${namespace}${trunk}`])).trim()
+        // `--quiet` fails silently (non-zero exit, nothing on stderr), so a call
+        // that resolves with empty output still means "no such ref" — same guard as branchBase.
+        if (sha) {
+          return trunk
+        }
+      } catch {
+        // Ref doesn't exist; try the next candidate.
+      }
+    }
+  }
+
+  return null
+}
+
+// One-letter classification of a status entry: '?' when either side is
+// untracked, else the staged (index) code if set, else the worktree code, else 'M'.
+function statusLetter(file) {
+  const { index, working_dir: worktree } = file
+  const untracked = index === '?' || worktree === '?'
+  const hasIndexCode = Boolean(index) && index !== ' '
+  if (untracked) {
+    return '?'
+  }
+  return ((hasIndexCode ? index : worktree) || 'M').toUpperCase()
+}
+
+const isStaged = ({ index }) => Boolean(index) && index !== ' ' && index !== '?' // index code set, not blank/untracked
+
+async function reviewList(repoPath, scope, baseRef, gitBin) {
+  // Lists changed files for `scope`; a rejected path or any git failure yields an empty list.
+  let cwd
+  try {
+    cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review list' })
+  } catch {
+    return { files: [], base: null }
+  }
+
+  const git = gitFor(cwd, gitBin)
+  try {
+    if (scope === 'branch' || scope === 'lastTurn') {
+      const base = scope === 'branch' ? await branchBase(git) : baseRef
+
+      if (!base) {
+        return { files: [], base: null }
+      }
+
+      const range = scope === 'branch' ? `${base}...HEAD` : base
+      const summary = await git.diffSummary([range])
+      const files = summary.files.map(file => ({
+        path: resolveRenamePath(file.file),
+        added: file.binary ? 0 : file.insertions,
+        removed: file.binary ? 0 : file.deletions,
+        status: 'M',
+        staged: false
+      }))
+
+      // "Last turn" also surfaces files created since the baseline (untracked); a Set keeps the dedupe linear.
+      if (scope === 'lastTurn') {
+        const status = await git.status()
+        const listed = new Set(files.map(file => file.path))
+        for (const untrackedPath of status.not_added) {
+          if (!listed.has(untrackedPath)) {
+            listed.add(untrackedPath)
+            files.push({ path: untrackedPath, added: 0, removed: 0, status: '?', staged: false })
+          }
+        }
+      }
+
+      files.sort((a, b) => a.path.localeCompare(b.path))
+      await fillUntrackedCounts(cwd, files)
+
+      return { files, base }
+    }
+
+    // Default: uncommitted (staged + unstaged + untracked), one row per path.
+    const [status, staged, unstaged] = await Promise.all([
+      git.status(),
+      git.diffSummary(['--cached']),
+      git.diffSummary([])
+    ])
+    const stagedCounts = countsByPath(staged)
+    const unstagedCounts = countsByPath(unstaged)
+
+    const files = status.files.map(file => {
+      const filePath = resolveRenamePath(file.path)
+      const sc = stagedCounts.get(filePath) || { added: 0, removed: 0 }
+      const uc = unstagedCounts.get(filePath) || { added: 0, removed: 0 }
+
+      return {
+        path: filePath,
+        added: sc.added + uc.added,
+        removed: sc.removed + uc.removed,
+        status: statusLetter(file),
+        staged: isStaged(file)
+      }
+    })
+
+    files.sort((a, b) => a.path.localeCompare(b.path))
+    await fillUntrackedCounts(cwd, files)
+
+    return { files, base: null }
+  } catch {
+    return { files: [], base: null }
+  }
+}
+
+// Unified diff for one file in the requested review scope. Returns '' when the
+// repo path is rejected, the scope has no base ref, or `git diff` fails. In the
+// default scope an empty worktree diff falls back to a synthesized all-add diff.
+async function reviewDiff(repoPath, filePath, scope, baseRef, staged, gitBin) {
+  let cwd
+  try {
+    cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review diff' })
+  } catch {
+    return ''
+  }
+
+  const git = gitFor(cwd, gitBin)
+  const pathspec = ['--', filePath]
+  const diffOrEmpty = revArgs => git.diff([...revArgs, ...pathspec]).catch(() => '')
+
+  if (scope === 'branch') {
+    const base = await branchBase(git)
+
+    return base ? diffOrEmpty([`${base}...HEAD`]) : ''
+  }
+
+  if (scope === 'lastTurn') {
+    return baseRef ? diffOrEmpty([baseRef]) : ''
+  }
+
+  if (staged) {
+    return diffOrEmpty(['--cached'])
+  }
+
+  const worktreeDiff = await diffOrEmpty([])
+  if (worktreeDiff.trim()) {
+    return worktreeDiff
+  }
+
+  // No worktree diff: treat the file as untracked and fabricate an all-add diff
+  // with --no-index. That mode exits non-zero whenever the inputs differ, so
+  // plain execFile is used and its error ignored; only stdout matters.
+  const argv = ['diff', '--no-index', '--', '/dev/null', filePath]
+  const opts = { cwd, windowsHide: true, timeout: 30_000, maxBuffer: 32 * 1024 * 1024 }
+
+  return new Promise(resolve => {
+    execFile(gitBin || 'git', argv, opts, (_err, stdout) => resolve(String(stdout || '')))
+  })
+}
+
+// Diff of ONE file's working tree against HEAD, for the file preview. A clean
+// tracked file yields '' (unlike reviewDiff, which would synthesize a full-add
+// diff for it); only a file git reports as untracked ('??') gets an all-add
+// diff. Any path-resolution or git failure also yields ''.
+async function fileDiffVsHead(repoPath, filePath, gitBin) {
+  let cwd
+  try {
+    cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'File diff' })
+  } catch {
+    return ''
+  }
+
+  const git = gitFor(cwd, gitBin)
+  const pathspec = ['--', filePath]
+
+  const vsHead = await git.diff(['HEAD', ...pathspec]).catch(() => '')
+  if (vsHead.trim()) {
+    return vsHead
+  }
+
+  // Nothing changed against HEAD: ask porcelain status whether the file is
+  // untracked before fabricating a diff for it.
+  const porcelain = await git.raw(['status', '--porcelain', ...pathspec]).catch(() => '')
+  const isUntracked = porcelain.trim().startsWith('??')
+  if (!isUntracked) {
+    return ''
+  }
+
+  // --no-index exits non-zero when the inputs differ, so the error is ignored
+  // and only stdout is used.
+  const argv = ['diff', '--no-index', '--', '/dev/null', filePath]
+  const opts = { cwd, windowsHide: true, timeout: 30_000, maxBuffer: 32 * 1024 * 1024 }
+
+  return new Promise(resolve => {
+    execFile(gitBin || 'git', argv, opts, (_err, stdout) => resolve(String(stdout || '')))
+  })
+}
+
+// Stage one path, or every change (`git add -A`) when `filePath` is falsy.
+async function reviewStage(repoPath, filePath, gitBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review stage' })
+  const addArgs = filePath ? ['add', '--', filePath] : ['add', '-A']
+  await gitFor(cwd, gitBin).raw(addArgs)
+  return { ok: true }
+}
+
+// Unstage one path, or the whole index when `filePath` is falsy (reset to HEAD).
+async function reviewUnstage(repoPath, filePath, gitBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review unstage' })
+  const resetArgs = filePath ? ['reset', '-q', 'HEAD', '--', filePath] : ['reset', '-q', 'HEAD']
+  await gitFor(cwd, gitBin).raw(resetArgs)
+  return { ok: true }
+}
+
+// Throw away local changes: `git checkout HEAD` restores tracked content, then
+// `git clean -fd` deletes untracked leftovers. Scoped to `filePath` when given,
+// else the whole tree. Destructive (the renderer is expected to confirm first);
+// a failure of either git command is ignored and { ok: true } is still returned.
+async function reviewRevert(repoPath, filePath, gitBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review revert' })
+  const git = gitFor(cwd, gitBin)
+  const ignore = () => undefined
+  const checkoutArgs = ['checkout', 'HEAD', '--', filePath || '.']
+  const cleanArgs = filePath ? ['clean', '-fd', '--', filePath] : ['clean', '-fd']
+
+  await git.raw(checkoutArgs).catch(ignore)
+  await git.raw(cleanArgs).catch(ignore)
+
+  return { ok: true }
+}
+
+// Resolve `ref` (falling back to 'HEAD' when falsy) to whatever `git rev-parse`
+// prints, trimmed. Used to capture the turn baseline for "Last turn".
+// Returns null instead of throwing: a rejected repo path, a git failure, and
+// empty output all collapse to null.
+async function reviewRevParse(repoPath, ref, gitBin) {
+  const target = ref || 'HEAD'
+  try {
+    const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review rev-parse' })
+    const git = gitFor(cwd, gitBin)
+    const sha = await git.revparse([target])
+
+    return sha.trim() || null
+  } catch {
+    return null
+  }
+}
+
+// Commit with `message`. Like VS Code's "commit all": when nothing is staged,
+// everything is staged first (`git add -A`). With `push` truthy the branch is
+// then pushed, creating the upstream on origin if none is tracked yet.
+async function reviewCommit(repoPath, message, push, gitBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review commit' })
+  const git = gitFor(cwd, gitBin)
+  const { staged } = await git.status()
+  if (staged.length === 0) {
+    await git.raw(['add', '-A'])
+  }
+
+  await git.commit(message)
+  if (!push) {
+    return { ok: true }
+  }
+
+  // Re-read status after the commit to learn the tracking/current branch.
+  const { tracking, current } = await git.status()
+  if (tracking) {
+    await git.push()
+  } else if (current) {
+    await git.raw(['push', '-u', 'origin', current])
+  }
+
+  return { ok: true }
+}
+
+// Read-only context for drafting a commit message, as { diff, recent }:
+//   diff   - what reviewCommit would land (the staged diff when anything is
+//            staged, else everything vs HEAD), size-capped, followed by a
+//            '#'-prefixed list of untracked files since those carry no diff.
+//   recent - subjects of the last 10 commits, for style.
+// Both are '' when the repo path is rejected or git status can't be read.
+async function reviewCommitContext(repoPath, gitBin) {
+  const empty = { diff: '', recent: '' }
+
+  let cwd
+  try {
+    cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review commit context' })
+  } catch {
+    return empty
+  }
+
+  const git = gitFor(cwd, gitBin)
+
+  let status
+  try {
+    status = await git.status()
+  } catch {
+    return empty
+  }
+
+  // Staged changes win; with an empty index fall back to everything vs HEAD.
+  const diffArgs = status.staged.length > 0 ? ['--cached'] : ['HEAD']
+  const rawDiff = await git.diff(diffArgs).catch(() => '')
+  const truncationNote = 'diff truncated for commit-message generation'
+  let diff = capText(rawDiff, COMMIT_CONTEXT_DIFF_MAX_CHARS, truncationNote)
+
+  // Untracked files are listed by name, capped at COMMIT_CONTEXT_UNTRACKED_MAX.
+  const untracked = status.not_added || []
+  if (untracked.length > 0) {
+    const visible = untracked.slice(0, COMMIT_CONTEXT_UNTRACKED_MAX)
+    const omitted = untracked.length - visible.length
+    const listing = visible.map(p => `#   ${p}`).join('\n')
+    const overflow = omitted > 0 ? `#   ... ${omitted} more omitted\n` : ''
+    const note = `\n# New (untracked) files:\n${listing}\n${overflow}`
+
+    diff = diff ? `${diff}${note}` : note
+  }
+
+  const recent = await git.raw(['log', '-n', '10', '--pretty=format:%s']).catch(() => '')
+
+  return { diff: diff || '', recent: String(recent || '').trim() }
+}
+
+// Push the current branch, creating the origin upstream (`-u`) when none is tracked.
+async function reviewPush(repoPath, gitBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review push' })
+  const git = gitFor(cwd, gitBin)
+  const { tracking, current } = await git.status()
+  if (tracking) {
+    await git.push()
+  } else if (current) {
+    await git.raw(['push', '-u', 'origin', current])
+  }
+
+  return { ok: true }
+}
+
+// Probe the GitHub CLI for the PR button (read-only). `ghReady` is false when
+// the path is rejected or `gh auth status` fails (gh missing or logged out);
+// `pr` is { url, state, number } for the branch's PR, or null when none/unknown.
+async function reviewShipInfo(repoPath, ghBin) {
+  const notReady = { ghReady: false, pr: null }
+  const noPr = { ghReady: true, pr: null }
+  let cwd
+  try {
+    cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review ship info' })
+  } catch {
+    return notReady
+  }
+
+  const auth = await runGh(['auth', 'status'], cwd, ghBin)
+  if (!auth.ok) {
+    return notReady
+  }
+
+  // gh exits non-zero when no PR exists for the branch; that is not an error.
+  const view = await runGh(['pr', 'view', '--json', 'url,state,number'], cwd, ghBin)
+  if (!view.ok) {
+    return noPr
+  }
+
+  try {
+    const pr = JSON.parse(view.stdout)
+    const hasUrl = Boolean(pr && pr.url)
+
+    return hasUrl ? { ghReady: true, pr: { url: pr.url, state: pr.state, number: pr.number } } : noPr
+  } catch {
+    return noPr
+  }
+}
+
+// Open a PR for the current branch via `gh pr create --fill`. A best-effort
+// push runs first (its failure is ignored) so gh has a remote ref to use.
+// Resolves { url } with the last non-empty stdout line; throws if gh fails.
+async function reviewCreatePr(repoPath, gitBin, ghBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Review create PR' })
+
+  await reviewPush(repoPath, gitBin).catch(() => undefined)
+
+  const { ok, stdout } = await runGh(['pr', 'create', '--fill'], cwd, ghBin)
+  if (!ok) {
+    throw new Error('gh pr create failed (is gh installed and authenticated?)')
+  }
+
+  const lines = stdout.trim().split('\n').filter(Boolean)
+
+  return { url: lines.pop() || '' }
+}
+
+// Compact working-tree status for the composer coding rail (branch, ahead/behind,
+// change counts, +/- line totals, first 200 changed files); null when unavailable.
+async function repoStatus(repoPath, gitBin) {
+  let cwd
+
+  try {
+    cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Repo status' })
+  } catch {
+    return null
+  }
+
+  // Session cwds can point at a deleted worktree for a moment (or forever in a
+  // stale row). simple-git throws at construction time on a missing baseDir, so
+  // fail soft and hide the coding rail instead of spamming IPC handler errors.
+  try {
+    const stat = await fs.stat(cwd)
+    if (!stat.isDirectory()) {
+      return null
+    }
+  } catch {
+    return null
+  }
+
+  let git
+  try {
+    git = gitFor(cwd, gitBin)
+  } catch {
+    return null
+  }
+  let status
+
+  try {
+    status = await git.status()
+  } catch {
+    // Not a repo / git unavailable / remote backend.
+    return null
+  }
+
+  const detached = typeof status.detached === 'boolean' ? status.detached : !status.current
+  const files = status.files.map(file => ({
+    path: file.path,
+    staged: isStaged(file),
+    unstaged: Boolean(file.working_dir && file.working_dir !== ' ' && file.working_dir !== '?'),
+    untracked: file.index === '?' || file.working_dir === '?',
+    conflicted: file.index === 'U' || file.working_dir === 'U'
+  }))
+
+  const result = {
+    branch: detached ? null : status.current || null,
+    defaultBranch: await defaultBranchName(git),
+    detached,
+    ahead: status.ahead || 0,
+    behind: status.behind || 0,
+    staged: files.filter(f => f.staged).length,
+    unstaged: files.filter(f => f.unstaged).length,
+    untracked: status.not_added.length,
+    conflicted: status.conflicted.length,
+    changed: files.length,
+    added: 0,
+    removed: 0,
+    files: files.slice(0, 200)
+  }
+
+  // +/- vs HEAD (staged + unstaged tracked changes). No HEAD yet → leave 0.
+  try {
+    const summary = await git.diffSummary(['HEAD'])
+    result.added = summary.insertions
+    result.removed = summary.deletions
+  } catch {
+    // No commits yet.
+  }
+
+  // `git diff HEAD` ignores untracked files, so a turn that only creates new
+  // files (the common case — a fresh module, a demo dir) showed +0 in the rail
+  // while the review pane counted them. Fold untracked insertions into `added`
+  // so the rail matches reality. Work is bounded: only the first 500 untracked
+  // paths are counted, in batches of UNTRACKED_LINE_COUNT_CONCURRENCY, so a
+  // huge untracked tree can't stall the probe.
+  try {
+    const untracked = status.not_added.slice(0, 500)
+    for (let i = 0; i < untracked.length; i += UNTRACKED_LINE_COUNT_CONCURRENCY) {
+      const batch = await Promise.all(
+        untracked.slice(i, i + UNTRACKED_LINE_COUNT_CONCURRENCY).map(path => untrackedInsertions(cwd, path))
+      )
+      result.added += batch.reduce((sum, n) => sum + n, 0)
+    }
+  } catch {
+    // Best-effort: a probe failure just leaves untracked lines uncounted.
+  }
+
+  return result
+}
+
+module.exports = {
+  branchBase,
+  fileDiffVsHead,
+  repoStatus,
+  resolveRenamePath,
+  reviewCommit,
+  reviewCommitContext,
+  reviewCreatePr,
+  reviewDiff,
+  reviewList,
+  reviewPush,
+  reviewRevParse,
+  reviewRevert,
+  reviewShipInfo,
+  reviewStage,
+  reviewUnstage
+}
--- a/apps/desktop/electron/git-review-ops.test.cjs
+++ b/apps/desktop/electron/git-review-ops.test.cjs
@@ -0,0 +1,22 @@
+'use strict'
+
+const assert = require('node:assert/strict')
+const test = require('node:test')
+
+const { resolveRenamePath } = require('./git-review-ops.cjs')
+
+test('resolveRenamePath: plain path is unchanged', () => {
+  assert.equal(resolveRenamePath('src/a.ts'), 'src/a.ts')
+})
+
+test('resolveRenamePath: simple rename resolves to the new path', () => {
+  assert.equal(resolveRenamePath('old.ts => new.ts'), 'new.ts')
+})
+
+test('resolveRenamePath: brace rename resolves to the new path', () => {
+  assert.equal(resolveRenamePath('src/{old => new}/file.ts'), 'src/new/file.ts')
+})
+
+test('resolveRenamePath: brace rename collapsing a segment', () => {
+  assert.equal(resolveRenamePath('src/{lib => }/file.ts'), 'src/file.ts')
+})
--- a/apps/desktop/electron/git-worktree-ops.cjs
+++ b/apps/desktop/electron/git-worktree-ops.cjs
@@ -0,0 +1,350 @@
+'use strict'
+
+// Git-driven worktree operations for the desktop "Start work" flow: spin up a
+// fresh worktree the lightest way (`git worktree add -b`), list real worktrees,
+// and remove them. Git is the source of truth; the renderer just drives these.
+
+const path = require('node:path')
+const fs = require('node:fs')
+const { execFile } = require('node:child_process')
+
+const { resolveRequestedPathForIpc } = require('./hardening.cjs')
+
+// Promise wrapper around execFile for one git invocation in `cwd`. Resolves
+// with stdout as a string; on failure rejects with execFile's error, with the
+// captured stderr attached as `err.stderr` for callers to inspect.
+function runGit(gitBin, args, cwd) {
+  const options = { cwd, windowsHide: true, timeout: 30_000, maxBuffer: 8 * 1024 * 1024 }
+
+  return new Promise((resolve, reject) => {
+    const onExit = (err, stdout, stderr) => {
+      if (!err) {
+        resolve(String(stdout || ''))
+        return
+      }
+      err.stderr = String(stderr || '')
+      reject(err)
+    }
+
+    execFile(gitBin, args, options, onExit)
+  })
+}
+
+// Parse `git worktree list --porcelain` into records of
+// { path, branch, detached, bare, locked }; the first record is the main
+// worktree. Accepts LF or CRLF line endings so the exact-match flag lines
+// ('detached', 'bare') still match on output that carries carriage returns.
+function parseWorktrees(out) {
+  const trees = []
+  let cur = null
+
+  for (const line of String(out || '').split(/\r?\n/)) {
+    if (line.startsWith('worktree ')) {
+      if (cur) {
+        trees.push(cur)
+      }
+
+      cur = { path: line.slice(9).trim(), branch: null, detached: false, bare: false, locked: false }
+    } else if (!cur) {
+      continue
+    } else if (line.startsWith('branch ')) {
+      cur.branch = line.slice(7).trim().replace(/^refs\/heads\//, '')
+    } else if (line === 'detached') {
+      cur.detached = true
+    } else if (line === 'bare') {
+      cur.bare = true
+    } else if (line.startsWith('locked')) {
+      cur.locked = true
+    }
+  }
+
+  if (cur) {
+    trees.push(cur)
+  }
+
+  return trees
+}
+
+// Every worktree of the repo at `repoPath` as { path, branch, isMain, detached,
+// locked }; the first record is isMain. A rejected path or git failure yields [].
+async function listWorktrees(repoPath, gitBin) {
+  try {
+    const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Worktree list' })
+    const porcelain = await runGit(gitBin, ['worktree', 'list', '--porcelain'], cwd)
+    const rows = []
+
+    for (const [index, tree] of parseWorktrees(porcelain).entries()) {
+      rows.push({
+        path: tree.path,
+        branch: tree.branch,
+        isMain: index === 0,
+        detached: tree.detached,
+        locked: tree.locked
+      })
+    }
+
+    return rows
+  } catch {
+    return []
+  }
+}
+
+// Git-ref-safe branch name, or '' when nothing usable remains. Mirrors the
+// renderer's `gitRef` so a bad value can't reach git whatever the caller is:
+// whitespace → '-', chars outside [\w./-] dropped, repeats collapsed, edges trimmed.
+function sanitizeBranch(name) {
+  // Map/drop characters first so the collapsing passes see the final alphabet.
+  const mapped = String(name || '').replace(/\s+/g, '-').replace(/[^\w./-]/g, '')
+  // Collapse runs of each separator down to a single character.
+  const collapsed = mapped.replace(/-{2,}/g, '-').replace(/\/{2,}/g, '/').replace(/\.{2,}/g, '.')
+
+  // Finally strip separators from both ends.
+  return collapsed.replace(/^[-./]+|[-./]+$/g, '')
+}
+
+// Lowercase directory-name slug: non-alphanumeric runs become '-', capped at 40
+// characters with no dangling '-'; falls back to 'work' when empty.
+function slugify(name) {
+  const dashed = String(name || '').trim().toLowerCase().replace(/[^a-z0-9]+/g, '-')
+  const trimmed = dashed.replace(/^-+|-+$/g, '')
+
+  // Cutting at 40 can expose a trailing dash, so strip once more.
+  const capped = trimmed.slice(0, 40).replace(/-+$/g, '')
+
+  return capped || 'work'
+}
+
+const TRUNK_BRANCHES = ['main', 'master']
+
+// Run git via runGit and return its trimmed stdout; any failure collapses to
+// '' so callers can treat "no answer" and "error" alike.
+async function gitLine(gitBin, args, cwd) {
+  const output = await runGit(gitBin, args, cwd).catch(() => null)
+
+  return output === null ? '' : output.trim()
+}
+
+// Default branch name, trying in order: origin's HEAD symref, `init.defaultBranch`,
+// then the first TRUNK_BRANCHES entry that exists locally. '' when none resolves.
+async function defaultBranch(gitBin, cwd) {
+  const originHead = await gitLine(gitBin, ['symbolic-ref', '--quiet', '--short', 'refs/remotes/origin/HEAD'], cwd)
+  const fromRemote = originHead.replace(/^origin\//, '')
+  if (fromRemote) {
+    return fromRemote
+  }
+
+  const fromConfig = await gitLine(gitBin, ['config', '--get', 'init.defaultBranch'], cwd)
+  if (fromConfig) {
+    return fromConfig
+  }
+
+  for (const candidate of TRUNK_BRANCHES) {
+    const found = await gitLine(gitBin, ['show-ref', '--verify', `refs/heads/${candidate}`], cwd)
+    if (found) {
+      return candidate
+    }
+  }
+
+  return ''
+}
+
+// Make `dir` usable as a worktree source. If `git rev-parse` says it is not
+// inside a work tree (or the probe fails), run `git init`; if the repo has no
+// HEAD commit, add an empty root commit ("Initial commit", `--allow-empty`, so
+// no user files are staged). A repo that already has a HEAD is left untouched,
+// and a directory nested inside an existing work tree is never re-initialized.
+async function ensureGitRepo(gitBin, dir) {
+  let needsRoot = false
+
+  try {
+    const inside = (await runGit(gitBin, ['rev-parse', '--is-inside-work-tree'], dir)).trim()
+
+    if (inside !== 'true') {
+      await runGit(gitBin, ['init'], dir)
+      needsRoot = true
+    } else {
+      // Repo exists; a worktree still needs a HEAD to branch from.
+      try {
+        await runGit(gitBin, ['rev-parse', '--verify', 'HEAD'], dir)
+      } catch {
+        needsRoot = true
+      }
+    }
+  } catch {
+    await runGit(gitBin, ['init'], dir)
+    needsRoot = true
+  }
+
+  if (needsRoot) {
+    // Inline identity so the seed commit lands even with no global git config.
+    await runGit(
+      gitBin,
+      [
+        '-c',
+        'user.email=hermes@localhost',
+        '-c',
+        'user.name=Hermes',
+        'commit',
+        '--allow-empty',
+        '-m',
+        'Initial commit'
+      ],
+      dir
+    )
+  }
+}
+
+// Path of the repo's main worktree (the entry listWorktrees flags isMain), so
+// `.worktrees/` nests under the primary checkout; falls back to `cwd` itself.
+async function mainRoot(gitBin, cwd) {
+  const trees = await listWorktrees(cwd, gitBin)
+  const primary = trees.find(({ isMain }) => isMain)
+
+  return primary === undefined ? cwd : primary.path
+}
+
+// First path that does not exist yet: `base` itself, then `base-2`, `base-3`, ...
+function uniqueDir(base) {
+  let suffix = 1
+  const candidate = () => (suffix === 1 ? base : `${base}-${suffix}`)
+
+  while (fs.existsSync(candidate())) {
+    suffix += 1
+  }
+
+  return candidate()
+}
+
+// Check an existing branch out for work. The default branch is switched to in
+// the main checkout itself; any other branch gets its own directory under
+// `<root>/.worktrees/`. Throws when `name` sanitizes to nothing.
+async function addExistingBranchWorktree(gitBin, root, name) {
+  const branch = sanitizeBranch(name)
+  if (!branch) {
+    throw new Error('Branch name is required.')
+  }
+
+  const trunk = await defaultBranch(gitBin, root)
+  const isTrunk = branch === trunk
+  const target = isTrunk ? root : uniqueDir(path.join(root, '.worktrees', slugify(branch)))
+  const args = isTrunk ? ['switch', branch] : ['worktree', 'add', target, branch]
+
+  await runGit(gitBin, args, root)
+
+  return { path: target, branch, repoRoot: root }
+}
+
+// Create a worktree under `<main root>/.worktrees/`, initializing the repo
+// first if needed. `options`: { existingBranch } checks out that branch;
+// otherwise { name, branch, base } name the dir slug, new branch and start
+// point. Throws on a `base` that starts with '-' (git would read it as an
+// option) and on git failures. Resolves { path, branch, repoRoot }.
+async function addWorktree(repoPath, options, gitBin) {
+  const resolved = resolveRequestedPathForIpc(repoPath, { purpose: 'Worktree add' })
+  await ensureGitRepo(gitBin, resolved)
+  const root = await mainRoot(gitBin, resolved)
+  const opts = options || {}
+  if (opts.existingBranch) {
+    return addExistingBranchWorktree(gitBin, root, opts.existingBranch)
+  }
+
+  const base = opts.base ? String(opts.base) : ''
+  if (base.startsWith('-')) {
+    throw new Error('Base ref must not start with "-".')
+  }
+
+  const slug = slugify(opts.name || `work-${Date.now().toString(36)}`)
+  const branch = sanitizeBranch(opts.branch) || `hermes/${slug}`
+  const dir = uniqueDir(path.join(root, '.worktrees', slug))
+  const args = ['worktree', 'add', '-b', branch, dir, ...(base ? [base] : [])]
+
+  try {
+    await runGit(gitBin, args, root)
+  } catch (err) {
+    // The branch may already exist: check it out into the new dir instead.
+    if (!/already exists/i.test(err.stderr || '')) {
+      throw err
+    }
+    await runGit(gitBin, ['worktree', 'add', dir, branch], root)
+  }
+
+  return { path: dir, branch, repoRoot: root }
+}
+
+// Remove a linked worktree with `git worktree remove`, run from the repo's
+// main root; `options.force` adds `--force`. Both paths are validated through
+// resolveRequestedPathForIpc. Resolves { removed: <resolved tree path> }.
+async function removeWorktree(repoPath, worktreePath, options, gitBin) {
+  const resolvedRepo = resolveRequestedPathForIpc(repoPath, { purpose: 'Worktree remove (repo)' })
+  const resolvedTree = resolveRequestedPathForIpc(worktreePath, { purpose: 'Worktree remove (tree)' })
+  const root = await mainRoot(gitBin, resolvedRepo)
+
+  const forceFlag = options && options.force ? ['--force'] : []
+  const args = ['worktree', 'remove', ...forceFlag, resolvedTree]
+
+  await runGit(gitBin, args, root)
+
+  return { removed: resolvedTree }
+}
+
+// Local branches for the "convert a branch into a worktree" picker, most
+// recently committed first, each as { name, checkedOut, isDefault,
+// worktreePath }. checkedOut/worktreePath come from the worktree list and
+// isDefault from defaultBranch(). A rejected path or git failure yields [].
+async function listBranches(repoPath, gitBin) {
+  const refArgs = ['for-each-ref', '--format=%(refname:short)', '--sort=-committerdate', 'refs/heads']
+
+  try {
+    const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Branch list' })
+    const out = await runGit(gitBin, refArgs, cwd)
+    const trees = await listWorktrees(cwd, gitBin)
+    const trunk = await defaultBranch(gitBin, cwd)
+
+    // Branch name -> path of the worktree that has it checked out.
+    const pathByBranch = new Map()
+    for (const tree of trees) {
+      if (tree.branch) {
+        pathByBranch.set(tree.branch, tree.path)
+      }
+    }
+
+    // One branch per output line; blank lines are skipped.
+    const names = out
+      .split('\n')
+      .map(line => line.trim())
+      .filter(Boolean)
+
+    return names.map(name => ({
+      name,
+      checkedOut: pathByBranch.has(name),
+      isDefault: Boolean(trunk && name === trunk),
+      worktreePath: pathByBranch.get(name) || null
+    }))
+  } catch {
+    return []
+  }
+}
+
+// `git switch` the checkout at `repoPath` to the sanitized `branch`. Throws
+// when the name sanitizes to '' or git refuses; resolves { branch }.
+async function switchBranch(repoPath, branch, gitBin) {
+  const cwd = resolveRequestedPathForIpc(repoPath, { purpose: 'Branch switch' })
+  const target = sanitizeBranch(branch)
+  if (!target) {
+    throw new Error('Branch name is required.')
+  }
+
+  await runGit(gitBin, ['switch', target], cwd)
+  return { branch: target }
+}
+
+module.exports = {
+  addWorktree,
+  ensureGitRepo,
+  listBranches,
+  listWorktrees,
+  parseWorktrees,
+  removeWorktree,
+  sanitizeBranch,
+  switchBranch
+}
--- a/apps/desktop/electron/git-worktree-ops.test.cjs
+++ b/apps/desktop/electron/git-worktree-ops.test.cjs
@@ -0,0 +1,214 @@
+'use strict'
+
+const assert = require('node:assert/strict')
+const { execFileSync } = require('node:child_process')
+const fs = require('node:fs')
+const os = require('node:os')
+const path = require('node:path')
+const test = require('node:test')
+
+const {
+  addWorktree,
+  ensureGitRepo,
+  listBranches,
+  parseWorktrees,
+  sanitizeBranch,
+  switchBranch
+} = require('./git-worktree-ops.cjs')
+
+test('sanitizeBranch: spaces → hyphens, forbidden chars dropped, edges trimmed', () => {
+  assert.equal(sanitizeBranch('beach vibes'), 'beach-vibes')
+  assert.equal(sanitizeBranch('feat/cool thing'), 'feat/cool-thing')
+  assert.equal(sanitizeBranch('  wip~^:? '), 'wip')
+  assert.equal(sanitizeBranch('///'), '')
+})
+
+test('parseWorktrees: main checkout + linked worktree', () => {
+  const out = [
+    'worktree /repo',
+    'HEAD abc123',
+    'branch refs/heads/main',
+    '',
+    'worktree /repo/.worktrees/feat',
+    'HEAD def456',
+    'branch refs/heads/hermes/feat',
+    ''
+  ].join('\n')
+
+  const trees = parseWorktrees(out)
+
+  assert.equal(trees.length, 2)
+  assert.equal(trees[0].path, '/repo')
+  assert.equal(trees[0].branch, 'main')
+  assert.equal(trees[1].path, '/repo/.worktrees/feat')
+  assert.equal(trees[1].branch, 'hermes/feat')
+})
+
+test('parseWorktrees: detached + locked flags', () => {
+  const out = ['worktree /repo/wt', 'HEAD abc', 'detached', 'locked reason', ''].join('\n')
+  const trees = parseWorktrees(out)
+
+  assert.equal(trees.length, 1)
+  assert.equal(trees[0].detached, true)
+  assert.equal(trees[0].locked, true)
+  assert.equal(trees[0].branch, null)
+})
+
+test('parseWorktrees: empty input', () => {
+  assert.deepEqual(parseWorktrees(''), [])
+})
+
+test('ensureGitRepo: inits a plain dir with a root commit so worktrees branch', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-wt-'))
+  const git = (...args) => execFileSync('git', args, { cwd: dir }).toString().trim()
+
+  try {
+    await ensureGitRepo('git', dir)
+    assert.match(git('rev-parse', '--verify', 'HEAD'), /^[0-9a-f]{7,}$/)
+
+    // The whole point: a worktree can now branch off the seeded root commit.
+    execFileSync('git', ['worktree', 'add', '-b', 'wt', path.join(dir, '.worktrees', 'wt')], { cwd: dir })
+    assert.ok(fs.existsSync(path.join(dir, '.worktrees', 'wt')))
+
+    // Idempotent: an already-committed repo gets no extra commit.
+    await ensureGitRepo('git', dir)
+    assert.equal(git('rev-list', '--count', 'HEAD'), '1')
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('switchBranch: switches a normal checkout branch', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-switch-'))
+  const git = (...args) => execFileSync('git', args, { cwd: dir }).toString().trim()
+
+  try {
+    await ensureGitRepo('git', dir)
+    execFileSync('git', ['branch', 'feature'], { cwd: dir })
+
+    await switchBranch(dir, 'feature', 'git')
+
+    assert.equal(git('branch', '--show-current'), 'feature')
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('listBranches: lists locals and flags the checked-out branch', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-branches-'))
+
+  try {
+    await ensureGitRepo('git', dir)
+    const current = execFileSync('git', ['branch', '--show-current'], { cwd: dir }).toString().trim()
+    execFileSync('git', ['branch', 'feature'], { cwd: dir })
+
+    const branches = await listBranches(dir, 'git')
+    const names = branches.map(b => b.name).sort()
+
+    assert.deepEqual(names, [current, 'feature'].sort())
+    // The repo's own checkout is flagged; the unused branch is convertible.
+    assert.equal(branches.find(b => b.name === current).checkedOut, true)
+    assert.equal(branches.find(b => b.name === current).isDefault, true)
+    assert.equal(fs.realpathSync(branches.find(b => b.name === current).worktreePath), fs.realpathSync(dir))
+    assert.equal(branches.find(b => b.name === 'feature').checkedOut, false)
+    assert.equal(branches.find(b => b.name === 'feature').isDefault, false)
+    assert.equal(branches.find(b => b.name === 'feature').worktreePath, null)
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('listBranches: flags a free default branch as default, not checked out', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-branches-default-'))
+  const git = (...args) => execFileSync('git', args, { cwd: dir }).toString().trim()
+
+  try {
+    await ensureGitRepo('git', dir)
+    const trunk = git('branch', '--show-current')
+    execFileSync('git', ['switch', '-c', 'rawr'], { cwd: dir })
+
+    const branches = await listBranches(dir, 'git')
+    const defaultBranch = branches.find(b => b.name === trunk)
+
+    assert.equal(defaultBranch.checkedOut, false)
+    assert.equal(defaultBranch.isDefault, true)
+    assert.equal(defaultBranch.worktreePath, null)
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('listBranches: a branch claimed by a worktree is flagged checked out', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-branches-wt-'))
+
+  try {
+    await ensureGitRepo('git', dir)
+    execFileSync('git', ['branch', 'feature'], { cwd: dir })
+    // addWorktree converts the existing "feature" branch into a worktree.
+    const result = await addWorktree(dir, { existingBranch: 'feature' }, 'git')
+
+    assert.equal(result.branch, 'feature')
+    assert.ok(fs.existsSync(result.path))
+
+    const branches = await listBranches(dir, 'git')
+
+    assert.equal(branches.find(b => b.name === 'feature').checkedOut, true)
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('listBranches: empty on a non-repo path', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-nonrepo-'))
+
+  try {
+    assert.deepEqual(await listBranches(dir, 'git'), [])
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('addWorktree: existingBranch checks the branch out without a new branch', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-convert-'))
+  const git = (...args) => execFileSync('git', args, { cwd: dir }).toString().trim()
+
+  try {
+    await ensureGitRepo('git', dir)
+    execFileSync('git', ['branch', 'cool/feature'], { cwd: dir })
+
+    const before = git('branch', '--list').split('\n').length
+    const result = await addWorktree(dir, { existingBranch: 'cool/feature' }, 'git')
+
+    // No new branch was created — only the existing one is checked out.
+    assert.equal(git('branch', '--list').split('\n').length, before)
+    assert.equal(result.branch, 'cool/feature')
+    // Dir is named off the branch slug, nested under the main repo's .worktrees.
+    assert.match(result.path, /[/\\]\.worktrees[/\\]cool-feature/)
+    assert.equal(
+      execFileSync('git', ['branch', '--show-current'], { cwd: result.path }).toString().trim(),
+      'cool/feature'
+    )
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
+
+test('addWorktree: existing default branch switches the main checkout, not .worktrees/main', async () => {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-convert-default-'))
+  const git = (...args) => execFileSync('git', args, { cwd: dir }).toString().trim()
+
+  try {
+    await ensureGitRepo('git', dir)
+    const trunk = git('branch', '--show-current')
+    execFileSync('git', ['switch', '-c', 'rawr'], { cwd: dir })
+
+    const result = await addWorktree(dir, { existingBranch: trunk }, 'git')
+
+    assert.equal(result.branch, trunk)
+    assert.equal(fs.realpathSync(result.path), fs.realpathSync(dir))
+    assert.equal(git('branch', '--show-current'), trunk)
+    assert.equal(fs.existsSync(path.join(dir, '.worktrees', trunk)), false)
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true })
+  }
+})
--- a/apps/desktop/electron/git-worktrees.cjs
+++ b/apps/desktop/electron/git-worktrees.cjs
@@ -1,174 +0,0 @@
-'use strict'
-
-// Resolve git-worktree relationships for a set of session cwds, reading git's
-// on-disk metadata directly (no `git` spawn per path):
-//
-//   - A normal checkout has a `.git` DIRECTORY at its root → it's the main
-//     worktree; its repo root IS that directory's parent.
-//   - A linked worktree has a `.git` FILE: `gitdir: <repo>/.git/worktrees/<name>`.
-//     That admin dir's `commondir` points back at the shared `<repo>/.git`, whose
-//     parent is the main repo root.
-//
-// Grouping by repoRoot therefore clusters a repo's main checkout with all of its
-// linked worktrees, regardless of how the worktree directories are named. The
-// branch (read from the worktree's own HEAD) gives each worktree a meaningful
-// label.
-
-const fs = require('node:fs')
-const path = require('node:path')
-const { resolveRequestedPathForIpc } = require('./hardening.cjs')
-
-// Walk up from `start` to the nearest ancestor that carries a `.git` entry
-// (file for a linked worktree, dir for the main checkout). Capped so a stray
-// path can't loop forever.
-function findGitHost(start, fsImpl) {
-  let dir = start
-
-  for (let i = 0; i < 64; i += 1) {
-    const dotgit = path.join(dir, '.git')
-
-    try {
-      if (fsImpl.existsSync(dotgit)) {
-        return dir
-      }
-    } catch {
-      return null
-    }
-
-    const parent = path.dirname(dir)
-
-    if (parent === dir) {
-      return null
-    }
-
-    dir = parent
-  }
-
-  return null
-}
-
-function readBranch(gitDir, fsImpl) {
-  try {
-    const head = fsImpl.readFileSync(path.join(gitDir, 'HEAD'), 'utf8').trim()
-    const ref = head.match(/^ref:\s*refs\/heads\/(.+)$/)
-
-    if (ref) {
-      return ref[1]
-    }
-
-    // Detached HEAD: surface a short sha so the worktree still gets a label.
-    return /^[0-9a-f]{7,40}$/i.test(head) ? head.slice(0, 8) : null
-  } catch {
-    return null
-  }
-}
-
-// Given the directory that owns the `.git` entry, resolve its worktree identity.
-function resolveFromHost(host, fsImpl) {
-  const dotgit = path.join(host, '.git')
-  let stat
-
-  try {
-    stat = fsImpl.statSync(dotgit)
-  } catch {
-    return null
-  }
-
-  if (stat.isDirectory()) {
-    return {
-      repoRoot: host,
-      worktreeRoot: host,
-      isMainWorktree: true,
-      branch: readBranch(dotgit, fsImpl)
-    }
-  }
-
-  // Linked worktree: `.git` is a file pointing at the admin dir.
-  let contents
-
-  try {
-    contents = fsImpl.readFileSync(dotgit, 'utf8').trim()
-  } catch {
-    return null
-  }
-
-  const match = contents.match(/^gitdir:\s*(.+)$/m)
-
-  if (!match) {
-    return null
-  }
-
-  const adminDir = path.resolve(host, match[1].trim())
-
-  // `commondir` resolves to the shared `<repo>/.git`; fall back to walking two
-  // levels up from `<repo>/.git/worktrees/<name>` if it's missing.
-  let commonDir
-
-  try {
-    const rel = fsImpl.readFileSync(path.join(adminDir, 'commondir'), 'utf8').trim()
-    commonDir = path.resolve(adminDir, rel)
-  } catch {
-    commonDir = path.dirname(path.dirname(adminDir))
-  }
-
-  return {
-    repoRoot: path.dirname(commonDir),
-    worktreeRoot: host,
-    isMainWorktree: false,
-    branch: readBranch(adminDir, fsImpl)
-  }
-}
-
-function resolveWorktree(startPath, fsImpl = fs) {
-  let resolved
-
-  try {
-    resolved = resolveRequestedPathForIpc(startPath, { purpose: 'Worktree lookup' })
-  } catch {
-    return null
-  }
-
-  let start = resolved
-
-  try {
-    const stat = fsImpl.statSync(resolved)
-
-    if (!stat.isDirectory()) {
-      start = path.dirname(resolved)
-    }
-  } catch {
-    return null
-  }
-
-  const host = findGitHost(start, fsImpl)
-
-  if (!host) {
-    return null
-  }
-
-  return resolveFromHost(host, fsImpl)
-}
-
-// Batch entry point for the renderer: maps each requested cwd to its worktree
-// info (or null when it isn't inside a git checkout / can't be read). Dedupes so
-// many sessions sharing a cwd cost one lookup.
-async function worktreesForIpc(cwds, options = {}) {
-  const fsImpl = options.fs || fs
-  const list = Array.isArray(cwds) ? cwds : []
-  const out = {}
-
-  for (const cwd of list) {
-    if (typeof cwd !== 'string' || !cwd.trim() || cwd in out) {
-      continue
-    }
-
-    out[cwd] = resolveWorktree(cwd, fsImpl)
-  }
-
-  return out
-}
-
-module.exports = {
-  resolveWorktree,
-  worktreesForIpc
-}
--- a/Show More
+++ b/Show More