feat(mcp): adopt mcp__server__tool naming convention

Port from anomalyco/opencode#33533. Native MCP tools now register as mcp__<server>__<tool> (double-underscore delimiter) instead of mcp_<server>_<tool>, aligning with the convention used by Claude Code, Codex, and OpenCode. The double-underscore delimiter disambiguates the server/tool boundary even when either component contains underscores (the single-underscore form was ambiguous, which is why is_mcp_tool_parallel_safe already had to track provenance in a side-map). It also unifies native registration with the Anthropic-OAuth wire form (_MCP_TOOL_PREFIX = 'mcp__'), so the single->double promotion that path performed is now a no-op for native tools while still handling legacy replayed names. - tools/mcp_tool.py: add MCP_TOOL_NAME_PREFIX + mcp_prefixed_tool_name() helper; route _convert_mcp_schema, utility schemas, refresh stale-set, and the parallel-safe prefix gate through it - agent/transports/codex_event_projector.py: mirror convention in the deterministic call_id input for MCP server-executed tool calls - tests: update produced-name assertions to the new convention
2026-07-04 09:07:20 +08:00 · 2026-06-25 17:08:44 -07:00
1110 changed files with 10652 additions and 68851 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -66,12 +66,8 @@ runtime/

 # ---------- Not needed inside the Docker image ----------

-# Desktop app source (Tauri/Electron); never installed in the container.
-# apps/shared is the dashboard↔desktop websocket helper and is linked from
-# web/package.json as a file: workspace dep — keep it in the build context.
+# Desktop app source (Tauri/Electron); never installed in the container
 apps/
-!apps/shared/
-!apps/shared/**

 # Test suite — not shipped in production images
 tests/
--- a/.envrc
+++ b/.envrc
@@ -1,5 +1,5 @@
 watch_file pyproject.toml uv.lock
 watch_file package-lock.json package.json web/package.json ui-tui/package.json website/package.json apps/shared/package.json apps/desktop/package.json ui-tui/packages/hermes-ink/package.json
-watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix nix/hermes-agent.nix nix/desktop.nix
+watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix

 use flake
--- a/.github/actions/hermes-smoke-test/action.yml
+++ b/.github/actions/hermes-smoke-test/action.yml
@@ -0,0 +1,50 @@
+name: Hermes smoke test
+description: >
+  Run the image's built-in entrypoint against `--help` and `dashboard --help`
+  to catch basic runtime regressions before publishing.  Requires the image
+  to already be loaded into the local Docker daemon under `image`.
+
+  Works identically on amd64 and arm64 runners.
+
+inputs:
+  image:
+    description: Fully-qualified image tag (e.g. nousresearch/hermes-agent:test)
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Ensure /tmp/hermes-test is hermes-writable
+      shell: bash
+      run: |
+        # The image runs as the hermes user (UID 10000).  GitHub Actions
+        # creates /tmp/hermes-test root-owned by default, which hermes
+        # can't write to — chown it to match the in-container UID before
+        # bind-mounting.  Real users doing `docker run -v ~/.hermes:...`
+        # with their own UID hit the same issue and have their own
+        # remediations (HERMES_UID env var, or chown locally).
+        mkdir -p /tmp/hermes-test
+        sudo chown -R 10000:10000 /tmp/hermes-test
+
+    - name: hermes --help
+      shell: bash
+      run: |
+        # Use the image's real ENTRYPOINT (/init + main-wrapper.sh) so
+        # this exercises the actual production startup path. PR #30136
+        # review caught that an --entrypoint override here had been
+        # silently neutered by the s6-overlay migration — stage2-hook
+        # ignores its CMD args, so the smoke test was a no-op.
+        docker run --rm \
+          -v /tmp/hermes-test:/opt/data \
+          "${{ inputs.image }}" --help
+
+    - name: hermes dashboard --help
+      shell: bash
+      run: |
+        # Regression guard for #9153: dashboard was present in source but
+        # missing from the published image.  If this fails, something in
+        # the Dockerfile is excluding the dashboard subcommand from the
+        # installed package.
+        docker run --rm \
+          -v /tmp/hermes-test:/opt/data \
+          "${{ inputs.image }}" dashboard --help
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,6 @@ permissions:
  pull-requests: write # needed by lint (PR comment) + supply-chain (PR comment)
  actions: read # needed by osv-scanner (SARIF upload)
  security-events: write # needed by osv-scanner (SARIF upload)
-  packages: write # needed by docker build

 concurrency:
  group: ci-${{ github.ref }}
@@ -33,7 +32,6 @@ jobs:
  # (all lanes true) so post-merge validation is never weakened.
  # ─────────────────────────────────────────────────────────────────────
  detect:
-    name: Detect affected areas
    runs-on: ubuntu-latest
    outputs:
      python: ${{ steps.classify.outputs.python }}
@@ -55,15 +53,11 @@ jobs:
  # Skipped workflows (if condition is false) don't spin up runners.
  # ─────────────────────────────────────────────────────────────────────
  tests:
-    name: Python tests
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/tests.yml
-    with:
-      slice_count: 8

  lint:
-    name: Python lints
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/lint.yml
@@ -71,49 +65,35 @@ jobs:
      event_name: ${{ needs.detect.outputs.event_name }}

  typecheck:
-    name: TypeScript
    needs: detect
    if: needs.detect.outputs.frontend == 'true'
    uses: ./.github/workflows/typecheck.yml

  docs-site:
-    name: Docs Site
    needs: detect
    if: needs.detect.outputs.site == 'true'
    uses: ./.github/workflows/docs-site-checks.yml

  history-check:
-    name: Deny unrelated histories
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request'
    uses: ./.github/workflows/history-check.yml

  contributor-check:
-    name: Check contributors
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/contributor-check.yml

  uv-lockfile:
-    name: Check uv.lock
    needs: detect
    uses: ./.github/workflows/uv-lockfile-check.yml

  docker-lint:
-    name: Lint Docker scripts
    needs: detect
    if: needs.detect.outputs.docker_meta == 'true'
    uses: ./.github/workflows/docker-lint.yml

-  docker:
-    name: Build&Test Docker image
-    needs: detect
-    if: needs.detect.outputs.python == 'true' || needs.detect.outputs.frontend == 'true' || needs.detect.outputs.docker_meta == 'true'
-    uses: ./.github/workflows/docker.yml
-    secrets: inherit
-
  supply-chain:
-    name: Supply-chain scan
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request' && (needs.detect.outputs.scan == 'true' || needs.detect.outputs.deps == 'true' || needs.detect.outputs.mcp_catalog == 'true')
    uses: ./.github/workflows/supply-chain-audit.yml
@@ -124,7 +104,7 @@ jobs:
      mcp_catalog: ${{ needs.detect.outputs.mcp_catalog == 'true' }}

  osv-scanner:
-    name: OSV scan
+    needs: detect
    uses: ./.github/workflows/osv-scanner.yml

  # ─────────────────────────────────────────────────────────────────────
@@ -147,8 +127,6 @@ jobs:
      - docker-lint
      - supply-chain
      - osv-scanner
-      # we don't require docker to pass rn because it's so slow lol
-      # - docker
    if: always()
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/docker-lint.yml
+++ b/.github/workflows/docker-lint.yml
@@ -2,7 +2,7 @@ name: Docker / shell lint

 # Lints the container build inputs: Dockerfile (via hadolint) and any shell
 # scripts under docker/ (via shellcheck). These catch the class of regression
-# the behavioral docker smoke test can't — unquoted variable
+# the behavioral docker-publish smoke test can't — unquoted variable
 # expansions, silently-failing RUN commands, etc.
 #
 # Rules and ignores are documented in .hadolint.yaml at the repo root.
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,9 +1,24 @@
-name: Docker Build, Test, and Publish
+name: Docker Build and Publish

 on:
+  push:
+    branches: [main]
+    paths:
+      - '**/*.py'
+      - 'pyproject.toml'
+      - 'uv.lock'
+      - 'Dockerfile'
+      - 'docker/**'
+      - '.github/workflows/docker-publish.yml'
+      - '.github/actions/hermes-smoke-test/**'
+
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+
  release:
    types: [published]
-  workflow_call:

 permissions:
  contents: read
@@ -24,7 +39,11 @@ env:
  IMAGE_NAME: nousresearch/hermes-agent

 jobs:
-  # Build, test, and optionally push the amd64 image.
+  # ---------------------------------------------------------------------------
+  # Build amd64 natively.  This job also runs the smoke tests (basic --help
+  # and the dashboard subcommand regression guard from #9153), because amd64
+  # is the only arch we can `load` into the local daemon on an amd64 runner.
+  # ---------------------------------------------------------------------------
  build-amd64:
    # Only run on the upstream repository, not on forks
    if: github.repository == 'NousResearch/hermes-agent'
@@ -34,19 +53,24 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

-      # The image build + integration tests run on every event
-      # (PRs, push-to-main, release). Publish steps below are gated to
-      # push-to-main / release only.
+      # The image build + smoke test + integration tests run ONLY on
+      # push-to-main and release — never on PRs. They are the heaviest jobs
+      # in CI (~15-45 min) and a broken build surfaces on the main push (and
+      # is gated pre-merge by docker-lint + uv-lockfile-check). Every step
+      # below is skipped on PRs, so the job still reports green and the
+      # required check never hangs.
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+        if: github.event_name != 'pull_request'
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

-      # Build once, load into the local daemon for testing.  Cached
+      # Build once, load into the local daemon for smoke testing.  Cached
      # to gha with a per-arch scope; the push step below reuses every
      # layer from this build.
-      - name: Build image (amd64)
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+      - name: Build image (amd64, smoke test)
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -58,12 +82,25 @@ jobs:
          cache-from: type=gha,scope=docker-amd64
          cache-to: type=gha,mode=max,scope=docker-amd64

+      - name: Smoke test image
+        if: github.event_name != 'pull_request'
+        uses: ./.github/actions/hermes-smoke-test
+        with:
+          image: ${{ env.IMAGE_NAME }}:test
+
+      # ---------------------------------------------------------------------
      # Run the docker-integration test suite against the freshly-built
-      # image already loaded into the local daemon (`:test`).
+      # image already loaded into the local daemon (`:test`).  These tests
+      # are excluded from the sharded `tests.yml :: test` matrix on purpose
+      # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each
+      # shard would otherwise reach the session-scoped ``built_image``
+      # fixture in ``tests/docker/conftest.py`` and start its own 3-7min
+      # ``docker build``, which is guaranteed to die during fixture
+      # setup.
      #
-      # Piggybacking here avoids a second image build: the build step
-      # already loaded the image into the daemon under
-      # `${IMAGE_NAME}:test`, so we just point ``HERMES_TEST_IMAGE`` at
+      # Piggybacking here avoids a second image build: the smoke test
+      # already proved the image loads + runs, so the daemon has it under
+      # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at
      # that.  The fixture's ``HERMES_TEST_IMAGE`` branch (see
      # tests/docker/conftest.py:62-63) short-circuits the rebuild.
      #
@@ -73,20 +110,26 @@ jobs:
      # cheapest path to coverage on every PR that touches docker code.
      # ---------------------------------------------------------------------
      - name: Install uv (for docker tests)
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        if: github.event_name != 'pull_request'
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5

      - name: Set up Python 3.11 (for docker tests)
+        if: github.event_name != 'pull_request'
        run: uv python install 3.11

      - name: Install Python dependencies (for docker tests)
+        if: github.event_name != 'pull_request'
        run: |
+          uv venv .venv --python 3.11
+          source .venv/bin/activate
          # ``dev`` extra pulls in pytest, pytest-asyncio —
          # everything tests/docker/ needs.  We deliberately avoid ``all``
          # here because the docker tests only drive the container via
          # subprocess and don't import hermes_agent's optional deps.
-          uv sync --locked --python 3.11 --extra dev
+          uv pip install -e ".[dev]"

      - name: Run docker integration tests
+        if: github.event_name != 'pull_request'
        env:
          # Skip rebuild; use the image already loaded by the build step.
          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
@@ -96,11 +139,12 @@ jobs:
          OPENAI_API_KEY: ""
          NOUS_API_KEY: ""
        run: |
-          scripts/run_tests.sh tests/docker/ --file-timeout 600
+          source .venv/bin/activate
+          python -m pytest tests/docker/ -v --tb=short

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -111,7 +155,7 @@ jobs:
      - name: Push amd64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -135,7 +179,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: digest-amd64
          path: /tmp/digests/*
@@ -143,7 +187,10 @@ jobs:
          retention-days: 1

  # ---------------------------------------------------------------------------
-  # Build, test, and optionally push the arm64 image.
+  # Build arm64 natively on GitHub's free arm64 runner.  This replaces the
+  # previous QEMU-emulated arm64 build, which was ~5-10x slower and shared
+  # a cache scope with amd64.  Matches the amd64 job's shape: build+load,
+  # smoke test, then on push/release push by digest.
  # ---------------------------------------------------------------------------
  build-arm64:
    if: github.repository == 'NousResearch/hermes-agent'
@@ -153,26 +200,29 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

+      # arm64 build runs only on push-to-main and release (see build-amd64).
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+        if: github.event_name != 'pull_request'
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

      # Log in to ghcr.io so the registry-backed build cache below can be
      # read (cache-from) on every event and written (cache-to) on
      # push/release.  Uses the workflow's GITHUB_TOKEN, which is valid for
      # the whole job — unlike the gha cache backend's short-lived Azure SAS
      # token, which expired mid-build on slow cold-cache arm64 runs and
-      # crashed the build before the tests ran (the reason the gha cache
+      # crashed the build before the smoke test (the reason the gha cache
      # was removed from arm64 PRs in the first place).
      - name: Log in to ghcr.io (build cache)
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # Build once, load into the local daemon for testing, then push
+      # Build once, load into the local daemon for smoke testing, then push
      # by digest below. Reads AND writes the registry-backed cache so the
      # push reuses layers from this build and the next build starts warm.
      #
@@ -180,8 +230,9 @@ jobs:
      # cache that previously broke here: its credential is the job-lifetime
      # GITHUB_TOKEN, not a short-lived SAS token, so the cold-build-outlives-
      # token failure mode cannot recur.
-      - name: Build image (arm64, cached publish)
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+      - name: Build image (arm64, smoke test, cached publish)
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -193,29 +244,15 @@ jobs:
          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
          cache-to: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64,mode=max

-      - name: Install uv for docker tests
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
-
-      - name: Set up Python 3.11 for docker tests
-        run: uv python install 3.11
-
-      - name: Install Python dependencies for docker tests
-        run: |
-          uv sync --locked --python 3.11 --extra dev
-
-      - name: Run docker tests
-        env:
-          # Skip rebuild; use the image already loaded by the build step.
-          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
-          OPENROUTER_API_KEY: ""
-          OPENAI_API_KEY: ""
-          NOUS_API_KEY: ""
-        run: |
-          scripts/run_tests.sh tests/docker/ --file-timeout 600
+      - name: Smoke test image
+        if: github.event_name != 'pull_request'
+        uses: ./.github/actions/hermes-smoke-test
+        with:
+          image: ${{ env.IMAGE_NAME }}:test

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -223,7 +260,7 @@ jobs:
      - name: Push arm64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -245,7 +282,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: digest-arm64
          path: /tmp/digests/*
@@ -267,17 +304,17 @@ jobs:
    timeout-minutes: 10
    steps:
      - name: Download digests
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
          path: /tmp/digests
          pattern: digest-*
          merge-multiple: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

      - name: Log in to Docker Hub
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -37,7 +37,7 @@ jobs:
          fetch-depth: 0 # need full history for merge-base + worktree

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5

      - name: Install ruff + ty
        uses: ./.github/actions/retry
@@ -110,7 +110,7 @@ jobs:
          cat .lint-reports/summary.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload reports as artifact
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: lint-reports
          path: .lint-reports/
@@ -164,7 +164,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5

      - name: Install ruff
        uses: ./.github/actions/retry
--- a/.github/workflows/skills-index.yml
+++ b/.github/workflows/skills-index.yml
@@ -3,17 +3,17 @@ name: Build Skills Index
 on:
  schedule:
    # Run twice daily: 6 AM and 6 PM UTC
-    - cron: "0 6,18 * * *"
-  workflow_dispatch: # Manual trigger
+    - cron: '0 6,18 * * *'
+  workflow_dispatch:  # Manual trigger
  push:
    branches: [main]
    paths:
-      - "scripts/build_skills_index.py"
-      - ".github/workflows/skills-index.yml"
+      - 'scripts/build_skills_index.py'
+      - '.github/workflows/skills-index.yml'

 permissions:
  contents: read
-  actions: write # to trigger deploy-site.yml on schedule
+  actions: write   # to trigger deploy-site.yml on schedule

 jobs:
  build-index:
@@ -21,11 +21,11 @@ jobs:
    if: github.repository == 'NousResearch/hermes-agent'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
-          python-version: "3.11"
+          python-version: '3.11'

      - name: Install dependencies
        run: pip install httpx==0.28.1 pyyaml==6.0.2
@@ -36,7 +36,7 @@ jobs:
        run: python scripts/build_skills_index.py

      - name: Upload index artifact
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: skills-index
          path: website/static/api/skills-index.json
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,11 +2,6 @@ name: Tests

 on:
  workflow_call:
-    inputs:
-      slice_count:
-        description: Number of parallel test slices
-        type: number
-        default: 8

 permissions:
  contents: read
@@ -17,11 +12,13 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  generate:
-    name: "Generate slices"
+  test:
    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.matrix.outputs.matrix }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        slice: [1, 2, 3, 4, 5, 6]
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -30,26 +27,13 @@ jobs:
        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: test_durations.json
+          # Runs on main always save the cache under a new key suffix; jobs restore the most recent cache sharing this prefix.
+          # Quoted from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses:
+          # "If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys.
+          # If there are no exact matches, the action searches for partial matches of the restore keys.
+          # When the action finds a partial match, the most recent cache is restored to the path directory."
          key: test-durations

-      - name: Generate test slices
-        id: matrix
-        run: |
-          MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }})
-          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
-
-  test:
-    name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}
-    needs: generate
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix: ${{ fromJSON(needs.generate.outputs.matrix) }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-
      - name: Install ripgrep (prebuilt binary)
        run: |
          set -euo pipefail
@@ -65,7 +49,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
@@ -94,19 +78,33 @@ jobs:
        # re-download, keeping the persisted cache small and fast to restore.
        run: uv cache prune --ci

-      - name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }})
-        # Per-file isolation via scripts/run_tests.sh: each test file runs
-        # in its own freshly-spawned `python -m pytest <file>` subprocess
+      - name: Run tests (slice ${{ matrix.slice }}/6)
+        # Per-file isolation via scripts/run_tests_parallel.py: discovers
+        # every test_*.py file under tests/ (excluding integration/ + e2e/),
+        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
        # with bounded parallelism. No xdist, no shared workers, no
        # module-level state leakage between files.
        #
-        # File list is pre-computed by the generate job (--generate-slices)
-        # which runs LPT distribution once and passes the file list to each
-        # matrix job via --files. Previously each job re-discovered files and
-        # re-ran LPT independently — redundant N times.
+        # Why per-file (not per-test): per-test spawn cost (~250ms × 17k
+        # tests = 70min CPU minimum) blew the wall-clock budget. Per-file
+        # spawn (~250ms × ~850 files = ~3.5min) fits while still giving
+        # every file a fresh interpreter — the only isolation boundary
+        # that matters in practice (cross-file leakage was the original
+        # flake source; intra-file is the test author's responsibility).
+        #
+        # Why drop xdist entirely: xdist's persistent workers accumulate
+        # state across files, which is exactly the leakage we wanted to
+        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
+        # the job with cleaner semantics.
+        #
+        # Matrix slicing (--slice I/N): files are distributed across 6
+        # jobs by cached duration (LPT algorithm) so each job gets
+        # roughly equal wall time. Without a cache, files default to 2s
+        # estimate and get split roughly evenly by count — still correct,
+        # just not perfectly balanced.
        run: |
          source .venv/bin/activate
-          scripts/run_tests.sh --files '${{ matrix.slice.files }}'
+          python scripts/run_tests_parallel.py --slice ${{ matrix.slice }}/6
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@@ -116,7 +114,7 @@ jobs:
      - name: Upload per-slice durations
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: test-durations-slice-${{ matrix.slice.index }}
+          name: test-durations-slice-${{ matrix.slice }}
          path: test_durations.json
          retention-days: 1

@@ -175,7 +173,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
--- a/.github/workflows/typecheck.yml
+++ b/.github/workflows/typecheck.yml
@@ -6,7 +6,6 @@ on:

 jobs:
  typecheck:
-    name: Check TypeScript
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -23,7 +22,8 @@ jobs:
      # native builds. Skipping install scripts drops node-pty's node-gyp
      # header fetch — the transient flake that killed this job pre-`tsc` — and
      # is faster. retry covers the remaining registry blips.
-      - uses: ./.github/actions/retry
+      - 
+        uses: ./.github/actions/retry
        with:
          command: npm ci --ignore-scripts
      - run: npm run --prefix ${{ matrix.package }} typecheck
@@ -35,7 +35,6 @@ jobs:
  # users build apps/desktop from source on install/update. Run the real
  # `vite build` here so that class of break fails in CI instead.
  desktop-build:
-    name: Build desktop app
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -45,7 +44,8 @@ jobs:
          cache: npm
      # Keep install scripts here: the production build may need node-pty's
      # native binary. retry handles the transient install-time fetch flakes.
-      - uses: ./.github/actions/retry
+      - 
+        uses: ./.github/actions/retry
        with:
          command: npm ci
      - run: npm run --prefix apps/desktop build
--- a/.github/workflows/upload_to_pypi.yml
+++ b/.github/workflows/upload_to_pypi.yml
@@ -5,11 +5,11 @@ name: Publish to PyPI
 on:
  push:
    tags:
-      - "v20*" # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
+      - 'v20*'  # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
  workflow_dispatch:
    inputs:
      confirm_tag:
-        description: "Tag to publish (e.g. v2026.5.15). Must already exist."
+        description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
        required: true
        type: string

@@ -27,7 +27,7 @@ jobs:
    name: Build distribution 📦
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false
          # On workflow_dispatch, check out the confirmed tag.
@@ -43,17 +43,17 @@ jobs:
          fi

      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
-          python-version: "3.13"
+          python-version: '3.13'

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e  # v6

      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
        with:
-          node-version: "22"
+          node-version: '22'

      - name: Build web dashboard
        run: cd web && npm ci && npm run build
@@ -81,7 +81,7 @@ jobs:
        run: uv build --sdist --wheel

      - name: Upload distribution artifacts
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -94,17 +94,17 @@ jobs:
      name: pypi
      url: https://pypi.org/p/hermes-agent
    permissions:
-      id-token: write # OIDC trusted publishing
+      id-token: write  # OIDC trusted publishing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b  # v1.14.0
        with:
          skip-existing: true

@@ -116,12 +116,12 @@ jobs:
    needs: publish
    runs-on: ubuntu-latest
    permissions:
-      contents: write # attach assets to the existing release
-      id-token: write # sigstore signing
+      contents: write   # attach assets to the existing release
+      id-token: write   # sigstore signing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -145,7 +145,7 @@ jobs:

      - name: Sign with Sigstore
        if: env.skip_sign != 'true'
-        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc # v3.3.0
+        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc  # v3.3.0
        with:
          inputs: >-
            ./dist/*.tar.gz
--- a/.github/workflows/uv-lockfile-check.yml
+++ b/.github/workflows/uv-lockfile-check.yml
@@ -4,7 +4,7 @@ name: uv.lock check
 # that modify pyproject.toml without regenerating uv.lock (or vice versa)
 # must not merge, because the Docker build's `uv sync --frozen` step will
 # fail on a stale lockfile and we'd rather catch it here than in the
-# docker workflow on main.
+# docker-publish workflow on main.
 #
 # ─────────────────────────────────────────────────────────────────────────
 # IMPORTANT: this check runs against the MERGED state, not just your branch
@@ -63,7 +63,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5

      # `uv lock --check` re-resolves the project from pyproject.toml and
      # compares the result to uv.lock, exiting non-zero if they disagree.
@@ -100,7 +100,7 @@ jobs:

          This check is blocking because the Docker image build uses
          `uv sync --frozen --extra all`, which rejects stale lockfiles
-          — catching it here avoids a ~15 min failed docker run
+          — catching it here avoids a ~15 min failed docker-publish run
          on `main` post-merge.
          EOF
            echo "::error title=uv.lock out of sync::Run \`uv lock\` locally and commit the result. If on a PR, sync with main first."
--- a/.gitignore
+++ b/.gitignore
@@ -137,9 +137,3 @@ RELEASE_v*.md
 # Desktop demo-run scratch output (hermes writes demo/*.txt during recorded
 # walkthroughs). Throwaway artifacts, never part of the app.
 apps/desktop/demo/
-
-# PR infographics are rendered locally and embedded in PR descriptions via the
-# image-provider (fal.media) URL — they are NEVER committed to the repo. The
-# PR body is the archive. See the hermes-agent-dev skill's
-# pr-infographic-workflow reference (storage rule + lapse #8 / #COMMIT-1).
-infographic/
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -123,17 +123,6 @@ conservative at the waist.
  without E2E proof, and plugins that touch core files.** Plugins live in their
  own directory and work within the ABCs/hooks we provide; if a plugin needs
  more, widen the generic plugin surface, don't special-case it in core.
- **Third-party products / other people's projects integrated into the core
-  tree.** Observability backends, vendor SaaS integrations, analytics dashboards,
-  and similar "someone else's product" plugins do NOT land under `plugins/` in
-  this repo. They place an ongoing maintenance burden on us to keep them working
-  against a fast-moving core, for a backend we don't own. Ship them as a
-  **standalone plugin repo** users install into `~/.hermes/plugins/` (or via a
-  pip entry point), and promote them in the Nous Research Discord
-  (`#plugins-skills-and-skins`). This is a coupling-and-maintenance decision, not
-  a quality bar — the plugin can be excellent and still be a close. PRs that add
-  such a directory to the tree are closed with a pointer to publish it as its own
-  repo.

 ### Before you call it a bug — verify the premise (and when NOT to close)

@@ -491,7 +480,7 @@ The dashboard embeds the real `hermes --tui` — **not** a rewrite.  See `hermes

 ### Electron Desktop Chat App (`apps/desktop/`)

-A **separate** chat surface from both the classic CLI and the dashboard's embedded TUI. It is an Electron + React + nanostore renderer (`@assistant-ui/react`) that talks to a `tui_gateway` backend over JSON-RPC (`requestGateway(method, params)`). The WebSocket/JSON-RPC transport lives in the framework-agnostic `apps/shared` package (`@hermes/shared` — `JsonRpcGatewayClient` + WS URL helpers), which the web dashboard (`web/`) also consumes; **desktop has no build/runtime dependency on the dashboard frontend** — it spawns a headless `hermes serve` backend server (the same gateway `dashboard` serves, minus the browser UI). `dashboard` and `serve` share `cmd_dashboard`/`start_server` but are independent surfaces — neither launches the other. The one exception is a backward-compat *fallback*: `serve` is newer, so the desktop spawn (`electron/backend-command.cjs` + `backendSupportsServe()` in `main.cjs`) detects whether the resolved runtime registers `serve` and, only when it does not (an older managed install / PATH `hermes` the app hasn't updated yet), rewrites the argv to the legacy `dashboard --no-open`. Without that, a new app against an un-upgraded runtime would crash on an unknown subcommand and brick every mid-upgrade user. It does NOT embed `hermes --tui` — it has its own composer, transcript, and slash-command pipeline. Route desktop bugs to the `hermes-desktop-app-work` skill, not `hermes-dashboard-work`.
+A **separate** chat surface from both the classic CLI and the dashboard's embedded TUI. It is an Electron + React + nanostore renderer (`@assistant-ui/react`) that talks to a `tui_gateway` backend over JSON-RPC (`requestGateway(method, params)`). It does NOT embed `hermes --tui` — it has its own composer, transcript, and slash-command pipeline. Route desktop bugs to the `hermes-desktop-app-work` skill, not `hermes-dashboard-work`.

 **Slash commands in the desktop app are curated client-side, then dispatched to the backend.** The pipeline:

@@ -794,24 +783,6 @@ landing in this tree. PRs that add a new directory under
 provider as its own repo. Existing in-tree providers stay; bug fixes
 to them are welcome.

-**No new third-party-product plugins in-tree (policy, June 2026):** the
-same rule applies beyond memory providers. Plugins that integrate
-someone else's product or project — observability/metrics backends,
-vendor SaaS connectors, analytics dashboards, paid-service tie-ins —
-must ship as **standalone plugin repos** that users install into
-`~/.hermes/plugins/` (or via pip entry points). They register through
-the existing plugin discovery path and use the ABCs/hooks/ctx surface
-we expose; nothing special is needed in core. The reason is
-maintenance load: every product we absorb into the tree becomes our
-burden to keep working against a fast-moving core, for a backend we
-don't own. Promote standalone plugins in the Nous Research Discord
-(`#plugins-skills-and-skins`). PRs that add such a directory under
-`plugins/` are closed with a pointer to publish it as its own repo —
-this is a coupling decision, not a quality judgment. (The
-`observability/`, `kanban/`, `disk-cleanup/`, etc. directories already
-in the tree are existing precedent, not an invitation to add more
-third-party-product plugins alongside them.)
-
 ### Model-provider plugins (`plugins/model-providers/<name>/`)

 Every inference backend (openrouter, anthropic, gmi, deepseek, nvidia, …)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -85,23 +85,6 @@ This isn't a quality bar — it's a coupling-and-maintenance decision. Memory pr

 ---

-## Third-Party Product Integrations: Ship as a Standalone Plugin
-
-The same rule extends to **any plugin that integrates someone else's product or project** — observability/metrics backends, vendor SaaS connectors, analytics dashboards, paid-service tie-ins, and similar third-party integrations. **These do not land in this repo.**
-
-The reason is maintenance load, not quality. Every external product absorbed into the core tree becomes ours to keep working against a fast-moving codebase, for a backend we don't own and can't control. Hermes ships a lot and the core moves quickly; coupling third-party products into it creates an open-ended burden on the maintainers.
-
-Publish these as a **standalone plugin repo** instead:
-
- Implement the relevant ABC and use the existing plugin discovery path (`~/.hermes/plugins/`, project `.hermes/plugins/`, or a pip entry point) — see [Build a Hermes Plugin](https://hermes-agent.nousresearch.com/docs/guides/build-a-hermes-plugin)
- Register lifecycle hooks (`pre_tool_call`, `post_tool_call`, `pre_llm_call`, `post_llm_call`, `on_session_start`, `on_session_end`), tools (`ctx.register_tool`), and CLI subcommands (`ctx.register_cli_command`) through the surface we already expose — no core changes needed
- If your plugin needs a capability the framework doesn't expose, that's a feature request to **widen the generic plugin surface** (a new hook or `ctx` method) — never special-case your plugin in core
- Promote it in the [Nous Research Discord](https://discord.gg/NousResearch) `#plugins-skills-and-skins` channel so users can find and install it
-
-A well-built third-party-product plugin can clear automated review and still be closed for this reason — it's a placement decision, not a verdict on the code. PRs that add such a directory under `plugins/` will be closed with a pointer to publish it as its own repo.
-
---
-
 ## Development Setup

 ### Prerequisites
@@ -149,20 +132,13 @@ this way, make sure you run the `hermes` entrypoint from this venv; running the
 system `python3 -m hermes_cli.main` can pick up unrelated system Python
 packages.

-Create the venv **outside** the cloned source tree. A venv that lives inside
-the directory the agent operates from can be wiped by a relative-path command
-the agent runs against its own checkout (`rm -rf venv`, `uv venv venv`, etc.),
-which silently destroys the running runtime mid-session. Keeping it outside the
-tree means no relative path from the workspace resolves to it.
-
 ```bash
 git clone https://github.com/NousResearch/hermes-agent.git
 cd hermes-agent

-# Create venv with Python 3.11, OUTSIDE the source tree
-uv venv ~/.hermes/venvs/hermes-dev --python 3.11
-export VIRTUAL_ENV="$HOME/.hermes/venvs/hermes-dev"
-export PATH="$VIRTUAL_ENV/bin:$PATH"
+# Create venv with Python 3.11
+uv venv venv --python 3.11
+export VIRTUAL_ENV="$(pwd)/venv"

 # Install with all extras (messaging, cron, CLI menus, dev tools)
 uv pip install -e ".[all,dev]"
--- a/Dockerfile
+++ b/Dockerfile
@@ -119,9 +119,6 @@ COPY package.json package-lock.json ./
 COPY web/package.json web/
 COPY ui-tui/package.json ui-tui/
 COPY ui-tui/packages/hermes-ink/ ui-tui/packages/hermes-ink/
-# apps/shared/ is copied IN FULL because web/package.json references it as a
-# `file:` workspace dependency (same pattern as hermes-ink above).
-COPY apps/shared/ apps/shared/

 # `npm_config_install_links=false` forces npm to install `file:` deps as
 # symlinks instead of copies.  This is the default since npm 10+, which is
@@ -187,19 +184,12 @@ RUN uv sync --frozen --no-install-project --extra all --extra messaging --extra
 # invalidate the (relatively slow) web + ui-tui build layer.
 COPY web/ web/
 COPY ui-tui/ ui-tui/
-COPY apps/shared/ apps/shared/
 RUN cd web && npm run build && \
    cd ../ui-tui && npm run build

 # ---------- Source code ----------
 # .dockerignore excludes node_modules, so the installs above survive.
-# --link decouples this layer from parents for cache purposes; --chmod bakes
-# the final read-only permissions at copy time so we skip the separate
-# `chmod -R` pass that previously walked ~30k files across the venv +
-# node_modules + source (21s amd64 / 222s arm64 — #49113).  `a+rX,go-w`
-# gives the non-root hermes user read + traverse but no write; root retains
-# write so the build steps below don't need chmod u+w dances.
-COPY --link --chmod=a+rX,go-w . .
+COPY . .

 # ---------- Permissions ----------
 # Link hermes-agent itself (editable). Deps are already installed in the
@@ -207,15 +197,19 @@ COPY --link --chmod=a+rX,go-w . .
 # resolution or downloads.
 RUN uv pip install --no-cache-dir --no-deps -e "."

-# Wire the exec shim and install-method stamp.  Files under /opt/hermes are
-# already root-owned (COPY, uv sync, npm install all run as root) and
-# read-only for the hermes user (go-w from the --chmod above).
-
+# Keep /opt/hermes immutable for the runtime hermes user. Hosted/container
+# instances must not be able to self-edit the installed source or venv; user
+# data, skills, plugins, config, logs, and dashboard uploads live under
+# /opt/data instead. Root can still repair the image during build/boot, but
+# supervised Hermes processes drop to the non-root hermes user.
 USER root
 RUN mkdir -p /opt/hermes/bin && \
    cp /opt/hermes/docker/hermes-exec-shim.sh /opt/hermes/bin/hermes && \
    chmod 0755 /opt/hermes/bin/hermes && \
-    printf 'docker\n' > /opt/hermes/.install_method
+    printf 'docker\n' > /opt/hermes/.install_method && \
+    chown -R root:root /opt/hermes && \
+    chmod -R a+rX /opt/hermes && \
+    chmod -R a-w /opt/hermes
 # The ``.install_method`` stamp is baked next to the running code (the install
 # tree), NOT into $HERMES_HOME. $HERMES_HOME (/opt/data) is a shared data
 # volume that is commonly bind-mounted from the host and even shared with a
@@ -242,11 +236,13 @@ RUN mkdir -p /opt/hermes/bin && \
 #
 # The arg is optional — local `docker build` without --build-arg simply
 # omits the file, and the runtime falls back to live-git lookup.  CI
-# (.github/workflows/docker.yml) passes ${{ github.sha }} so
+# (.github/workflows/docker-publish.yml) passes ${{ github.sha }} so
 # every published image has it.
 ARG HERMES_GIT_SHA=
 RUN if [ -n "${HERMES_GIT_SHA}" ]; then \
-        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha; \
+        chmod u+w /opt/hermes && \
+        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha && \
+        chmod a-w /opt/hermes /opt/hermes/.hermes_build_sha; \
    fi

 # ---------- s6-overlay service wiring ----------
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@

 **The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.

-Use any model you want — [Nous Portal](https://portal.nousresearch.com), OpenRouter, OpenAI, your own endpoint, and [many others](https://hermes-agent.nousresearch.com/docs/integrations/providers). Switch with `hermes model` — no code changes, no lock-in.
+Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [NovitaAI](https://novita.ai) (AI-native cloud for Model API, Agent Sandbox, and GPU Cloud), [NVIDIA NIM](https://build.nvidia.com) (Nemotron), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.

 <table>
 <tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
@@ -232,14 +232,10 @@ scripts/run_tests.sh
 Manual clone fallback (for throwaway clones/CI where you intentionally do not
 want the managed install layout):

-Create the venv outside the cloned source tree — a venv inside the directory
-the agent operates from can be wiped by a relative-path command the agent runs
-against its own checkout, destroying the running runtime mid-session.
-
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
-uv venv ~/.hermes/venvs/hermes-dev --python 3.11
-source ~/.hermes/venvs/hermes-dev/bin/activate
+uv venv .venv --python 3.11
+source .venv/bin/activate
 uv pip install -e ".[all,dev]"
 scripts/run_tests.sh
 ```
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -722,50 +722,10 @@ def init_agent(
    elif agent.provider == "moa":
        from agent.moa_loop import MoAClient
        agent.api_mode = "chat_completions"
-
-        # Route reference-model outputs to the agent's tool_progress_callback so
-        # every surface that already consumes it (CLI spinner/scrollback, TUI,
-        # desktop, gateway) can show each reference's answer as a labelled block
-        # before the aggregator acts. The facade emits "moa.reference" and
-        # "moa.aggregating" events; we forward them through the same callback
-        # the tool lifecycle uses. Best-effort and cache-safe — these are
-        # display-only events, they never touch the message history.
-        def _moa_reference_relay(event: str, **kwargs: Any) -> None:
-            cb = getattr(agent, "tool_progress_callback", None)
-            if cb is None:
-                return
-            try:
-                if event == "moa.reference":
-                    label = str(kwargs.get("label") or "")
-                    text = str(kwargs.get("text") or "")
-                    idx = kwargs.get("index")
-                    count = kwargs.get("count")
-                    cb(
-                        "moa.reference",
-                        label,
-                        text,
-                        None,
-                        moa_index=idx,
-                        moa_count=count,
-                    )
-                elif event == "moa.aggregating":
-                    cb(
-                        "moa.aggregating",
-                        str(kwargs.get("aggregator") or ""),
-                        None,
-                        None,
-                        moa_ref_count=kwargs.get("ref_count"),
-                    )
-            except Exception:
-                pass
-
-        agent.client = MoAClient(
-            agent.model or "default",
-            reference_callback=_moa_reference_relay,
-        )
+        agent.client = MoAClient(agent.model or "default")
        agent._client_kwargs = {}
        agent.api_key = api_key or "moa-virtual-provider"
-        agent.base_url = "moa://local"
+        agent.base_url = base_url or "moa://local"
        if not agent.quiet_mode:
            print(f"🤖 AI Agent initialized with MoA preset: {agent.model}")
    elif agent.api_mode == "bedrock_converse":
@@ -1307,12 +1267,6 @@ def init_agent(
        _agent_section = {}
    agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")

-    # Intent-ack continuation config: "auto" (default — codex_responses only,
-    # the historical gate), true (all api_modes), false (never), or a list of
-    # model-name substrings.  Resolved against the active api_mode/model in the
-    # conversation loop's intent-ack block.
-    agent._intent_ack_continuation = _agent_section.get("intent_ack_continuation", "auto")
-
    # Universal task-completion guidance toggle.  Default True.  Surfaced
    # as a separate flag from tool_use_enforcement because the guidance
    # applies to ALL models, not just the model families enforcement
@@ -1676,10 +1630,8 @@ def init_agent(
            f"Model {agent.model} has a context window of {_ctx:,} tokens, "
            f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
            f"by Hermes Agent.  Choose a model with at least "
-            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context.  If your server "
-            f"reports a window smaller than the model's true window, set "
-            f"model.context_length in config.yaml to the real value "
-            f"(this must be at least {MINIMUM_CONTEXT_LENGTH // 1000}K)."
+            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
+            f"model.context_length in config.yaml to override."
        )

    # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@@ -42,14 +42,6 @@ from utils import base_url_host_matches, base_url_hostname, env_var_enabled, ato
 logger = logging.getLogger(__name__)


-# Max consecutive successful credential-pool token refreshes of the SAME entry
-# on a persistent auth failure before we give up and let the fallback chain
-# activate. A single-entry OAuth pool can re-mint a fresh token indefinitely
-# even when the upstream keeps rejecting it, so without this cap the retry loop
-# spins forever and never reaches ``_try_activate_fallback``. See #26080.
-_MAX_AUTH_REFRESH_ATTEMPTS = 2
-
-
 def _ra():
    """Lazy ``run_agent`` reference for test-patch routing."""
    import run_agent
@@ -783,30 +775,6 @@ def recover_with_credential_pool(
            return False, has_retried_429
        refreshed = pool.try_refresh_current()
        if refreshed is not None:
-            # ``try_refresh_current()`` re-mints a fresh OAuth token and reports
-            # success even when the upstream keeps rejecting it — a single-entry
-            # pool (common for OAuth/Max subscribers) has nothing to rotate to,
-            # so a bare "refreshed → retry" loop spins forever on the same dead
-            # token and the configured fallback never activates. Cap consecutive
-            # same-entry refreshes and fall through to fallback once exceeded.
-            # See #26080.
-            refreshed_id = getattr(refreshed, "id", None)
-            if refreshed_id is not None:
-                refresh_counts = getattr(agent, "_auth_pool_refresh_counts", None)
-                if refresh_counts is None:
-                    refresh_counts = {}
-                    agent._auth_pool_refresh_counts = refresh_counts
-                refresh_key = (agent.provider, refreshed_id)
-                refresh_counts[refresh_key] = refresh_counts.get(refresh_key, 0) + 1
-                if refresh_counts[refresh_key] > _MAX_AUTH_REFRESH_ATTEMPTS:
-                    _ra().logger.warning(
-                        "Credential auth failure persists after %s refreshes for "
-                        "pool entry %s — treating as unrecoverable and allowing "
-                        "fallback to activate.",
-                        refresh_counts[refresh_key] - 1,
-                        refreshed_id,
-                    )
-                    return False, has_retried_429
            _ra().logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}")
            agent._swap_credential(refreshed)
            return True, has_retried_429
@@ -1078,34 +1046,6 @@ def restore_primary_runtime(agent) -> bool:
            api_mode=rt.get("compressor_api_mode", ""),
        )

-        # ── Re-select from the credential pool if one is available ──
-        # The snapshot's api_key was captured at construction time.  Across
-        # turns the pool may have rotated (token revocation, billing/rate-limit
-        # exhaustion, cooldown), leaving the snapshot key stale.  Restoring it
-        # blindly re-fails on the first request and burns through the remaining
-        # pool entries before cross-provider fallback even gets a chance.  Ask
-        # the pool for its current best entry and swap the live credential in.
-        # When the pool is absent, empty, or the entry has no usable key, we
-        # keep the snapshot key (the existing behavior).  Fixes #25205.
-        pool = getattr(agent, "_credential_pool", None)
-        if pool is not None and pool.has_available():
-            entry = pool.select()
-            if entry is not None:
-                entry_key = (
-                    getattr(entry, "runtime_api_key", None)
-                    or getattr(entry, "access_token", "")
-                )
-                if entry_key:
-                    # ``_swap_credential`` rebuilds the OpenAI/Anthropic client,
-                    # reapplies base-url-scoped headers, and carries the
-                    # accumulated base_url / OAuth-detection fixes (#33163).
-                    agent._swap_credential(entry)
-                    logger.info(
-                        "Restore re-selected pool entry %s (%s)",
-                        getattr(entry, "id", "?"),
-                        getattr(entry, "label", "?"),
-                    )
-
        # ── Reset fallback chain for the new turn ──
        agent._fallback_activated = False
        agent._fallback_index = 0
@@ -1281,11 +1221,7 @@ def dump_api_request_debug(
            dump_payload["error"] = error_info

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-        # Sanitize the session ID into a traversal-free path segment — it can
-        # originate from untrusted input (X-Hermes-Session-Id header), and an
-        # unsanitized "../"-shaped ID would write the dump outside logs_dir.
-        safe_sid = _ra()._safe_session_filename_component(agent.session_id)
-        dump_file = agent.logs_dir / f"request_dump_{safe_sid}_{timestamp}.json"
+        dump_file = agent.logs_dir / f"request_dump_{agent.session_id}_{timestamp}.json"

        # Redact secrets before persisting/printing. This dump captures the
        # full request body (system prompt, tool defs, context-embedded
@@ -1484,15 +1420,6 @@ def create_openai_client(agent, client_kwargs: dict, *, reason: str, shared: boo
        keepalive_http = agent._build_keepalive_http_client(client_kwargs.get("base_url", ""))
        if keepalive_http is not None:
            client_kwargs["http_client"] = keepalive_http
-    # Delegate all rate-limit / 5xx retry to hermes's outer conversation loop,
-    # which honors Retry-After and applies adaptive/jittered backoff. The OpenAI
-    # SDK default (max_retries=2) uses its own 1-2s backoff that ignores
-    # Retry-After and double-retries inside our loop — the same deadlock the
-    # Anthropic clients hit (#26293). This is the single chokepoint every primary
-    # OpenAI/aggregator client passes through (init, switch_model, recovery,
-    # restore, request-scoped); auxiliary_client builds its own clients and keeps
-    # SDK retries because it is NOT wrapped by the conversation loop.
-    client_kwargs.setdefault("max_retries", 0)
    # Uses the module-level `OpenAI` name, resolved lazily on first
    # access via __getattr__ below. Tests patch via `run_agent.OpenAI`.
    client = _ra().OpenAI(**client_kwargs)
@@ -1572,10 +1499,6 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
    # _client_kwargs is a dict — snapshot a shallow copy so mutating the
    # live dict doesn't poison the rollback target.
    _snapshot["_client_kwargs"] = dict(getattr(agent, "_client_kwargs", {}) or {})
-    # Snapshot the credential pool reference so a failed client rebuild can
-    # restore the original pool (issue #52727: pool reload is part of this
-    # switch and must be reversible on rollback).
-    _snapshot["_credential_pool"] = getattr(agent, "_credential_pool", _MISSING)

    try:
        # Clear the per-config context_length override so the new model's
@@ -1600,36 +1523,8 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
        if api_key:
            agent.api_key = api_key

-        # ── Reload credential pool for the new provider (issue #52727) ──
-        # Without this, ``recover_with_credential_pool`` sees a
-        # ``pool.provider != agent.provider`` mismatch and short-circuits,
-        # leaving the new provider with no rotation/recovery on 401/429 and
-        # burning the original pool's entries. Only reload when the provider
-        # actually changed (or the pool was missing) — re-selecting the same
-        # provider must not churn the pool reference. A reload failure is
-        # logged + swallowed: the switch itself must still complete.
-        old_norm = (old_provider or "").strip().lower()
-        new_norm = (new_provider or "").strip().lower()
-        if old_norm != new_norm or getattr(agent, "_credential_pool", None) is None:
-            try:
-                from agent.credential_pool import load_pool
-                agent._credential_pool = load_pool(new_provider)
-            except Exception as _pool_exc:  # noqa: BLE001
-                logger.warning(
-                    "switch_model: credential pool reload failed for %s (%s); "
-                    "continuing without pool rotation this turn",
-                    new_provider, _pool_exc,
-                )
-
        # ── Build new client ──
-        if (new_provider or "").strip().lower() == "moa":
-            from agent.moa_loop import MoAClient
-
-            agent.api_key = api_key or "moa-virtual-provider"
-            agent.base_url = "moa://local"
-            agent._client_kwargs = {}
-            agent.client = MoAClient(agent.model or "default")
-        elif api_mode == "anthropic_messages":
+        if api_mode == "anthropic_messages":
            from agent.anthropic_adapter import (
                build_anthropic_client,
                resolve_anthropic_token,
@@ -2209,21 +2104,8 @@ def looks_like_codex_intermediate_ack(
    user_message: str,
    assistant_content: str,
    messages: List[Dict[str, Any]],
-    require_workspace: bool = True,
 ) -> bool:
-    """Detect a planning/ack message that should continue instead of ending the turn.
-
-    ``require_workspace`` (default True) keeps the original codex-coding scope:
-    the ack must reference a filesystem/repo workspace. The conversation loop
-    passes ``require_workspace=False`` when the user has explicitly opted into
-    intent-ack continuation for all api_modes (``agent.intent_ack_continuation``
-    is ``true`` or a model-list), so general autonomous workflows ("I'll run a
-    health check on the server", "I'll start the deployment") — which carry a
-    future-ack and an action verb but no filesystem reference — are caught too.
-    The future-ack + short-content + no-prior-tools + action-verb requirements
-    always apply, which is what keeps conversational "I'll help you brainstorm"
-    replies from tripping it.
-    """
+    """Detect a planning/ack message that should continue instead of ending the turn."""
    if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
        return False

@@ -2276,67 +2158,17 @@ def looks_like_codex_intermediate_ack(
        "path",
    )

-    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
-    if not assistant_mentions_action:
-        return False
-
-    # Opted-in (all-api_mode) path: a future-ack + action verb + no prior tool
-    # call is enough — the user asked us to keep going when the model only
-    # announces intent, regardless of whether a filesystem is involved.
-    if not require_workspace:
-        return True
-
    user_text = (user_message or "").strip().lower()
    user_targets_workspace = (
        any(marker in user_text for marker in workspace_markers)
        or "~/" in user_text
        or "/" in user_text
    )
+    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
    assistant_targets_workspace = any(
        marker in assistant_text for marker in workspace_markers
    )
-    return user_targets_workspace or assistant_targets_workspace
-
-
-def intent_ack_continuation_mode(agent) -> str:
-    """Classify the resolved intent-ack continuation mode for this turn.
-
-    Returns one of:
-      * ``"off"``        — never continue.
-      * ``"codex_only"`` — historical scope: continue only on the
-        ``codex_responses`` api_mode, and only for codebase/workspace acks
-        (``require_workspace=True``).
-      * ``"all"``        — user opted in for every api_mode; continue on any
-        future-ack + action verb (``require_workspace=False``).
-
-    Mirrors the four-mode shape of ``agent.tool_use_enforcement``: ``"auto"``
-    (default) → codex_only; ``True``/"true"/"always"/"yes"/"on" → all;
-    ``False``/"false"/"never"/"no"/"off" → off; ``list`` → all when a substring
-    matches the active model name, else off.
-    """
-    mode = getattr(agent, "_intent_ack_continuation", "auto")
-
-    if mode is True or (isinstance(mode, str) and mode.lower() in {"true", "always", "yes", "on"}):
-        return "all"
-    if mode is False or (isinstance(mode, str) and mode.lower() in {"false", "never", "no", "off"}):
-        return "off"
-    if isinstance(mode, list):
-        model_lower = (agent.model or "").lower()
-        return "all" if any(p.lower() in model_lower for p in mode if isinstance(p, str)) else "off"
-    # "auto" or any unrecognised value — historical codex-only behavior.
-    return "codex_only" if agent.api_mode == "codex_responses" else "off"
-
-
-def intent_ack_continuation_enabled(agent) -> bool:
-    """Whether intent-ack continuation should fire at all for this turn.
-
-    The ``codex_ack_continuations < 2`` per-turn cap and the
-    ``looks_like_codex_intermediate_ack`` detector are applied by the caller;
-    this only decides the on/off gate. Callers that also need to know whether
-    the workspace requirement applies should use ``intent_ack_continuation_mode``
-    directly (``"codex_only"`` ⇒ require_workspace=True, ``"all"`` ⇒ False).
-    """
-    return intent_ack_continuation_mode(agent) != "off"
+    return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action



--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -673,9 +673,6 @@ def _build_anthropic_client_with_bearer_hook(
    kwargs = {
        "timeout": timeout_obj,
        "http_client": http_client,
-        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
-        # default max_retries=2 ignores it and double-retries. (#26293)
-        "max_retries": 0,
        # The SDK requires *something* for api_key/auth_token. Our
        # event hook overrides Authorization per request so this value
        # is never sent. The sentinel string makes accidental leaks
@@ -760,12 +757,6 @@ def build_anthropic_client(
    _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
    kwargs = {
        "timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
-        # Delegate all rate-limit / 5xx retry to hermes's outer conversation
-        # loop, which honors Retry-After. The SDK default (max_retries=2) uses
-        # its own 1-2s backoff that ignores Retry-After and double-retries
-        # inside our loop — burning request slots against a bucket that won't
-        # refill for minutes. (#26293)
-        "max_retries": 0,
    }
    if normalized_base_url:
        # Azure Anthropic endpoints require an ``api-version`` query parameter.
@@ -861,9 +852,6 @@ def build_anthropic_bedrock_client(region: str):
    return _anthropic_sdk.AnthropicBedrock(
        aws_region=region,
        timeout=Timeout(timeout=900.0, connect=10.0),
-        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
-        # default max_retries=2 ignores it and double-retries. (#26293)
-        max_retries=0,
        default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
    )

@@ -926,72 +914,44 @@ def _read_claude_code_credentials_from_keychain() -> Optional[Dict[str, Any]]:
    return None


-def _read_claude_code_credentials_from_file() -> Optional[Dict[str, Any]]:
-    """Read Claude Code OAuth credentials from ~/.claude/.credentials.json.
-
-    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
-    """
-    cred_path = Path.home() / ".claude" / ".credentials.json"
-    if not cred_path.exists():
-        return None
-    try:
-        data = json.loads(cred_path.read_text(encoding="utf-8"))
-    except (json.JSONDecodeError, OSError, IOError) as e:
-        logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)
-        return None
-
-    oauth_data = data.get("claudeAiOauth")
-    if not (oauth_data and isinstance(oauth_data, dict)):
-        return None
-    access_token = oauth_data.get("accessToken", "")
-    if not access_token:
-        return None
-    return {
-        "accessToken": access_token,
-        "refreshToken": oauth_data.get("refreshToken", ""),
-        "expiresAt": oauth_data.get("expiresAt", 0),
-        "source": "claude_code_credentials_file",
-    }
-
-
 def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    """Read refreshable Claude Code OAuth credentials.

-    Reads from two possible sources and reconciles them:
+    Checks two sources in order:
      1. macOS Keychain (Darwin only) — "Claude Code-credentials" entry
      2. ~/.claude/.credentials.json file

-    Selection rules when both are present:
-      - If exactly one is non-expired, prefer that one. (Handles the case
-        where Claude Code refreshes one source but not the other — observed
-        in the wild on Claude Code 2.1.x.)
-      - Otherwise, prefer the source with the later ``expiresAt`` so that
-        any subsequent refresh uses the most recent ``refreshToken``.
-
    This intentionally excludes ~/.claude.json primaryApiKey. Opencode's
    subscription flow is OAuth/setup-token based with refreshable credentials,
    and native direct Anthropic provider usage should follow that path rather
    than auto-detecting Claude's first-party managed key.

-    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
+    Returns dict with {accessToken, refreshToken?, expiresAt?, source?} or None.
    """
+    # Try macOS Keychain first (covers Claude Code >=2.1.114)
    kc_creds = _read_claude_code_credentials_from_keychain()
-    file_creds = _read_claude_code_credentials_from_file()
+    if kc_creds:
+        return kc_creds

-    if kc_creds and file_creds:
-        kc_valid = is_claude_code_token_valid(kc_creds)
-        file_valid = is_claude_code_token_valid(file_creds)
-        if kc_valid and not file_valid:
-            return kc_creds
-        if file_valid and not kc_valid:
-            return file_creds
-        # Both valid or both expired: prefer the later expiresAt so the
-        # downstream refresh path uses the freshest refresh_token.
-        kc_exp = kc_creds.get("expiresAt", 0) or 0
-        file_exp = file_creds.get("expiresAt", 0) or 0
-        return kc_creds if kc_exp >= file_exp else file_creds
+    # Fall back to JSON file
+    cred_path = Path.home() / ".claude" / ".credentials.json"
+    if cred_path.exists():
+        try:
+            data = json.loads(cred_path.read_text(encoding="utf-8"))
+            oauth_data = data.get("claudeAiOauth")
+            if oauth_data and isinstance(oauth_data, dict):
+                access_token = oauth_data.get("accessToken", "")
+                if access_token:
+                    return {
+                        "accessToken": access_token,
+                        "refreshToken": oauth_data.get("refreshToken", ""),
+                        "expiresAt": oauth_data.get("expiresAt", 0),
+                        "source": "claude_code_credentials_file",
+                    }
+        except (json.JSONDecodeError, OSError, IOError) as e:
+            logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)

-    return kc_creds or file_creds
+    return None


 def is_claude_code_token_valid(creds: Dict[str, Any]) -> bool:
@@ -1074,40 +1034,8 @@ def refresh_anthropic_oauth_pure(refresh_token: str, *, use_json: bool = False)


 def _refresh_oauth_token(creds: Dict[str, Any]) -> Optional[str]:
-    """Attempt to refresh an expired Claude Code OAuth token.
-
-    Claude Code's OAuth refresh tokens are single-use: a successful refresh
-    rotates the pair and invalidates the old refresh token. Claude Code itself
-    also refreshes on its own schedule (IDE/CLI activity), so by the time
-    Hermes notices an expired token, Claude Code may have already rotated it.
-    POSTing our now-stale refresh token in that window races Claude Code and
-    fails with ``invalid_grant``.
-
-    So before refreshing, re-read the live credential sources. If Claude Code
-    has already produced a valid token, adopt it and skip the POST entirely.
-    Only fall back to refreshing ourselves when no fresh credential is found.
-    """
-    # Claude Code may have already refreshed — adopt its token rather than
-    # racing it with our (possibly already-rotated) refresh token. Only adopt
-    # when the live re-read produced a DIFFERENT token with a real future
-    # expiry: re-adopting the same credential we were just handed would be a
-    # no-op, and a 0/absent ``expiresAt`` means "managed key / unknown expiry"
-    # (see is_claude_code_token_valid) which must NOT be treated as a fresh
-    # refresh here.
-    current = read_claude_code_credentials()
-    if current:
-        current_token = current.get("accessToken", "")
-        current_exp = current.get("expiresAt", 0) or 0
-        if (
-            current_token
-            and current_token != creds.get("accessToken", "")
-            and current_exp > 0
-            and is_claude_code_token_valid(current)
-        ):
-            logger.debug("Adopted Claude Code's already-refreshed OAuth token")
-            return current_token
-
-    refresh_token = (current or {}).get("refreshToken", "") or creds.get("refreshToken", "")
+    """Attempt to refresh an expired Claude Code OAuth token."""
+    refresh_token = creds.get("refreshToken", "")
    if not refresh_token:
        logger.debug("No refresh token available — cannot refresh")
        return None
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -102,7 +102,6 @@ OpenAI = _OpenAIProxy()  # module-level name, resolves lazily on call/isinstance

 from agent.credential_pool import load_pool
 from agent.model_metadata import MINIMUM_CONTEXT_LENGTH, get_model_context_length
-from agent.process_bootstrap import build_keepalive_http_client
 from hermes_cli.config import get_hermes_home
 from hermes_constants import OPENROUTER_BASE_URL
 from utils import base_url_host_matches, base_url_hostname, env_float, model_forces_max_completion_tokens, normalize_proxy_env_vars
@@ -110,23 +109,6 @@ from utils import base_url_host_matches, base_url_hostname, env_float, model_for
 logger = logging.getLogger(__name__)


-def _openai_http_client_kwargs(
-    base_url: Optional[str],
-    *,
-    async_mode: bool = False,
-) -> Dict[str, Any]:
-    """Inject keepalive httpx client with env-only proxy (not macOS system proxy)."""
-    client = build_keepalive_http_client(str(base_url or ""), async_mode=async_mode)
-    if client is None:
-        return {}
-    return {"http_client": client}
-
-
-def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
-    kwargs = {**_openai_http_client_kwargs(base_url), **kwargs}
-    return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
-
-
 # ── Interrupt protection for atomic auxiliary tasks ──────────────────────
 # Some auxiliary tasks must NOT be aborted mid-flight by a gateway interrupt
 # (e.g. an incoming user message while the agent is busy). Context
@@ -684,28 +666,6 @@ def _pool_runtime_base_url(entry: Any, fallback: str = "") -> str:
    return str(url or "").strip().rstrip("/")


-# Hostnames (lowercase, exact) that the auxiliary Anthropic path is allowed to
-# be pointed at via config.yaml model.base_url. Anything else falls back to the
-# Anthropic default — operators routing main-session traffic through a
-# non-Anthropic host (e.g. OpenRouter, OpenAI) with provider=anthropic in config
-# must NOT have that foreign host leak into the auxiliary client. See #52608.
-_ANTHROPIC_COMPATIBLE_HOSTS = frozenset({
-    "api.anthropic.com",
-})
-
-
-def _is_anthropic_compatible_host(url: str) -> bool:
-    """Return True if ``url``'s hostname is an Anthropic endpoint we trust for aux calls."""
-    if not url:
-        return False
-    try:
-        from urllib.parse import urlparse
-        host = (urlparse(url).hostname or "").strip().lower().rstrip(".")
-        return host in _ANTHROPIC_COMPATIBLE_HOSTS
-    except Exception:
-        return False
-
-
 def _nous_min_key_ttl_seconds() -> int:
    try:
        return max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800")))
@@ -1632,7 +1592,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
            _merged_aux = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_aux:
                extra["default_headers"] = _merged_aux
-            _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
+            _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
            _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
            return _client, model

@@ -1672,7 +1632,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
        _merged_aux2 = _apply_user_default_headers(extra.get("default_headers"))
        if _merged_aux2:
            extra["default_headers"] = _merged_aux2
-        _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
+        _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
        _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
        return _client, model

@@ -1687,21 +1647,20 @@ def _try_openrouter(explicit_api_key: str = None, model: str = None) -> Tuple[Op
    pool_present, entry = _select_pool_entry("openrouter")
    if pool_present:
        or_key = explicit_api_key or _pool_runtime_api_key(entry)
-        if or_key:
-            base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
-            logger.debug("Auxiliary client: OpenRouter via pool")
-            return _create_openai_client(api_key=or_key, base_url=base_url,
-                           default_headers=build_or_headers()), model or _OPENROUTER_MODEL
-        # Pool exists but is exhausted (no usable runtime key) — fall through to
-        # the OPENROUTER_API_KEY env-var path rather than failing outright.
-        logger.debug("Auxiliary client: OpenRouter pool exhausted, trying OPENROUTER_API_KEY")
+        if not or_key:
+            _mark_provider_unhealthy("openrouter", ttl=60)
+            return None, None
+        base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
+        logger.debug("Auxiliary client: OpenRouter via pool")
+        return OpenAI(api_key=or_key, base_url=base_url,
+                       default_headers=build_or_headers()), model or _OPENROUTER_MODEL

    or_key = explicit_api_key or os.getenv("OPENROUTER_API_KEY")
    if not or_key:
        _mark_provider_unhealthy("openrouter", ttl=60)
        return None, None
    logger.debug("Auxiliary client: OpenRouter")
-    return _create_openai_client(api_key=or_key, base_url=OPENROUTER_BASE_URL,
+    return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
                   default_headers=build_or_headers()), model or _OPENROUTER_MODEL


@@ -1794,7 +1753,7 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
            return None, None
        base_url = str((nous or {}).get("inference_base_url") or _nous_base_url()).rstrip("/")
    return (
-        _create_openai_client(
+        OpenAI(
            api_key=api_key,
            base_url=base_url,
        ),
@@ -2071,7 +2030,7 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
    if _custom_headers:
        _extra["default_headers"] = _custom_headers
    if custom_mode == "codex_responses":
-        real_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
+        real_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
        return CodexAuxiliaryClient(real_client, model), model
    if custom_mode == "anthropic_messages":
        # Third-party Anthropic-compatible gateway (MiniMax, Zhipu GLM,
@@ -2085,14 +2044,14 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
                "Custom endpoint declares api_mode=anthropic_messages but the "
                "anthropic SDK is not installed — falling back to OpenAI-wire."
            )
-            return _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra), model
+            return OpenAI(api_key=custom_key, base_url=_clean_base, **_extra), model
        return (
            AnthropicAuxiliaryClient(real_client, model, custom_key, custom_base, is_oauth=False),
            model,
        )
    # URL-based anthropic detection for custom endpoints that didn't set
    # api_mode explicitly (e.g. kimi.com/coding reached via custom config).
-    _fallback_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
+    _fallback_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
    _fallback_client = _maybe_wrap_anthropic(
        _fallback_client, model, custom_key, custom_base, custom_mode,
    )
@@ -2121,7 +2080,7 @@ def _build_xai_oauth_aux_client(model: str) -> Tuple[Optional[Any], Optional[str
        return None, None
    api_key, base_url = resolved
    logger.debug("Auxiliary client: xAI OAuth (%s via Responses API)", model)
-    real_client = _create_openai_client(api_key=api_key, base_url=base_url)
+    real_client = OpenAI(api_key=api_key, base_url=base_url)
    return CodexAuxiliaryClient(real_client, model), model


@@ -2158,7 +2117,7 @@ def _build_codex_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
            return None, None
        base_url = _CODEX_AUX_BASE_URL
    logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", model)
-    real_client = _create_openai_client(
+    real_client = OpenAI(
        api_key=codex_token,
        base_url=base_url,
        default_headers=_codex_cloudflare_headers(codex_token),
@@ -2258,7 +2217,7 @@ def _try_azure_foundry(
    if _dq:
        extra["default_query"] = _dq

-    client = _create_openai_client(api_key=api_key, base_url=_clean_base, **extra)
+    client = OpenAI(api_key=api_key, base_url=_clean_base, **extra)

    if runtime_api_mode == "codex_responses":
        # GPT-5.x / o-series / codex models on Azure Foundry are
@@ -2297,16 +2256,9 @@ def _try_anthropic(explicit_api_key: str = None) -> Tuple[Optional[Any], Optiona
    if not token:
        return None, None

-    # Allow base URL override from config.yaml model.base_url, but only when:
-    #   1. the configured provider is anthropic (otherwise a non-Anthropic
-    #      base_url, e.g. Codex endpoint, would leak into Anthropic requests), AND
-    #   2. the override URL actually points at an Anthropic-compatible endpoint.
-    # Without gate (2), operators who route main-session traffic through a
-    # non-Anthropic provider that accepts Anthropic-format requests (e.g.
-    # OpenRouter at openrouter.ai/api/v1, with provider=anthropic in config.yaml)
-    # would have every auxiliary side-channel call (memory extractors,
-    # reflection, vision, title generation) 401 from the foreign host —
-    # see issue #52608.
+    # Allow base URL override from config.yaml model.base_url, but only
+    # when the configured provider is anthropic — otherwise a non-Anthropic
+    # base_url (e.g. Codex endpoint) would leak into Anthropic requests.
    base_url = _pool_runtime_base_url(entry, _ANTHROPIC_DEFAULT_BASE_URL) if pool_present else _ANTHROPIC_DEFAULT_BASE_URL
    try:
        from hermes_cli.config import load_config
@@ -2316,7 +2268,7 @@ def _try_anthropic(explicit_api_key: str = None) -> Tuple[Optional[Any], Optiona
            cfg_provider = str(model_cfg.get("provider") or "").strip().lower()
            if cfg_provider == "anthropic":
                cfg_base_url = (model_cfg.get("base_url") or "").strip().rstrip("/")
-                if cfg_base_url and _is_anthropic_compatible_host(cfg_base_url):
+                if cfg_base_url:
                    base_url = cfg_base_url
    except Exception:
        pass
@@ -2802,25 +2754,6 @@ def _is_model_incompatible_error(exc: Exception) -> bool:
    ))


-def _is_invalid_aux_response_error(exc: Exception) -> bool:
-    """Detect provider responses that authenticated but cannot serve aux shape.
-
-    Some OpenAI-compatible routes return HTTP 200 with an empty/malformed
-    ChatCompletion instead of a normal provider error.  That is still a
-    provider/model capability failure for auxiliary tasks: downstream callers
-    need ``choices[0].message`` and should be able to continue through the
-    same fallback path as explicit model-incompatibility errors.
-    """
-    if not isinstance(exc, RuntimeError):
-        return False
-    msg = str(exc).lower()
-    return (
-        "auxiliary " in msg
-        and "llm returned invalid response" in msg
-        and "choices[0].message" in msg
-    )
-
-
 def _evict_cached_clients(provider: str) -> None:
    """Drop cached auxiliary clients for a provider so fresh creds are used."""
    normalized = _normalize_aux_provider(provider)
@@ -3643,37 +3576,6 @@ def _resolve_auto(
    # config.yaml (auxiliary.<task>.provider) still win over this.
    main_provider = str(runtime_provider or _read_main_provider() or "")
    main_model = str(runtime_model or _read_main_model() or "")
-
-    # MoA virtual provider: the "model" is a preset name (e.g. "opus-gpt") and
-    # there is no real "moa" HTTP endpoint, so resolving an aux client against
-    # provider="moa"/model=<preset> sends the preset name as the model id and
-    # the provider 400s ("opus-gpt is not a valid model ID"). Auxiliary tasks
-    # (title generation, compression, vision, …) don't need the reference
-    # fan-out — they should run on the aggregator, which is the preset's acting
-    # model. Resolve the MoA preset to its aggregator slot and continue Step 1
-    # with that real provider+model. Mirrors the MoA context-length resolution.
-    if main_provider == "moa":
-        try:
-            from hermes_cli.config import load_config
-            from hermes_cli.moa_config import resolve_moa_preset
-
-            _preset = resolve_moa_preset(load_config().get("moa") or {}, main_model)
-            _agg = _preset.get("aggregator") or {}
-            _agg_provider = str(_agg.get("provider") or "").strip()
-            _agg_model = str(_agg.get("model") or "").strip()
-            if _agg_provider and _agg_model and _agg_provider.lower() != "moa":
-                main_provider = _agg_provider
-                main_model = _agg_model
-                # The MoA virtual runtime carries a non-HTTP base_url
-                # ("moa://local") and a placeholder api_key; they belong to the
-                # facade, not the aggregator's real provider. Drop them so the
-                # aggregator resolves through its own provider credentials.
-                runtime_base_url = ""
-                runtime_api_key = ""
-                runtime_api_mode = ""
-        except Exception:
-            logger.debug("MoA aux resolution to aggregator failed", exc_info=True)
-
    if (main_provider and main_model
            and main_provider not in {"auto", ""}):
        resolved_provider = main_provider
@@ -3820,10 +3722,6 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
    _merged_async = _apply_user_default_headers(async_kwargs.get("default_headers"))
    if _merged_async:
        async_kwargs["default_headers"] = _merged_async
-    async_kwargs = {
-        **_openai_http_client_kwargs(sync_base_url, async_mode=True),
-        **async_kwargs,
-    }
    return AsyncOpenAI(**async_kwargs), model


@@ -4034,7 +3932,7 @@ def resolve_provider_client(
                               "but no Codex OAuth token found (run: hermes model)")
                return None, None
            final_model = _normalize_resolved_model(model, provider)
-            raw_client = _create_openai_client(
+            raw_client = OpenAI(
                api_key=codex_token,
                base_url=_CODEX_AUX_BASE_URL,
                default_headers=_codex_cloudflare_headers(codex_token),
@@ -4115,7 +4013,7 @@ def resolve_provider_client(
            _merged_custom = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_custom:
                extra["default_headers"] = _merged_custom
-            client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **extra)
+            client = OpenAI(api_key=custom_key, base_url=_clean_base, **extra)
            client = _wrap_if_needed(client, final_model, custom_base, custom_key)
            return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                    else (client, final_model))
@@ -4219,7 +4117,7 @@ def resolve_provider_client(
                        _fb_headers = _apply_user_default_headers(_fb_extra.get("default_headers"))
                        if _fb_headers:
                            _fb_extra["default_headers"] = _fb_headers
-                        client = _create_openai_client(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
+                        client = OpenAI(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
                        return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                                else (client, final_model))
                    sync_anthropic = AnthropicAuxiliaryClient(
@@ -4228,7 +4126,7 @@ def resolve_provider_client(
                    if async_mode:
                        return AsyncAnthropicAuxiliaryClient(sync_anthropic), final_model
                    return sync_anthropic, final_model
-                client = _create_openai_client(api_key=custom_key, base_url=_clean_base2, **_extra2)
+                client = OpenAI(api_key=custom_key, base_url=_clean_base2, **_extra2)
                # codex_responses or inherited auto-detect (via _wrap_if_needed).
                # _wrap_if_needed reads the closed-over `api_mode` (the task-level
                # override). Named-provider entry api_mode=codex_responses also
@@ -4370,7 +4268,7 @@ def resolve_provider_client(
        _merged_main = _apply_user_default_headers(headers)
        if _merged_main:
            headers = _merged_main
-        client = _create_openai_client(api_key=api_key, base_url=base_url,
+        client = OpenAI(api_key=api_key, base_url=base_url,
                        **({"default_headers": headers} if headers else {}))

        # Copilot GPT-5+ models (except gpt-5-mini) require the Responses
@@ -4906,7 +4804,7 @@ def _refresh_nous_auxiliary_client(
        return None, model

    fresh_key, fresh_base_url = runtime
-    sync_client = _create_openai_client(api_key=fresh_key, base_url=fresh_base_url)
+    sync_client = OpenAI(api_key=fresh_key, base_url=fresh_base_url)
    final_model = model

    current_loop = None
@@ -5489,24 +5387,10 @@ def _build_call_kwargs(
        # ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where
        # max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only
        # there.
-        #
-        # NVIDIA NIM (integrate.api.nvidia.com and local NIM endpoints) is a
-        # second exception: some models—notably minimaxai/minimax-m3—return HTTP
-        # 200 with an empty choices[] payload when max_tokens is omitted. The main
-        # NVIDIA chat path already sends an output cap via the provider profile;
-        # preserve it on the auxiliary path too.
        _effective_base = base_url or (
            _current_custom_base_url() if provider == "custom" else ""
        )
-        _provider_norm = str(provider or "").strip().lower()
-        _is_nvidia_nim = (
-            _provider_norm in {"nvidia", "nvidia-nim", "nim", "build-nvidia", "nemotron"}
-            or base_url_host_matches(_effective_base, "integrate.api.nvidia.com")
-        )
-        if (
-            _is_anthropic_compat_endpoint(provider, _effective_base)
-            or _is_nvidia_nim
-        ):
+        if _is_anthropic_compat_endpoint(provider, _effective_base):
            kwargs["max_tokens"] = max_tokens

    if tools:
@@ -5561,9 +5445,6 @@ def _validate_llm_response(response: Any, task: str = None) -> Any:
        if not choices or not hasattr(choices[0], "message"):
            raise AttributeError("missing choices[0].message")
    except (AttributeError, TypeError, IndexError) as exc:
-        recovered = _recover_aux_response_message(response)
-        if recovered is not None:
-            return recovered
        response_type = type(response).__name__
        response_preview = str(response)[:120]
        raise RuntimeError(
@@ -5575,64 +5456,6 @@ def _validate_llm_response(response: Any, task: str = None) -> Any:
    return response


-def _recover_aux_response_message(response: Any) -> Optional[Any]:
-    """Synthesize chat-completions shape from Responses-style text fields.
-
-    Auxiliary callers consume ``choices[0].message``.  Some compatible
-    endpoints return text outside ``choices`` (for example ``output_text`` or
-    ``output`` items).  Preserve that response before declaring it malformed.
-    """
-    text = _extract_aux_response_text(response)
-    if not text:
-        return None
-
-    choice = SimpleNamespace(
-        message=SimpleNamespace(content=text),
-        finish_reason=getattr(response, "finish_reason", None) or "stop",
-    )
-    try:
-        response.choices = [choice]
-        return response
-    except Exception:
-        return SimpleNamespace(
-            id=getattr(response, "id", ""),
-            model=getattr(response, "model", ""),
-            object=getattr(response, "object", "chat.completion"),
-            choices=[choice],
-            usage=getattr(response, "usage", None),
-        )
-
-
-def _extract_aux_response_text(response: Any) -> str:
-    output_text = _obj_get(response, "output_text")
-    if isinstance(output_text, str) and output_text.strip():
-        return output_text.strip()
-
-    output = _obj_get(response, "output")
-    if not isinstance(output, list):
-        return ""
-
-    parts: List[str] = []
-    for item in output:
-        item_type = _obj_get(item, "type")
-        if item_type and item_type != "message":
-            continue
-        for part in (_obj_get(item, "content") or []):
-            part_type = _obj_get(part, "type")
-            if part_type in {"output_text", "text", None}:
-                text = _obj_get(part, "text")
-                if isinstance(text, str) and text.strip():
-                    parts.append(text.strip())
-    return "\n".join(parts).strip()
-
-
-def _obj_get(obj: Any, key: str, default: Any = None) -> Any:
-    value = getattr(obj, key, default)
-    if value is default and isinstance(obj, dict):
-        value = obj.get(key, default)
-    return value
-
-
 def call_llm(
    task: str = None,
    *,
@@ -6030,21 +5853,11 @@ def call_llm(
        # When the provider returns a 429 rate-limit (not billing), fall
        # back to an alternative provider instead of exhausting retries
        # against the same rate-limited endpoint.
-        #
-        # ── Auth error fallback (#21165) ─────────────────────────────
-        # When the resolved provider returns 401 and neither the Nous
-        # refresh path nor explicit provider credential refresh applies,
-        # fall back to an alternative provider instead of dropping the
-        # auxiliary task on the floor (silent compression failure /
-        # message loss). Auth is NOT a capacity error: it only bypasses
-        # the explicit-provider gate when the user is in auto mode.
        should_fallback = (
-            _is_auth_error(first_err)
-            or _is_payment_error(first_err)
+            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
        )
        # Respect explicit provider choice for transient errors (auth, request
        # validation, etc.) but allow fallback when the provider clearly cannot
@@ -6067,12 +5880,9 @@ def call_llm(
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
        )
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_auth_error(first_err):
-                reason = "auth error"
-            elif _is_payment_error(first_err):
+            if _is_payment_error(first_err):
                reason = "payment error"
                # Resolve the actual provider label (resolved_provider may be
                # "auto"; the client's base_url tells us which backend got the
@@ -6085,8 +5895,6 @@ def call_llm(
                reason = "rate limit"
            elif _is_model_incompatible_error(first_err):
                reason = "model incompatible with route"
-            elif _is_invalid_aux_response_error(first_err):
-                reason = "invalid provider response"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
@@ -6521,17 +6329,11 @@ async def async_call_llm(
                        raise

        # ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ──
-        # Auth error fallback (#21165): a 401 that survived the refresh path
-        # falls back in auto mode just like the sync call_llm() path. Auth is
-        # NOT a capacity error, so on an explicit provider it still respects
-        # the user's choice (handled by the is_auto/is_capacity_error gate).
        should_fallback = (
-            _is_auth_error(first_err)
-            or _is_payment_error(first_err)
+            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
        )
        # Capacity errors (payment/quota/connection/rate-limit) bypass the
        # explicit-provider gate — the provider cannot serve the request
@@ -6546,12 +6348,9 @@ async def async_call_llm(
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
        )
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_auth_error(first_err):
-                reason = "auth error"
-            elif _is_payment_error(first_err):
+            if _is_payment_error(first_err):
                reason = "payment error"
                _mark_provider_unhealthy(
                    _recoverable_pool_provider(resolved_provider, client) or resolved_provider
@@ -6560,8 +6359,6 @@ async def async_call_llm(
                reason = "rate limit"
            elif _is_model_incompatible_error(first_err):
                reason = "model incompatible with route"
-            elif _is_invalid_aux_response_error(first_err):
-                reason = "invalid provider response"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -28,7 +28,6 @@ from typing import Any, Dict, Optional
 from hermes_cli.timeouts import get_provider_request_timeout, get_provider_stale_timeout
 from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
 from agent.error_classifier import FailoverReason
-from agent.gemini_native_adapter import is_native_gemini_base_url
 from agent.model_metadata import is_local_endpoint
 from agent.message_sanitization import (
    _sanitize_surrogates,
@@ -38,18 +37,6 @@ from tools.terminal_tool import is_persistent_env
 from utils import base_url_host_matches, base_url_hostname, env_float, env_int

 logger = logging.getLogger(__name__)
-_OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}
-
-# When the fallback chain is fully exhausted on a non-rate-limit failure
-# (e.g. every provider returns a non-retryable client error like HTTP 400),
-# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
-# and does not reset _fallback_index=0 to replay the entire chain again.
-# Without this, a client/gateway that re-submits immediately would re-marshal
-# the full (potentially 80k-token) context once per provider every turn and
-# can drive a constrained host into memory/swap exhaustion.  Rate-limit /
-# billing reasons keep their own 60s cooldown (set above); this is the
-# narrower non-rate-limit case.  See issue #24996.
-_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0


 def _ra():
@@ -128,23 +115,6 @@ def _is_openai_codex_backend(agent) -> bool:
    )


-def _validated_openrouter_provider_sort(raw_sort: Any) -> Optional[str]:
-    """Return a normalized OpenRouter provider.sort value or None."""
-    if not isinstance(raw_sort, str):
-        return None
-    sort_value = raw_sort.strip().lower()
-    if not sort_value:
-        return None
-    if sort_value in _OPENROUTER_PROVIDER_SORT_VALUES:
-        return sort_value
-    logger.warning(
-        "Ignoring invalid OpenRouter provider.sort value %r (allowed: %s)",
-        raw_sort,
-        ", ".join(sorted(_OPENROUTER_PROVIDER_SORT_VALUES)),
-    )
-    return None
-
-
 def _env_float(name: str, default: float) -> float:
    try:
        return float(os.getenv(name, str(default)))
@@ -259,11 +229,6 @@ def interruptible_api_call(agent, api_kwargs: dict):
                        invalidate_runtime_client(region)
                    raise
                result["response"] = normalize_converse_response(raw_response)
-            elif agent.provider == "moa":
-                # MoA is a virtual chat-completions provider backed by the
-                # in-process MoAClient facade. Do not rebuild a request-local
-                # OpenAI client from the virtual runtime metadata.
-                result["response"] = agent.client.chat.completions.create(**api_kwargs)
            else:
                request_client = _set_request_client(
                    agent._create_request_openai_client(
@@ -733,9 +698,8 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
        _prefs["ignore"] = agent.providers_ignored
    if agent.providers_order:
        _prefs["order"] = agent.providers_order
-    _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
-    if _provider_sort:
-        _prefs["sort"] = _provider_sort
+    if agent.provider_sort:
+        _prefs["sort"] = agent.provider_sort
    if agent.provider_require_parameters:
        _prefs["require_parameters"] = True
    if agent.provider_data_collection:
@@ -1051,23 +1015,18 @@ def build_assistant_message(agent, assistant_message, finish_reason: str) -> dic
                    "arguments": tool_call.function.arguments
                },
            }
-            # Tool-call arguments are intentionally NOT redacted here. This
-            # dict enters the in-memory conversation history that is replayed
-            # to the model on every subsequent turn AND persisted to state.db,
-            # which is itself replayed verbatim on session resume
-            # (get_messages_as_conversation). Masking a credential to `***`
-            # here poisons that replay: the model reads back its own
-            # `PGPASSWORD='***' psql ...` call and copies the placeholder into
-            # the next tool call, breaking every credential-dependent command
-            # on the second turn (#43083). The masking also provided no real
-            # protection — the same secret still leaks verbatim through tool
-            # OUTPUT (file contents, command output, diffs, the compaction
-            # block), none of which this pass ever touched. Keeping secrets
-            # out of the replayable store is a separate tokenization/vault
-            # concern, not something arg-redaction can deliver without
-            # breaking replay. Storage-time redaction remains governed by the
-            # `security.redact_secrets` toggle. (#19798 introduced this;
-            # #43083 removed it.)
+            # Defence-in-depth: redact credentials from tool call arguments
+            # before they enter conversation history. Tool execution uses the
+            # raw API response object, not this dict, so redacting the
+            # persisted shape is safe and only affects storage. Catches the
+            # case where a model accidentally inlines a secret into a tool
+            # call (e.g. `terminal(command="curl -H 'Authorization: Bearer
+            # sk-...'")`). (#19798)
+            if isinstance(tc_dict["function"]["arguments"], str):
+                from agent.redact import redact_sensitive_text
+                tc_dict["function"]["arguments"] = redact_sensitive_text(
+                    tc_dict["function"]["arguments"]
+                )
            # Preserve extra_content (e.g. Gemini thought_signature) so it
            # is sent back on subsequent API calls.  Without this, Gemini 3
            # thinking models reject the request with a 400 error.
@@ -1134,22 +1093,8 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
        if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
            agent._rate_limited_until = time.monotonic() + 60
    if agent._fallback_index >= len(agent._fallback_chain):
-        # Chain exhausted.  If we actually walked a non-empty chain and the
-        # failure was NOT a rate-limit/billing event (those already armed
-        # their own 60s cooldown above), arm a short cooldown so the next
-        # turn's restore_primary_runtime stays gated instead of resetting
-        # _fallback_index=0 and re-marshaling the whole context across every
-        # provider again.  Guards the cross-turn replay storm in #24996.
-        if (
-            len(agent._fallback_chain) > 0
-            and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
-        ):
-            _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
-            agent._rate_limited_until = max(
-                _existing_cooldown,
-                time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
-            )
        return False
+
    fb = agent._fallback_chain[agent._fallback_index]
    agent._fallback_index += 1
    fb_provider = (fb.get("provider") or "").strip().lower()
@@ -1265,16 +1210,14 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
            agent._transport_cache.clear()
        agent._fallback_activated = True

-        # Rebind the credential pool to the fallback provider when the provider
-        # changes.  Keeping the primary pool attached would make downstream
-        # recovery (rate_limit / billing / auth) mutate the wrong credential
-        # set and can overwrite the fallback's base_url back to the primary
-        # endpoint.  See #33163.
-        #
+        # Clear the credential pool when the fallback provider doesn't match
+        # the pool's provider.  The pool was seeded for the primary provider;
+        # leaving it attached means downstream recovery (rate_limit / billing /
+        # auth) calls ``_swap_credential`` with a primary entry which overwrites
+        # the agent's ``base_url`` back to the primary's endpoint — every
+        # fallback request then 404s against the wrong host.  See #33163.
        # When the fallback shares the pool's provider (e.g. both openrouter
-        # entries with different routing) the pool is preserved.  When the
-        # providers differ, load the fallback provider's own pool if one exists
-        # so provider-specific rotation continues to work after the switch.
+        # entries with different routing) the pool is preserved.
        _existing_pool = getattr(agent, "_credential_pool", None)
        if _existing_pool is not None:
            _pool_provider = (getattr(_existing_pool, "provider", "") or "").strip().lower()
@@ -1285,22 +1228,6 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
                    fb_provider, fb_model, _pool_provider,
                )
                agent._credential_pool = None
-        if getattr(agent, "_credential_pool", None) is None:
-            try:
-                from agent.credential_pool import load_pool
-
-                fallback_pool = load_pool(fb_provider)
-                if fallback_pool and fallback_pool.has_credentials():
-                    agent._credential_pool = fallback_pool
-                    logger.info(
-                        "Fallback to %s/%s: attached fallback credential pool",
-                        fb_provider, fb_model,
-                    )
-            except Exception as exc:
-                logger.debug(
-                    "Fallback to %s/%s: could not attach credential pool: %s",
-                    fb_provider, fb_model, exc,
-                )

        # Honor per-provider / per-model request_timeout_seconds for the
        # fallback target (same knob the primary client uses).  None = use
@@ -1531,9 +1458,8 @@ def handle_max_iterations(agent, messages: list, api_call_count: int) -> str:
                provider_preferences["ignore"] = agent.providers_ignored
            if agent.providers_order:
                provider_preferences["order"] = agent.providers_order
-            _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
-            if _provider_sort:
-                provider_preferences["sort"] = _provider_sort
+            if agent.provider_sort:
+                provider_preferences["sort"] = agent.provider_sort
            if provider_preferences and (
                (agent.provider or "").strip().lower() == "openrouter"
                or agent._is_openrouter_url()
@@ -1912,6 +1838,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
        stream_kwargs = {
            **api_kwargs,
            "stream": True,
+            "stream_options": {"include_usage": True},
            "timeout": _httpx.Timeout(
                connect=_conn_cap,
                read=_stream_read_timeout,
@@ -1919,14 +1846,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pool=_conn_cap,
            ),
        }
-        # OpenAI's `stream_options={"include_usage": True}` drives usage
-        # accounting on OpenAI-compatible endpoints (incl. the Gemini OpenAI
-        # compat shim and aggregators like OpenRouter).  Google's *native*
-        # Gemini REST endpoint rejects the keyword outright
-        # (`Completions.create() got an unexpected keyword argument
-        # 'stream_options'`), so omit it only for that endpoint.
-        if not is_native_gemini_base_url(agent.base_url):
-            stream_kwargs["stream_options"] = {"include_usage": True}
        request_client = _set_request_client(
            agent._create_request_openai_client(
                reason="chat_completion_stream_request",
@@ -2327,15 +2246,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                                _fire_first_delta()
                                agent._fire_reasoning_delta(thinking_text)

-            # Return the native Anthropic Message for downstream processing.
-            # If the stream was interrupted (the event loop broke out above on
-            # agent._interrupt_requested), do NOT call get_final_message() — on
-            # a partially-consumed stream the SDK may hang draining remaining
-            # events or return a Message with incomplete tool_use blocks (partial
-            # JSON in `input`). The outer poll loop raises InterruptedError, so
-            # this return value is discarded anyway.
-            if agent._interrupt_requested:
-                return None
+            # Return the native Anthropic Message for downstream processing
            return stream.get_final_message()

    def _call():
@@ -2480,19 +2391,12 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            diag=request_client_holder.get("diag"),
                        )
                        _close_request_client_once("stream_mid_tool_retry_cleanup")
-                        if agent.api_mode == "anthropic_messages":
-                            try:
-                                agent._anthropic_client.close()
-                                agent._rebuild_anthropic_client()
-                            except Exception:
-                                pass
-                        else:
-                            try:
-                                agent._replace_primary_openai_client(
-                                    reason="stream_mid_tool_retry_pool_cleanup"
-                                )
-                            except Exception:
-                                pass
+                        try:
+                            agent._replace_primary_openai_client(
+                                reason="stream_mid_tool_retry_pool_cleanup"
+                            )
+                        except Exception:
+                            pass
                        continue

                    # SSE error events from proxies (e.g. OpenRouter sends
@@ -2540,19 +2444,12 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            _close_request_client_once("stream_retry_cleanup")
                            # Also rebuild the primary client to purge
                            # any dead connections from the pool.
-                            if agent.api_mode == "anthropic_messages":
-                                try:
-                                    agent._anthropic_client.close()
-                                    agent._rebuild_anthropic_client()
-                                except Exception:
-                                    pass
-                            else:
-                                try:
-                                    agent._replace_primary_openai_client(
-                                        reason="stream_retry_pool_cleanup"
-                                    )
-                                except Exception:
-                                    pass
+                            try:
+                                agent._replace_primary_openai_client(
+                                    reason="stream_retry_pool_cleanup"
+                                )
+                            except Exception:
+                                pass
                            continue
                        # Retries exhausted. Log the final failure with
                        # full diagnostic detail (chain, headers,
@@ -2664,17 +2561,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
            _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0)
        else:
            _stream_stale_timeout = _stream_stale_timeout_base
-        # Reasoning-model floor: known reasoning models (Nemotron 3 Ultra,
-        # OpenAI o1/o3, Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ,
-        # xAI Grok reasoning, etc.) routinely exceed the default 180s chat-
-        # model threshold during their thinking phase.  The cloud gateway
-        # upstream kills the socket first, surfacing as BrokenPipeError.
-        # Raises the floor only — never overrides explicit user config
-        # (handled by get_provider_stale_timeout above).
-        from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
-        _reasoning_floor = get_reasoning_stale_timeout_floor(api_kwargs.get("model"))
-        if _reasoning_floor is not None:
-            _stream_stale_timeout = max(_stream_stale_timeout, _reasoning_floor)

    t = threading.Thread(target=_call, daemon=True)
    t.start()
@@ -2723,17 +2609,10 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pass
            # Rebuild the primary client too — its connection pool
            # may hold dead sockets from the same provider outage.
-            if agent.api_mode == "anthropic_messages":
-                try:
-                    agent._anthropic_client.close()
-                    agent._rebuild_anthropic_client()
-                except Exception:
-                    pass
-            else:
-                try:
-                    agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
-                except Exception:
-                    pass
+            try:
+                agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
+            except Exception:
+                pass
            # Reset the timer so we don't kill repeatedly while
            # the inner thread processes the closure.
            last_chunk_time["t"] = time.time()
@@ -2809,30 +2688,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                role="assistant", content=_partial_text, tool_calls=None,
                reasoning_content=None,
            )
-            # Detect provider output-layer content filtering (e.g. MiniMax
-            # "output new_sensitive (1027)", Azure/OpenAI content_filter,
-            # Anthropic safety refusal).  The raw error is about to be
-            # swallowed into a finish_reason=length stub, so classify it HERE
-            # while we still have it and stamp the stub.  Retrying such a
-            # content-deterministic filter on the same primary just re-hits
-            # the filter — the conversation loop reads this tag and activates
-            # the fallback chain instead of burning continuation retries.
-            # error_classifier is the single source of truth for "what counts
-            # as a content filter" (#32421).
-            _content_filter_terminated = False
-            try:
-                from agent.error_classifier import classify_api_error, FailoverReason
-                _cls = classify_api_error(
-                    result["error"],
-                    provider=str(getattr(agent, "provider", "") or ""),
-                    model=str(getattr(agent, "model", "") or ""),
-                )
-                _content_filter_terminated = (
-                    _cls.reason == FailoverReason.content_policy_blocked
-                )
-            except Exception:
-                _content_filter_terminated = False
-            _stub = SimpleNamespace(
+            return SimpleNamespace(
                id=PARTIAL_STREAM_STUB_ID,
                model=getattr(agent, "model", "unknown"),
                choices=[SimpleNamespace(
@@ -2841,9 +2697,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                usage=None,
                _dropped_tool_names=_partial_names or None,
            )
-            if _content_filter_terminated:
-                _stub._content_filter_terminated = True
-            return _stub
        raise result["error"]
    return result["response"]

--- a/agent/coding_context.py
+++ b/agent/coding_context.py
@@ -60,8 +60,6 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional

-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
-
 logger = logging.getLogger("hermes.coding_context")

 CODING_TOOLSET = "coding"
@@ -649,14 +647,12 @@ def _enabled_mcp_servers(config: Optional[dict[str, Any]]) -> list[str]:


 def _git(cwd: Path, *args: str) -> str:
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        out = subprocess.run(
            ["git", "-C", str(cwd), *args],
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT,
-            **_popen_kwargs,
        )
    except (OSError, subprocess.SubprocessError):
        return ""
--- a/agent/context_breakdown.py
+++ b/agent/context_breakdown.py
@@ -1,156 +0,0 @@
-"""Live session context-window breakdown for UI surfaces.
-
-Estimates how the next provider request is composed: system prompt tiers,
-tool schemas, and conversation history. Uses the same rough char/4 heuristic
-as ``agent.model_metadata.estimate_request_tokens_rough`` so numbers align
-with compression thresholds — not exact tokenizer counts.
-"""
-
-from __future__ import annotations
-
-import json
-import re
-from typing import Any, Dict, List, Optional, Sequence, Tuple
-
-_SKILLS_BLOCK_RE = re.compile(r"<available_skills>.*?</available_skills>", re.DOTALL)
-
-_SUBAGENT_TOOL_NAMES = frozenset({"delegate_task"})
-
-_CATEGORY_COLORS = {
-    "system_prompt": "var(--context-usage-system)",
-    "tool_definitions": "var(--context-usage-tools)",
-    "rules": "var(--context-usage-rules)",
-    "skills": "var(--context-usage-skills)",
-    "mcp": "var(--context-usage-mcp)",
-    "subagent_definitions": "var(--context-usage-subagents)",
-    "memory": "var(--context-usage-memory)",
-    "conversation": "var(--context-usage-conversation)",
-}
-
-
-def _chars_to_tokens(text: str) -> int:
-    if not text:
-        return 0
-    return (len(text) + 3) // 4
-
-
-def _json_tokens(value: Any) -> int:
-    if not value:
-        return 0
-    return _chars_to_tokens(json.dumps(value, ensure_ascii=False))
-
-
-def _tool_name(tool: dict) -> str:
-    fn = tool.get("function") if isinstance(tool, dict) else None
-    if isinstance(fn, dict):
-        return str(fn.get("name") or "")
-    return str(tool.get("name") or "")
-
-
-def _split_tools(tools: Sequence[dict]) -> Tuple[List[dict], List[dict], List[dict]]:
-    builtin: List[dict] = []
-    mcp: List[dict] = []
-    subagent: List[dict] = []
-    for tool in tools:
-        name = _tool_name(tool)
-        if name.startswith("mcp_"):
-            mcp.append(tool)
-        elif name in _SUBAGENT_TOOL_NAMES:
-            subagent.append(tool)
-        else:
-            builtin.append(tool)
-    return builtin, mcp, subagent
-
-
-def _memory_blocks(agent: Any) -> Tuple[str, str]:
-    memory_block = ""
-    user_block = ""
-    store = getattr(agent, "_memory_store", None)
-    if store is None:
-        return memory_block, user_block
-    try:
-        if getattr(agent, "_memory_enabled", True):
-            memory_block = store.format_for_system_prompt("memory") or ""
-        if getattr(agent, "_user_profile_enabled", True):
-            user_block = store.format_for_system_prompt("user") or ""
-    except Exception:
-        pass
-    return memory_block, user_block
-
-
-def _strip_blocks(text: str, *blocks: str) -> str:
-    out = text
-    for block in blocks:
-        if block:
-            out = out.replace(block, "")
-    return out.strip()
-
-
-def compute_session_context_breakdown(
-    agent: Any,
-    messages: Optional[List[dict]] = None,
-) -> Dict[str, Any]:
-    """Return a Cursor-style context usage breakdown for one live agent."""
-    from agent.model_metadata import estimate_messages_tokens_rough
-    from agent.system_prompt import build_system_prompt_parts
-
-    parts = build_system_prompt_parts(agent)
-    stable = parts.get("stable", "") or ""
-    context = parts.get("context", "") or ""
-    volatile = parts.get("volatile", "") or ""
-
-    skills_match = _SKILLS_BLOCK_RE.search(stable)
-    skills_index = skills_match.group(0) if skills_match else ""
-
-    memory_block, user_block = _memory_blocks(agent)
-    memory_text = "\n\n".join(part for part in (memory_block, user_block) if part).strip()
-
-    system_core = _strip_blocks(stable, skills_index)
-    system_tail = _strip_blocks(volatile, memory_block, user_block)
-    system_prompt_text = "\n\n".join(part for part in (system_core, system_tail) if part).strip()
-
-    tools = list(getattr(agent, "tools", None) or [])
-    builtin_tools, mcp_tools, subagent_tools = _split_tools(tools)
-
-    conversation_tokens = estimate_messages_tokens_rough(messages or [])
-
-    categories = [
-        ("system_prompt", "System prompt", _chars_to_tokens(system_prompt_text)),
-        ("tool_definitions", "Tool definitions", _json_tokens(builtin_tools)),
-        ("rules", "Rules", _chars_to_tokens(context)),
-        ("skills", "Skills", _chars_to_tokens(skills_index)),
-        ("mcp", "MCP", _json_tokens(mcp_tools)),
-        ("subagent_definitions", "Subagent definitions", _json_tokens(subagent_tools)),
-        ("memory", "Memory", _chars_to_tokens(memory_text)),
-        ("conversation", "Conversation", conversation_tokens),
-    ]
-
-    estimated_total = sum(tokens for _, _, tokens in categories)
-
-    comp = getattr(agent, "context_compressor", None)
-    context_max = int(getattr(comp, "context_length", 0) or 0) if comp else 0
-    measured_used = int(getattr(comp, "last_prompt_tokens", 0) or 0) if comp else 0
-    context_used = measured_used if measured_used > 0 else estimated_total
-    context_percent = (
-        max(0, min(100, round(context_used / context_max * 100)))
-        if context_max
-        else 0
-    )
-
-    return {
-        "categories": [
-            {
-                "color": _CATEGORY_COLORS.get(category_id, "var(--ui-text-tertiary)"),
-                "id": category_id,
-                "label": label,
-                "tokens": tokens,
-            }
-            for category_id, label, tokens in categories
-            if tokens > 0
-        ],
-        "context_max": context_max,
-        "context_percent": context_percent,
-        "context_used": context_used,
-        "estimated_total": estimated_total,
-        "model": getattr(agent, "model", "") or "",
-    }
--- a/agent/context_references.py
+++ b/agent/context_references.py
@@ -12,7 +12,6 @@ from pathlib import Path
 from typing import Awaitable, Callable

 from agent.model_metadata import estimate_tokens_rough
-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags

 _QUOTED_REFERENCE_VALUE = r'(?:`[^`\n]+`|"[^"\n]+"|\'[^\'\n]+\')'
 REFERENCE_PATTERN = re.compile(
@@ -291,7 +290,6 @@ def _expand_git_reference(
    args: list[str],
    label: str,
 ) -> tuple[str | None, str | None]:
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["git", *args],
@@ -300,7 +298,6 @@ def _expand_git_reference(
            text=True,
            timeout=30,
            stdin=subprocess.DEVNULL,
-            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"{ref.raw}: git command timed out (30s)", None
@@ -328,9 +325,9 @@ async def _fetch_url_content(
 async def _default_url_fetcher(url: str) -> str:
    from tools.web_tools import web_extract_tool

-    raw = await web_extract_tool([url], format="markdown")
+    raw = await web_extract_tool([url], format="markdown", use_llm_processing=True)
    payload = json.loads(raw)
-    docs = payload.get("results", [])
+    docs = payload.get("data", {}).get("documents", [])
    if not docs:
        return ""
    doc = docs[0]
@@ -486,7 +483,6 @@ def _iter_visible_entries(path: Path, cwd: Path, limit: int) -> list[Path]:


 def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["rg", "--files", str(path.relative_to(cwd))],
@@ -495,7 +491,6 @@ def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
            text=True,
            timeout=10,
            stdin=subprocess.DEVNULL,
-            **_popen_kwargs,
        )
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return None
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -288,29 +288,6 @@ def replay_compression_warning(agent: Any) -> None:
            pass


-def conversation_history_after_compression(agent: Any, messages: list) -> Optional[list]:
-    """Return the correct flush baseline after a compression boundary.
-
-    Legacy compression rotates to a fresh child session. That child has not
-    seen the compacted transcript through the normal same-turn flush path yet,
-    so callers must clear ``conversation_history`` to ``None`` and let the next
-    persistence call write the whole compacted list.
-
-    In-place compaction is different: ``archive_and_compact()`` has already
-    soft-archived the previous active rows and inserted ``messages`` as the new
-    active live transcript under the same session id. If the same agent turn
-    continues with ``conversation_history=None``, the identity-based flush path
-    treats those already-persisted compacted dicts as new and appends them a
-    second time, doubling the active context and retriggering compression.
-
-    A shallow copy is intentional: it captures the current compacted dict
-    identities as history while allowing later same-turn appends to remain new.
-    """
-    if bool(getattr(agent, "_last_compaction_in_place", False)):
-        return list(messages)
-    return None
-
-
 def compress_context(
    agent: Any,
    messages: list,
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -28,7 +28,6 @@ import uuid
 from typing import Any, Dict, List, Optional

 from agent.codex_responses_adapter import _summarize_user_message_for_log
-from agent.conversation_compression import conversation_history_after_compression
 from agent.display import KawaiiSpinner
 from agent.error_classifier import FailoverReason, classify_api_error
 from agent.iteration_budget import IterationBudget
@@ -588,13 +587,6 @@ def run_conversation(
    compression_attempts = 0
    _turn_exit_reason = "unknown"  # Diagnostic: why the loop ended

-    # Per-turn tally of consecutive successful credential-pool token refreshes,
-    # keyed by (provider, pool-entry-id). A persistent upstream 401 lets
-    # ``try_refresh_current()`` "succeed" forever on a single-entry OAuth pool,
-    # so this tally caps same-entry refreshes and lets the fallback chain take
-    # over instead of spinning. Reset here so each turn starts fresh. See #26080.
-    agent._auth_pool_refresh_counts = {}
-
    # Optional opt-in runtime: if api_mode == codex_app_server, hand the
    # turn to the codex app-server subprocess (terminal/file ops/patching
    # all run inside Codex). Default Hermes path is bypassed entirely.
@@ -835,6 +827,7 @@ def run_conversation(
                    aggregator=moa_config.get("aggregator") or {},
                    temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
                    aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
+                    max_tokens=int(moa_config.get("max_tokens", 4096) or 4096),
                )
                if _moa_context:
                    for _msg in reversed(api_messages):
@@ -1699,56 +1692,6 @@ def run_conversation(

                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
-                        # ── Content-filter stream stall → fallback (#32421) ──
-                        # When the provider's output-layer safety filter (e.g.
-                        # MiniMax "output new_sensitive (1027)", Azure
-                        # content_filter) kills the stream mid-delivery, the
-                        # raw error was classified at the swallow point and the
-                        # stub tagged ``_content_filter_terminated``.  This
-                        # filter is content-deterministic — continuation
-                        # retries against the SAME primary just re-hit it and
-                        # burn paid attempts (the loop used to give up with
-                        # "Response remained truncated after 3 continuation
-                        # attempts" and never consult the fallback chain).
-                        # Escalate to the configured fallback BEFORE retrying.
-                        _cf_terminated = getattr(
-                            response, "_content_filter_terminated", False
-                        )
-                        if (
-                            _cf_terminated
-                            and agent._fallback_index < len(agent._fallback_chain)
-                        ):
-                            agent._vprint(
-                                f"{agent.log_prefix}🛡️  Content filter terminated "
-                                f"stream — activating fallback provider...",
-                                force=True,
-                            )
-                            agent._emit_status(
-                                "Content filter terminated stream; switching to fallback..."
-                            )
-                            if agent._try_activate_fallback():
-                                # Roll the partial content (if any was already
-                                # appended in a prior continuation pass) back to
-                                # the last clean turn so the fallback provider
-                                # gets a coherent continuation point.
-                                if truncated_response_parts:
-                                    messages = agent._get_messages_up_to_last_assistant(messages)
-                                agent._session_messages = messages
-                                length_continue_retries = 0
-                                truncated_response_parts = []
-                                retry_count = 0
-                                compression_attempts = 0
-                                _retry.primary_recovery_attempted = False
-                                _retry.restart_with_rebuilt_messages = True
-                                break
-                            # No fallback available — fall through to normal
-                            # continuation (best-effort, may loop).
-                            agent._vprint(
-                                f"{agent.log_prefix}⚠️  No fallback provider "
-                                f"configured — retrying with same provider "
-                                f"(may re-hit filter)...",
-                                force=True,
-                            )
                        if assistant_message is not None and not _trunc_has_tool_calls:
                            length_continue_retries += 1
                            interim_msg = agent._build_assistant_message(assistant_message, finish_reason)
@@ -2068,21 +2011,9 @@ def run_conversation(
                    agent.thinking_callback("")
                api_elapsed = time.time() - api_start_time
                agent._vprint(f"{agent.log_prefix}⚡ Interrupted during API call.", force=True)
-                interrupted = True
-                # Preserve any assistant text already streamed to the user
-                # before the stop landed. Dropping it leaves history with no
-                # record of the half-finished reply on screen, so the next turn
-                # the model "forgets" what it just said — exactly what users hit
-                # when they stop to redirect mid-response.
-                _partial = agent._strip_think_blocks(
-                    getattr(agent, "_current_streamed_assistant_text", "") or ""
-                ).strip()
-                if _partial:
-                    messages.append({"role": "assistant", "content": _partial})
-                    final_response = _partial
-                else:
-                    final_response = f"{INTERRUPT_WAITING_FOR_MODEL_PREFIX}{api_elapsed:.1f}s elapsed)."
                agent._persist_session(messages, conversation_history)
+                interrupted = True
+                final_response = f"{INTERRUPT_WAITING_FOR_MODEL_PREFIX}{api_elapsed:.1f}s elapsed)."
                break

            except Exception as api_error:
@@ -2316,15 +2247,6 @@ def run_conversation(
                    # "unknown variant `image_url`, expected `text`".
                    "unknown variant `image_url`, expected `text`",
                    "unknown variant image_url, expected text",
-                    # OpenRouter routes a request to upstream endpoints and,
-                    # when none of the candidate endpoints for the model accept
-                    # image input, returns HTTP 404 "No endpoints found that
-                    # support image input". Without this phrase the agent never
-                    # strips the images, the retry loop re-sends the same
-                    # rejected request until exhaustion, and the gateway leaves
-                    # every subsequent message queued behind the stuck turn —
-                    # the P1 in issue #21160. The 404 passes the 4xx gate below.
-                    "no endpoints found that support image input",
                )
                _err_lower = _err_body.lower()
                _looks_like_image_rejection = any(
@@ -2896,9 +2818,10 @@ def run_conversation(
                            approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )
-                        conversation_history = conversation_history_after_compression(
-                            agent, messages
-                        )
+                        # Compression created a new session — clear history
+                        # so _flush_messages_to_session_db writes compressed
+                        # messages to the new session, not skipping them.
+                        conversation_history = None
                        if len(messages) < original_len or old_ctx > _reduced_ctx:
                            agent._buffer_status(
                                f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
@@ -2910,25 +2833,15 @@ def run_conversation(
                    # Fall through to normal error handling if compression
                    # is exhausted or didn't help.

-                # Eager fallback for rate-limit errors (429 or quota exhaustion)
-                # and transport errors (connection failure / timeout / provider
-                # overloaded).  Rate limits and billing: switch immediately —
-                # the primary provider won't recover within the retry window.
-                # Transport errors: allow 1 retry first (transient hiccups
-                # recover), then fall back if the provider is truly unreachable.
+                # Eager fallback for rate-limit errors (429 or quota exhaustion).
+                # When a fallback model is configured, switch immediately instead
+                # of burning through retries with exponential backoff -- the
+                # primary provider won't recover within the retry window.
                is_rate_limited = classified.reason in {
                    FailoverReason.rate_limit,
                    FailoverReason.billing,
                }
-                _is_transport_failure = classified.reason in {
-                    FailoverReason.timeout,
-                    FailoverReason.overloaded,
-                }
-                _should_fallback = (
-                    is_rate_limited
-                    or (_is_transport_failure and retry_count >= 2)
-                )
-                if _should_fallback and agent._fallback_index < len(agent._fallback_chain):
+                if is_rate_limited and agent._fallback_index < len(agent._fallback_chain):
                    # Don't eagerly fallback if credential pool rotation may
                    # still recover.  See _pool_may_recover_from_rate_limit
                    # for the single-credential-pool and CloudCode-quota
@@ -2943,10 +2856,6 @@ def run_conversation(
                            agent._buffer_status(
                                "⚠️ Billing or credits exhausted — switching to fallback provider..."
                            )
-                        elif _is_transport_failure:
-                            agent._buffer_status(
-                                "⚠️ Provider unreachable — switching to fallback provider..."
-                            )
                        else:
                            agent._buffer_status("⚠️ Rate limited — switching to fallback provider...")
                        if agent._try_activate_fallback(reason=classified.reason):
@@ -3121,9 +3030,10 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    conversation_history = conversation_history_after_compression(
-                        agent, messages
-                    )
+                    # Compression created a new session — clear history
+                    # so _flush_messages_to_session_db writes compressed
+                    # messages to the new session, not skipping them.
+                    conversation_history = None

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3287,9 +3197,10 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    conversation_history = conversation_history_after_compression(
-                        agent, messages
-                    )
+                    # Compression created a new session — clear history
+                    # so _flush_messages_to_session_db writes compressed
+                    # messages to the new session, not skipping them.
+                    conversation_history = None

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3551,13 +3462,6 @@ def run_conversation(
                    ):
                        _retry.primary_recovery_attempted = True
                        retry_count = 0
-                        # Primary transport recovery starts a fresh attempt
-                        # cycle. Re-open fallback state so a follow-on 429 can
-                        # still activate fallback_providers after stale
-                        # pre-recovery fallback/credential-pool bookkeeping.
-                        _retry.has_retried_429 = False
-                        agent._fallback_index = 0
-                        agent._fallback_activated = False
                        continue
                    # Try fallback before giving up entirely
                    if agent._has_pending_fallback():
@@ -3623,65 +3527,6 @@ def run_conversation(
                            force=True,
                        )

-                    # Detect thinking-timeout pattern: a known reasoning model
-                    # hit a transport-layer error before the first content
-                    # token arrived.  Distinct from _is_stream_drop above
-                    # (which fires for large file-write stream drops) and
-                    # from any classifier reason that's not a transport
-                    # timeout.  Reuses the reasoning-model allowlist from
-                    # agent/reasoning_timeouts.py (Fixes #52217) so the
-                    # trigger is consistent with what the per-model
-                    # stale-timeout floor covers.  After the classifier
-                    # override at agent/error_classifier.py:720-738 (this
-                    # PR), transport disconnects on reasoning models route
-                    # to FailoverReason.timeout rather than
-                    # context_overflow, so this branch actually fires.
-                    # Detection and message text live in
-                    # agent.thinking_timeout_guidance so they're
-                    # unit-testable without driving the full retry loop.
-                    # (Part 2 of Fixes #52310.)
-                    from agent.thinking_timeout_guidance import (
-                        is_thinking_timeout,
-                    )
-                    _is_thinking_timeout = is_thinking_timeout(
-                        classified,
-                        _model,
-                        error_msg,
-                    )
-                    if _is_thinking_timeout:
-                        agent._vprint(
-                            f"{agent.log_prefix}   💡 The model's thinking "
-                            f"phase exceeded the upstream proxy's idle "
-                            f"timeout before the first content token "
-                            f"arrived. This is a known issue with "
-                            f"reasoning models behind cloud gateways "
-                            f"(NVIDIA NIM, OpenAI, Anthropic, DeepSeek).",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      Workarounds in priority order:",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      1. Set "
-                            f"`providers.{_provider}.models.{_model}.stale_timeout_seconds: 900` "
-                            f"in `~/.hermes/config.yaml` to extend the per-call "
-                            f"timeout. (Hermes's built-in floor is 600s for "
-                            f"known reasoning models — if you still see this "
-                            f"after raising, the upstream cap is even shorter.)",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      2. Lower `reasoning_budget` or set "
-                            f"`reasoning_effort: medium` on this model if the provider supports it.",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      3. Use a smaller / faster reasoning "
-                            f"model if the task doesn't require deep thinking.",
-                            force=True,
-                        )
-
                    logger.error(
                        "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s",
                        agent.log_prefix, max_retries, _final_summary,
@@ -3698,22 +3543,7 @@ def run_conversation(
                            _final_response += f"\n\n{_billing_guidance}"
                    else:
                        _final_response = f"API call failed after {max_retries} retries: {_final_summary}"
-                    if _is_thinking_timeout:
-                        # Thinking-timeout guidance overrides the generic
-                        # stream-drop guidance — the latter is wrong for
-                        # this case (it suggests splitting large file
-                        # writes, which isn't what happened).  See the
-                        # reasoning-model override at
-                        # agent/error_classifier.py:720-738 and the
-                        # detection block above for context.
-                        from agent.thinking_timeout_guidance import (
-                            build_thinking_timeout_guidance,
-                        )
-                        _final_response += build_thinking_timeout_guidance(
-                            provider=_provider,
-                            model=_model,
-                        )
-                    elif _is_stream_drop:
+                    if _is_stream_drop:
                        _final_response += (
                            "\n\nThe provider's stream connection keeps "
                            "dropping — this often happens when generating "
@@ -3745,12 +3575,7 @@ def run_conversation(
                        _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                        if _ra_raw:
                            try:
-                                # Cap at 10 minutes. Anthropic Tier 1 input-token
-                                # buckets reset in ~171s, so a 120s cap caused us to
-                                # retry before the actual reset window and re-trip the
-                                # limit. 600s covers all realistic provider reset
-                                # windows while still rejecting pathological values. (#26293)
-                                _retry_after = min(float(_ra_raw), 600)
+                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
                            except (TypeError, ValueError):
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
@@ -3831,17 +3656,6 @@ def run_conversation(
            _retry.restart_with_compressed_messages = False
            continue

-        if _retry.restart_with_rebuilt_messages:
-            # A content-filter stream stall (#32421) was escalated to the
-            # fallback chain and the partial content rolled back.  Re-issue
-            # the API call against the now-active fallback provider.  Refund
-            # the budget/count for the stalled attempt so the fallback gets a
-            # fair turn.
-            api_call_count -= 1
-            agent.iteration_budget.refund()
-            _retry.restart_with_rebuilt_messages = False
-            continue
-
        if _retry.restart_with_length_continuation:
            # Progressively boost the output token budget on each retry.
            # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
@@ -4416,9 +4230,10 @@ def run_conversation(
                        approx_tokens=agent.context_compressor.last_prompt_tokens,
                        task_id=effective_task_id,
                    )
-                    conversation_history = conversation_history_after_compression(
-                        agent, messages
-                    )
+                    # Compression created a new session — clear history so
+                    # _flush_messages_to_session_db writes compressed messages
+                    # to the new session (see preflight compression comment).
+                    conversation_history = None
                
                # Save session log incrementally (so progress is visible even if interrupted)
                agent._session_messages = messages
@@ -4460,11 +4275,7 @@ def run_conversation(
                            "as final response"
                        )
                        final_response = _recovered
-                        # Streaming delivered a fragment, not a confirmed
-                        # final preview. Leave response_previewed false so
-                        # gateway fallback delivery can send the recovered
-                        # text plus the abnormal-turn explanation.
-                        agent._response_was_previewed = False
+                        agent._response_was_previewed = True
                        break

                    # If the previous turn already delivered real content alongside
@@ -4709,20 +4520,14 @@ def run_conversation(
                # status from earlier failed attempts in this turn.
                agent._clear_status_buffer()

-                from agent.agent_runtime_helpers import (
-                    intent_ack_continuation_mode,
-                )
-
-                _ack_mode = intent_ack_continuation_mode(agent)
                if (
-                    _ack_mode != "off"
+                    agent.api_mode == "codex_responses"
                    and agent.valid_tool_names
                    and codex_ack_continuations < 2
                    and agent._looks_like_codex_intermediate_ack(
                        user_message=user_message,
                        assistant_content=final_response,
                        messages=messages,
-                        require_workspace=(_ack_mode == "codex_only"),
                    )
                ):
                    codex_ack_continuations += 1
@@ -4803,11 +4608,7 @@ def run_conversation(
                        "_verification_stop_synthetic": True,
                    })
                    agent._session_messages = messages
-                    # Run the verification-stop loop silently — the nudge is an
-                    # internal turn that should not add noise to the user's
-                    # terminal. Keep a debug breadcrumb in agent.log for tracing.
-                    logger.debug("verification stop-loop nudge issued (attempt %d)",
-                                 agent._verification_stop_nudges)
+                    agent._emit_status("↻ Verification required before finishing")
                    continue

                messages.append(final_msg)
--- a/agent/copilot_acp_client.py
+++ b/agent/copilot_acp_client.py
@@ -21,14 +21,8 @@ from pathlib import Path
 from types import SimpleNamespace
 from typing import Any

-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
 from agent.file_safety import get_read_block_error, is_write_denied
 from agent.redact import redact_sensitive_text
-from tools.environments.local import hermes_subprocess_env

 ACP_MARKER_BASE_URL = "acp://copilot"
 _DEFAULT_TIMEOUT_SECONDS = 900.0
@@ -100,10 +94,7 @@ def _resolve_home_dir() -> str:


 def _build_subprocess_env() -> dict[str, str]:
-    # Copilot ACP is a model-driving CLI executor: it legitimately needs LLM
-    # provider credentials. Route through the central helper so Tier-1 secrets
-    # (gateway bot tokens, GitHub auth, infra) are still stripped (#29157).
-    env = hermes_subprocess_env(inherit_credentials=True)
+    env = os.environ.copy()
    home = _resolve_home_dir()
    env["HOME"] = home
    from hermes_constants import apply_subprocess_home_env
@@ -233,73 +224,11 @@ def _render_message_content(content: Any) -> str:
    return str(content).strip()


-def _build_openai_tool_call(
-    *,
-    call_id: str,
-    name: str,
-    arguments: str,
-) -> ChatCompletionMessageToolCall:
-    """Build an OpenAI-compatible tool-call object for downstream handling."""
-    return ChatCompletionMessageToolCall(
-        id=call_id,
-        call_id=call_id,
-        response_item_id=None,
-        type="function",
-        function=Function(name=name, arguments=arguments),
-    )
-
-
-def _completion_to_stream_chunks(completion: SimpleNamespace) -> list[SimpleNamespace]:
-    """Convert a one-shot ACP response into OpenAI-style stream chunks."""
-    choice = completion.choices[0]
-    message = choice.message
-    tool_call_deltas = None
-    if message.tool_calls:
-        tool_call_deltas = []
-        for index, tool_call in enumerate(message.tool_calls):
-            tool_call_deltas.append(
-                SimpleNamespace(
-                    index=index,
-                    id=getattr(tool_call, "id", None),
-                    type=getattr(tool_call, "type", "function"),
-                    function=SimpleNamespace(
-                        name=getattr(tool_call.function, "name", None),
-                        arguments=getattr(tool_call.function, "arguments", None),
-                    ),
-                )
-            )
-
-    delta = SimpleNamespace(
-        role="assistant",
-        content=message.content or None,
-        tool_calls=tool_call_deltas,
-        reasoning_content=message.reasoning_content,
-        reasoning=message.reasoning,
-    )
-    data_chunk = SimpleNamespace(
-        choices=[
-            SimpleNamespace(
-                index=0,
-                delta=delta,
-                finish_reason=choice.finish_reason,
-            )
-        ],
-        model=completion.model,
-        usage=None,
-    )
-    usage_chunk = SimpleNamespace(
-        choices=[],
-        model=completion.model,
-        usage=completion.usage,
-    )
-    return [data_chunk, usage_chunk]
-
-
-def _extract_tool_calls_from_text(text: str) -> tuple[list[ChatCompletionMessageToolCall], str]:
+def _extract_tool_calls_from_text(text: str) -> tuple[list[SimpleNamespace], str]:
    if not isinstance(text, str) or not text.strip():
        return [], ""

-    extracted: list[ChatCompletionMessageToolCall] = []
+    extracted: list[SimpleNamespace] = []
    consumed_spans: list[tuple[int, int]] = []

    def _try_add_tool_call(raw_json: str) -> None:
@@ -323,10 +252,12 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[ChatCompletionMessage
            call_id = f"acp_call_{len(extracted)+1}"

        extracted.append(
-            _build_openai_tool_call(
+            SimpleNamespace(
+                id=call_id,
                call_id=call_id,
-                name=fn_name.strip(),
-                arguments=fn_args,
+                response_item_id=None,
+                type="function",
+                function=SimpleNamespace(name=fn_name.strip(), arguments=fn_args),
            )
        )

@@ -445,7 +376,6 @@ class CopilotACPClient:
        timeout: float | None = None,
        tools: list[dict[str, Any]] | None = None,
        tool_choice: Any = None,
-        stream: bool = False,
        **_: Any,
    ) -> Any:
        prompt_text = _format_messages_as_prompt(
@@ -492,14 +422,11 @@ class CopilotACPClient:
        )
        finish_reason = "tool_calls" if tool_calls else "stop"
        choice = SimpleNamespace(message=assistant_message, finish_reason=finish_reason)
-        completion = SimpleNamespace(
+        return SimpleNamespace(
            choices=[choice],
            usage=usage,
            model=model or "copilot-acp",
        )
-        if stream:
-            return _completion_to_stream_chunks(completion)
-        return completion

    def _run_prompt(self, prompt_text: str, *, timeout_seconds: float) -> tuple[str, str]:
        try:
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -11,7 +11,6 @@ import uuid
 import re
 from dataclasses import dataclass, fields, replace
 from datetime import datetime, timezone
-from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple

 from hermes_constants import OPENROUTER_BASE_URL
@@ -448,63 +447,6 @@ def get_pool_strategy(provider: str) -> str:
 DEFAULT_MAX_CONCURRENT_PER_CREDENTIAL = 1


-def _write_through_provider_state_to_global_root(
-    provider_id: str, state: Dict[str, Any]
-) -> None:
-    """Persist a rotated OAuth ``state`` into the global-root auth.json.
-
-    Best-effort write-through for the multi-profile rotation hazard
-    (#48415 / #43589): nous, openai-codex, and xai-oauth rotate the
-    refresh_token on refresh, so when a profile pool refresh rotates a grant
-    it resolved from the root fallback, the rotated chain must land back in
-    root. Otherwise root keeps a now-revoked refresh token and every other
-    profile reading the stale root grant dies with ``refresh_token_reused`` /
-    ``invalid_grant`` once its access token expires.
-
-    Only updates ``providers.<provider_id>`` in the root store; never touches
-    the profile store (the caller already saved that). Swallows all errors — a
-    failed write-through degrades to the pre-existing behavior (root stale), it
-    must never break the profile's own successful save. Mirrors
-    ``hermes_cli.auth._write_through_xai_oauth_to_global_root`` (which covers
-    the non-pool xAI refresh path) for the credential-pool refresh path.
-    """
-    try:
-        global_path = auth_mod._global_auth_file_path()
-    except Exception:
-        return
-    if global_path is None:
-        # Classic mode (profile == root); the profile save already hit root.
-        return
-    # Seat belt: under pytest, refuse to write the real user's
-    # ~/.hermes/auth.json even when HERMES_HOME points at a profile path
-    # (mirrors the read-side guard in _load_global_auth_store). Uses the
-    # unmodified HOME env, not Path.home() which fixtures may monkeypatch.
-    if os.environ.get("PYTEST_CURRENT_TEST"):
-        real_home_env = os.environ.get("HOME", "")
-        if real_home_env:
-            real_root = Path(real_home_env) / ".hermes" / "auth.json"
-            try:
-                if global_path.resolve(strict=False) == real_root.resolve(strict=False):
-                    return
-            except Exception:
-                return
-    try:
-        if global_path.exists():
-            global_store = _load_auth_store(global_path)
-        else:
-            global_store = {}
-        if not isinstance(global_store, dict):
-            return
-        _store_provider_state(global_store, provider_id, dict(state), set_active=False)
-        auth_mod._save_auth_store(global_store, global_path)
-    except Exception as exc:  # pragma: no cover - best effort
-        logger.debug(
-            "%s pool refresh: write-through to global root failed: %s",
-            provider_id,
-            exc,
-        )
-
-
 class CredentialPool:
    def __init__(self, provider: str, entries: List[PooledCredential]):
        self.provider = provider
@@ -537,11 +479,10 @@ class CredentialPool:
                self._entries[idx] = new
                return

-    def _persist(self, *, removed_ids: Optional[List[str]] = None) -> None:
+    def _persist(self) -> None:
        write_credential_pool(
            self.provider,
            [entry.to_dict() for entry in self._entries],
-            removed_ids=removed_ids,
        )

    def _is_terminal_auth_failure(
@@ -859,28 +800,6 @@ class CredentialPool:
        try:
            with _auth_store_lock():
                auth_store = _load_auth_store()
-                # Decide BEFORE writing whether this profile is reading the
-                # grant from the global root (no own providers.<id> block) vs.
-                # genuinely shadowing it. A pool refresh rotates single-use
-                # OAuth refresh tokens, so a profile that resolved the grant
-                # from root MUST write the rotated chain back to root too —
-                # otherwise root keeps a revoked refresh token and every other
-                # profile reading the stale root grant dies with
-                # refresh_token_reused / invalid_grant once its access token
-                # expires. This mirrors the xAI write-through in
-                # hermes_cli.auth._save_xai_oauth_tokens (#43589); the pool
-                # refresh path is the Codex/xAI analog reported in #48415.
-                _wt_provider_id = {
-                    "nous": "nous",
-                    "openai-codex": "openai-codex",
-                    "xai-oauth": "xai-oauth",
-                }.get(self.provider)
-                write_through_to_root = bool(_wt_provider_id) and not (
-                    isinstance(auth_store.get("providers"), dict)
-                    and isinstance(
-                        auth_store["providers"].get(_wt_provider_id), dict
-                    )
-                )
                if self.provider == "nous":
                    state = _load_provider_state(auth_store, "nous")
                    if state is None:
@@ -936,10 +855,6 @@ class CredentialPool:
                    return

                _save_auth_store(auth_store)
-                if write_through_to_root and _wt_provider_id:
-                    _write_through_provider_state_to_global_root(
-                        _wt_provider_id, state
-                    )
        except Exception as exc:
            logger.debug("Failed to sync %s pool entry back to auth store: %s", self.provider, exc)

@@ -1125,17 +1040,13 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal xAI OAuth state: %s", clear_exc
                        )
-                    removed_ids = [
-                        item.id for item in self._entries
-                        if item.source == "loopback_pkce"
-                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "loopback_pkce"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist(removed_ids=removed_ids)
+                    self._persist()
                    return None
            # For openai-codex: same race as xAI/nous — another Hermes process
            # may have consumed the refresh token between our proactive sync
@@ -1195,17 +1106,13 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal Codex OAuth state: %s", clear_exc
                        )
-                    removed_ids = [
-                        item.id for item in self._entries
-                        if item.source == "device_code"
-                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "device_code"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist(removed_ids=removed_ids)
+                    self._persist()
                    return None
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
@@ -1262,17 +1169,13 @@ class CredentialPool:
                        auth_mod.NOUS_DEVICE_CODE_SOURCE,
                        f"manual:{auth_mod.NOUS_DEVICE_CODE_SOURCE}",
                    }
-                    removed_ids = [
-                        item.id for item in self._entries
-                        if item.source in singleton_sources
-                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source not in singleton_sources
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist(removed_ids=removed_ids)
+                    self._persist()
                    return None
            self._mark_exhausted(entry, None)
            return None
@@ -1434,7 +1337,7 @@ class CredentialPool:
            pruned_ids = set(entries_to_prune)
            self._entries = [e for e in self._entries if e.id not in pruned_ids]
        if cleared_any:
-            self._persist(removed_ids=entries_to_prune)
+            self._persist()
        return available

    def _select_unlocked(self) -> Optional[PooledCredential]:
@@ -1608,11 +1511,7 @@ class CredentialPool:
            replace(entry, priority=new_priority)
            for new_priority, entry in enumerate(self._entries)
        ]
-        write_credential_pool(
-            self.provider,
-            [entry.to_dict() for entry in self._entries],
-            removed_ids=[removed.id],
-        )
+        self._persist()
        if self._current_id == removed.id:
            self._current_id = None
        return removed
@@ -2274,11 +2173,6 @@ def _seed_custom_pool(pool_key: str, entries: List[PooledCredential]) -> Tuple[b
 def load_pool(provider: str) -> CredentialPool:
    provider = (provider or "").strip().lower()
    raw_entries = read_credential_pool(provider)
-    disk_ids = {
-        entry.get("id")
-        for entry in raw_entries
-        if isinstance(entry, dict) and entry.get("id")
-    }
    raw_needs_sanitization = any(
        isinstance(payload, dict)
        and sanitize_borrowed_credential_payload(payload, provider) != payload
@@ -2307,10 +2201,8 @@ def load_pool(provider: str) -> CredentialPool:
        changed |= _normalize_pool_priorities(provider, entries)

    if changed:
-        new_ids = {entry.id for entry in entries}
        write_credential_pool(
            provider,
            [entry.to_dict() for entry in sorted(entries, key=lambda item: item.priority)],
-            removed_ids=disk_ids - new_ids,
        )
    return CredentialPool(provider, entries)
--- a/agent/curator.py
+++ b/agent/curator.py
@@ -273,21 +273,6 @@ def should_run_now(now: Optional[datetime] = None) -> bool:
 # Automatic state transitions (pure function, no LLM)
 # ---------------------------------------------------------------------------

-def _cron_referenced_skills() -> Set[str]:
-    """Skill names referenced by any cron job (incl. paused/disabled).
-
-    Best-effort: a cron-module import error or corrupt jobs store must never
-    break the curator, so any failure yields an empty set (no protection,
-    but no crash).
-    """
-    try:
-        from cron.jobs import referenced_skill_names as _refs
-        return _refs()
-    except Exception as e:
-        logger.debug("Curator could not read cron skill references: %s", e, exc_info=True)
-        return set()
-
-
 def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int]:
    """Walk every curator-managed skill and move active/stale/archived based on
    the latest real activity timestamp. Pinned skills are never touched.
@@ -307,8 +292,6 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
    stale_cutoff = now - timedelta(days=get_stale_after_days())
    archive_cutoff = now - timedelta(days=get_archive_after_days())

-    cron_referenced = _cron_referenced_skills()
-
    counts = {"marked_stale": 0, "archived": 0, "reactivated": 0, "checked": 0, "seeded": 0}

    for row in _u.agent_created_report():
@@ -317,15 +300,6 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
        if row.get("pinned"):
            continue

-        # A skill referenced by any cron job (incl. paused/disabled) is in
-        # use by definition — resuming or the next fire must find it. The
-        # scheduler only bumps usage when a job actually fires, so jobs that
-        # fire less often than archive_after_days, paused jobs, and far-future
-        # one-shots would otherwise have their skills aged out from under
-        # them. Treat referenced skills like pinned: never auto-transition.
-        if name in cron_referenced:
-            continue
-
        # First sight of a curation-eligible skill with no persisted record
        # (e.g. a newly-eligible built-in): anchor its clock to now and defer.
        if not row.get("_persisted", True):
@@ -342,18 +316,6 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int

        current = row.get("state", _u.STATE_ACTIVE)

-        # Never-used skills (use_count == 0) get a grace floor: don't archive
-        # one until it is at least stale_after_days old. A use=0 skill is
-        # absence of evidence, not evidence of staleness — a skill created
-        # recently may simply not have had its trigger come up yet.
-        never_used = int(row.get("use_count", 0) or 0) == 0
-        if never_used and anchor > stale_cutoff:
-            # Younger than the stale window — leave it alone entirely.
-            if current == _u.STATE_STALE:
-                _u.set_state(name, _u.STATE_ACTIVE)
-                counts["reactivated"] += 1
-            continue
-
        if anchor <= archive_cutoff and current != _u.STATE_ARCHIVED:
            ok, _msg = _u.archive_skill(name)
            if ok:
@@ -415,10 +377,8 @@ CURATOR_REVIEW_PROMPT = (
    "bodies + `references/`, `templates/`, and `scripts/` subfiles for "
    "session-specific detail — not one-session-one-skill micro-entries.\n\n"
    "Hard rules — do not violate:\n"
-    "1. DO NOT touch bundled, hub-installed, or external-dir skills "
-    "(`skills.external_dirs`). The candidate list below is already filtered "
-    "to local curator-managed skills only; external skills are externally "
-    "owned and read-only to this background curator.\n"
+    "1. DO NOT touch bundled or hub-installed skills. The candidate list "
+    "below is already filtered to agent-created skills only.\n"
    "2. DO NOT delete any skill. Archiving (moving the skill's directory "
    "into ~/.hermes/skills/.archive/) is the maximum destructive action. "
    "Archives are recoverable; deletion is not.\n"
@@ -428,19 +388,10 @@ CURATOR_REVIEW_PROMPT = (
    "back load-bearing UX (slash-command entry points referenced in docs and "
    "tips) and are filtered out of the candidate list below — never resurrect "
    "one as an archive or absorb target.\n"
-    "3c. DO NOT archive or prune any skill marked `cron=yes` in the candidate "
-    "list. A cron job depends on it and will fail to load it on its next "
-    "run. You MAY still consolidate it into an umbrella — but only because "
-    "the curator rewrites cron job skill references to follow consolidations; "
-    "never simply prune it.\n"
    "4. DO NOT use usage counters as a reason to skip consolidation. The "
    "counters are new and often mostly zero. Judge overlap on CONTENT, "
    "not on use_count. 'use=0' is not evidence a skill is valuable; it's "
-    "absence of evidence either way. Corollary: 'use=0' is ALSO not a "
-    "reason to PRUNE a skill. Never archive a never-used skill (use=0) "
-    "unless it is at least 30 days old (check last_activity / created date) "
-    "AND its content is genuinely obsolete or fully absorbed elsewhere — a "
-    "recently-created skill simply may not have had its trigger come up yet.\n"
+    "absence of evidence either way.\n"
    "5. DO NOT reject consolidation on the grounds that 'each skill has "
    "a distinct trigger'. Pairwise distinctness is the wrong bar. The "
    "right bar is: 'would a human maintainer write this as N separate "
@@ -518,9 +469,8 @@ CURATOR_REVIEW_PROMPT = (
    "skill, or `absorbed_into=\"\"` when you're truly pruning with no "
    "forwarding target. This drives cron-job skill-reference migration — "
    "guessing from your YAML summary after the fact is fragile.\n"
-    "  - terminal                       — move LOCAL candidate content into "
-    "a support subfile when package integrity requires it; never mv, cp, rm, "
-    "patch, or rewrite bundled, hub-installed, or external-dir skills\n\n"
+    "  - terminal                       — mv a sibling into the archive "
+    "OR move its content into a support subfile\n\n"
    "'keep' is a legitimate decision ONLY when the skill is already a "
    "class-level umbrella and none of the proposed merges would improve "
    "discoverability. 'This is narrow but distinct from its siblings' "
@@ -1460,14 +1410,12 @@ def _render_candidate_list() -> str:
    rows = skill_usage.agent_created_report()
    if not rows:
        return "No agent-created skills to review."
-    cron_referenced = _cron_referenced_skills()
    lines = [f"Agent-created skills ({len(rows)}):\n"]
    for r in rows:
        lines.append(
            f"- {r['name']}  "
            f"state={r['state']}  "
            f"pinned={'yes' if r.get('pinned') else 'no'}  "
-            f"cron={'yes' if r['name'] in cron_referenced else 'no'}  "
            f"activity={r.get('activity_count', 0)}  "
            f"use={r.get('use_count', 0)}  "
            f"view={r.get('view_count', 0)}  "
@@ -1895,14 +1843,6 @@ def _run_llm_review(prompt: str) -> Dict[str, Any]:
        # Disable recursive nudges — the curator must never spawn its own review.
        review_agent._memory_nudge_interval = 0
        review_agent._skill_nudge_interval = 0
-        # Tag this fork as autonomous background curation so skill_manage's
-        # background-review write guard fires. Without this the fork inherits
-        # the default "assistant_tool" origin, is_background_review() is False,
-        # and the external/bundled/hub-installed skill_manage guards never
-        # trigger during the curation pass they exist to protect against.
-        # turn_context.py binds this onto the write-origin ContextVar at turn
-        # start (see agent/turn_context.py).
-        review_agent._memory_write_origin = "background_review"

        # Redirect the forked agent's stdout/stderr to /dev/null while it
        # runs so its tool-call chatter doesn't pollute the foreground
--- a/agent/display.py
+++ b/agent/display.py
@@ -16,7 +16,6 @@ from pathlib import Path
 from typing import Any

 from utils import safe_json_loads
-from agent.redact import redact_sensitive_text
 from agent.tool_result_classification import file_mutation_result_landed

 # ANSI escape codes for coloring tool failure indicators
@@ -340,62 +339,6 @@ def _read_file_line_label(args: dict) -> str:
    return f"L{offset}-{offset + limit - 1}"


-def redact_browser_typed_text_for_display(value: Any, typed_text: Any) -> Any:
-    """Apply secret redaction to browser_type text in display-facing payloads.
-
-    Backends sometimes echo the attempted input in error strings or fallback
-    metadata.  When the raw typed value contains a recognizable secret (API
-    key, token, JWT, etc.) the redacted form differs from the raw value, so we
-    replace every occurrence of the raw value with its redacted form before a
-    browser_type result reaches logs, callbacks, the model, or chat history.
-
-    Normal typed text (search queries, addresses, form fields) matches no
-    secret pattern, so it passes through unchanged and stays readable.
-
-    Redaction is forced here regardless of the global ``security.redact_secrets``
-    preference: a typed credential leaking into chat history is a security
-    boundary, not mere log hygiene.
-    """
-    if typed_text is None:
-        return value
-    needle = str(typed_text)
-    if needle == "":
-        return value
-    redacted = redact_sensitive_text(needle, force=True)
-    if redacted == needle:
-        # Nothing secret-looking in the typed text; leave payload untouched.
-        return value
-    if isinstance(value, str):
-        return value.replace(needle, redacted)
-    if isinstance(value, dict):
-        return {
-            key: redact_browser_typed_text_for_display(item, typed_text)
-            for key, item in value.items()
-        }
-    if isinstance(value, list):
-        return [redact_browser_typed_text_for_display(item, typed_text) for item in value]
-    if isinstance(value, tuple):
-        return tuple(redact_browser_typed_text_for_display(item, typed_text) for item in value)
-    return value
-
-
-def redact_tool_args_for_display(tool_name: str, args: dict | None) -> dict | None:
-    """Return a copy of tool args safe for logs/progress UI.
-
-    For ``browser_type`` the ``text`` argument is run through the same
-    secret-pattern redactor used for logs.  Recognizable credentials (API
-    keys, tokens) are masked before the value reaches tool progress
-    notifications; normal typed text is left intact for debuggability.
-    """
-    if not isinstance(args, dict):
-        return args
-    if tool_name == "browser_type" and isinstance(args.get("text"), str):
-        safe_args = dict(args)
-        safe_args["text"] = redact_sensitive_text(args["text"], force=True)
-        return safe_args
-    return args
-
-
 def _delegate_task_goal_parts(tasks: Any, *, per_goal_len: int) -> tuple[int, list[str]]:
    if not isinstance(tasks, list):
        return 0, []
@@ -419,7 +362,6 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
        max_len = _tool_preview_max_len
    if not args:
        return None
-    args = redact_tool_args_for_display(tool_name, args) or args
    primary_args = {
        "terminal": "command", "web_search": "query", "web_extract": "urls",
        "read_file": "path", "write_file": "path", "patch": "path",
@@ -1143,7 +1085,6 @@ def get_cute_tool_message(
    When *result* is provided the line is checked for failure indicators.
    Failed tool calls get a red prefix and an informational suffix.
    """
-    args = redact_tool_args_for_display(tool_name, args) or args
    dur = f"{duration:.1f}s"
    is_failure, failure_suffix = _detect_tool_failure(tool_name, result)
    skin_prefix = get_skin_tool_prefix()
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -133,31 +133,6 @@ _RATE_LIMIT_PATTERNS = [
    "servicequotaexceededexception",
 ]

-# Patterns that indicate provider-side overload, NOT a per-credential rate
-# limit or billing problem.  The credential is valid — the server is just
-# busy — so the correct recovery is "back off and retry the same key", never
-# "rotate the credential" (rotating exhausts the pool while the endpoint is
-# still busy; a single-key user has nothing to rotate to).  Some providers
-# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
-# status path matches the body against this list before falling through to
-# the rate_limit default.  Phrases are kept narrow and overload-flavoured so a
-# normal rate-limit message ("you have been rate-limited") doesn't hit this
-# bucket. (#14038, #15297)
-_OVERLOADED_PATTERNS = [
-    "overloaded",
-    "temporarily overloaded",
-    "service is temporarily overloaded",
-    "service may be temporarily overloaded",
-    "server is overloaded",
-    "server overloaded",
-    "service overloaded",
-    "service is overloaded",
-    "upstream overloaded",
-    "currently overloaded",
-    "at capacity",
-    "over capacity",
-]
-
 # Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
 _USAGE_LIMIT_PATTERNS = [
    "usage limit",
@@ -355,14 +330,6 @@ _CONTENT_POLICY_BLOCKED_PATTERNS = [
    # echo back; the underscore form is provider-specific enough.
    "content_filter",
    "responsibleaipolicyviolation",
-    # MiniMax output-layer safety filter. The error string is surfaced
-    # verbatim by MiniMax SDK / OpenAI-compatible endpoints, usually in the
-    # form "output new_sensitive (1027)" when the model's *output* (often a
-    # large tool-call argument block) trips the upstream safety filter and
-    # the SSE stream is truncated mid-flight. ``new_sensitive`` is the
-    # filter name and is narrow enough that billing / format / auth error
-    # strings will not collide. See #32421.
-    "new_sensitive",
 ]

 # Auth patterns (non-status-code signals)
@@ -750,26 +717,6 @@ def classify_api_error(

    is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
    if is_disconnect and not status_code:
-        # Reasoning-model override: a transport disconnect on a reasoning
-        # model is much more likely the upstream proxy idle-killing a
-        # long thinking stream than a true context overflow — even on
-        # large sessions.  The default disconnect+large-session routing
-        # below would otherwise send the user into the compression
-        # branch (should_compress=True) and silently delete
-        # conversation history on a phantom context-length error.
-        # Reasoning models have multi-minute thinking phases that
-        # routinely exceed the cloud gateway's idle window (NVIDIA
-        # NIM ~120s — first-party repro at NVIDIA/NemoClaw#4846;
-        # OpenAI worker / Anthropic stream-idle similar).  The
-        # per-reasoning-model stale-timeout floor in
-        # agent/reasoning_timeouts.py raises the stale-detector
-        # threshold to tolerate long thinking, so a true
-        # transport-layer failure here is recoverable via the retry
-        # path — not via context compression.  Reclassify as timeout.
-        # (Part 1 of Fixes #52310.)
-        from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
-        if get_reasoning_stale_timeout_floor(model) is not None:
-            return _result(FailoverReason.timeout, retryable=True)
        # Absolute token/message-count thresholds are only a proxy for smaller
        # context windows.  Large-context sessions can have hundreds of
        # messages while still being far below their actual token budget.
@@ -896,19 +843,7 @@ def _classify_by_status(
        )

    if status_code == 429:
-        # Already checked long_context_tier above. Some providers (notably
-        # Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
-        # code as a true per-credential rate limit, but the credential is
-        # valid and the correct recovery is "back off and retry the same key",
-        # NOT "rotate the credential" (which exhausts the pool while the
-        # endpoint is still busy, and does nothing for a single-key user).
-        # Disambiguate on the error body so an overload 429 takes the
-        # transient-overload path instead of burning the pool. (#14038)
-        if any(p in error_msg for p in _OVERLOADED_PATTERNS):
-            return result_fn(
-                FailoverReason.overloaded,
-                retryable=True,
-            )
+        # Already checked long_context_tier above; this is a normal rate limit
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
@@ -1259,17 +1194,6 @@ def _classify_by_message(
            should_fallback=True,
        )

-    # Overloaded / server-busy patterns — must come BEFORE the rate_limit and
-    # billing checks so that a message-only "overloaded" (no 503/529 status,
-    # e.g. some Anthropic-compatible proxies) classifies as a transient
-    # overload (backoff + retry) instead of falling through to `unknown` or
-    # incorrectly triggering credential rotation.
-    if any(p in error_msg for p in _OVERLOADED_PATTERNS):
-        return result_fn(
-            FailoverReason.overloaded,
-            retryable=True,
-        )
-
    # Billing patterns
    if any(p in error_msg for p in _BILLING_PATTERNS):
        return result_fn(
@@ -1359,25 +1283,19 @@ def _extract_status_code(error: Exception) -> Optional[int]:


 def _extract_error_body(error: Exception) -> dict:
-    """Extract the structured error body from an SDK exception or its cause chain."""
-    current = error
-    for _ in range(5):  # Match _extract_status_code() traversal depth.
-        body = getattr(current, "body", None)
-        if isinstance(body, dict):
-            return body
-        # Some errors have .response.json()
-        response = getattr(current, "response", None)
-        if response is not None:
-            try:
-                json_body = response.json()
-                if isinstance(json_body, dict):
-                    return json_body
-            except Exception:
-                pass
-        cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
-        if cause is None or cause is current:
-            break
-        current = cause
+    """Extract the structured error body from an SDK exception."""
+    body = getattr(error, "body", None)
+    if isinstance(body, dict):
+        return body
+    # Some errors have .response.json()
+    response = getattr(error, "response", None)
+    if response is not None:
+        try:
+            json_body = response.json()
+            if isinstance(json_body, dict):
+                return json_body
+        except Exception:
+            pass
    return {}


--- a/agent/file_safety.py
+++ b/agent/file_safety.py
@@ -77,22 +77,15 @@ def build_write_denied_prefixes(home: str) -> list[str]:
    ]


-def get_safe_write_roots() -> set[str]:
-    """Return resolved HERMES_WRITE_SAFE_ROOT paths. Supports multiple directories
-    separated by ``os.pathsep`` (``:`` on Unix, ``;`` on Windows).
-    E.g., ``/opt/data:/var/www/html`` on Unix, ``C:\\data;D:\\www`` on Windows."""
-    env = os.getenv("HERMES_WRITE_SAFE_ROOT", "")
-    if not env:
-        return set()
-    roots: set[str] = set()
-    for path in env.split(os.pathsep):
-        if path:
-            try:
-                resolved = os.path.realpath(os.path.expanduser(path))
-                roots.add(resolved)
-            except (OSError, ValueError):
-                continue
-    return roots
+def get_safe_write_root() -> Optional[str]:
+    """Return the resolved HERMES_WRITE_SAFE_ROOT path, or None if unset or unresolvable."""
+    root = os.getenv("HERMES_WRITE_SAFE_ROOT", "")
+    if not root:
+        return None
+    try:
+        return os.path.realpath(os.path.expanduser(root))
+    except Exception:
+        return None


 def is_write_denied(path: str) -> bool:
@@ -131,15 +124,9 @@ def is_write_denied(path: str) -> bool:
        except Exception:
            pass

-    safe_roots = get_safe_write_roots()
-    if safe_roots:
-        allowed = False
-        for safe_root in safe_roots:
-            if resolved == safe_root or resolved.startswith(safe_root + os.sep):
-                allowed = True
-                break
-        if not allowed:
-            return True
+    safe_root = get_safe_write_root()
+    if safe_root and not (resolved == safe_root or resolved.startswith(safe_root + os.sep)):
+        return True

    return False

--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -251,78 +251,6 @@ def _supports_vision_override(
    return None


-def _resolve_inference_base_url(
-    cfg: Optional[Dict[str, Any]],
-    provider: str,
-) -> str:
-    """Best-effort base URL for the active inference provider."""
-    try:
-        from agent.auxiliary_client import _RUNTIME_MAIN_BASE_URL
-
-        runtime = str(_RUNTIME_MAIN_BASE_URL or "").strip()
-        if runtime:
-            return runtime
-    except Exception:
-        pass
-
-    if not isinstance(cfg, dict):
-        return ""
-
-    model_cfg_raw = cfg.get("model")
-    model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
-    base_url = str(model_cfg.get("base_url") or "").strip()
-    if base_url:
-        return base_url
-
-    config_provider = str(model_cfg.get("provider") or "").strip()
-    candidate_names: set[str] = set()
-    for p in filter(None, (provider, config_provider)):
-        candidate_names.add(p)
-        if p.lower().startswith("custom:"):
-            candidate_names.add(p.split(":", 1)[1])
-        else:
-            candidate_names.add(f"custom:{p}")
-
-    providers_cfg = cfg.get("providers")
-    if isinstance(providers_cfg, dict):
-        for name in candidate_names:
-            entry = providers_cfg.get(name)
-            if isinstance(entry, dict):
-                bu = str(entry.get("base_url") or "").strip()
-                if bu:
-                    return bu
-
-    custom_providers = cfg.get("custom_providers")
-    if isinstance(custom_providers, list):
-        lowered = {n.lower() for n in candidate_names}
-        for entry_raw in custom_providers:
-            if not isinstance(entry_raw, dict):
-                continue
-            entry_name = str(entry_raw.get("name") or "").strip()
-            if entry_name not in candidate_names and entry_name.lower() not in lowered:
-                continue
-            bu = str(entry_raw.get("base_url") or "").strip()
-            if bu:
-                return bu
-
-    return ""
-
-
-def _should_probe_ollama_vision(provider: str, base_url: str) -> bool:
-    """True when the active provider likely fronts a local Ollama server."""
-    p = (provider or "").strip().lower()
-    if p == "ollama":
-        return True
-    if not base_url:
-        return False
-    try:
-        from agent.model_metadata import detect_local_server_type
-
-        return detect_local_server_type(base_url) == "ollama"
-    except Exception:
-        return False
-
-
 def _coerce_mode(raw: Any) -> str:
    """Normalize a config value into one of the valid modes."""
    if not isinstance(raw, str):
@@ -374,33 +302,15 @@ def _lookup_supports_vision(
        return override
    if not provider or not model:
        return None
-    caps = None
    try:
        from agent.models_dev import get_model_capabilities
        caps = get_model_capabilities(provider, model)
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc)
-    if caps is not None:
-        return bool(caps.supports_vision)
-
-    base_url = _resolve_inference_base_url(cfg, provider)
-    if not base_url and (provider or "").strip().lower() == "ollama":
-        base_url = "http://localhost:11434/v1"
-    if _should_probe_ollama_vision(provider, base_url):
-        try:
-            from agent.model_metadata import query_ollama_supports_vision
-
-            ollama_vision = query_ollama_supports_vision(model, base_url)
-            if ollama_vision is not None:
-                return ollama_vision
-        except Exception as exc:  # pragma: no cover - defensive
-            logger.debug(
-                "image_routing: ollama vision probe failed for %s:%s — %s",
-                provider,
-                model,
-                exc,
-            )
-    return None
+        return None
+    if caps is None:
+        return None
+    return bool(caps.supports_vision)


 def decide_image_input_mode(
@@ -478,98 +388,14 @@ def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]:
    # BMP: "BM"
    if raw.startswith(b"BM"):
        return "image/bmp"
-    # ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12
-    if len(raw) >= 12 and raw[4:8] == b"ftyp":
-        brand = raw[8:12]
-        if brand in {b"avif", b"avis"}:
-            return "image/avif"
-        if brand in {
-            b"heic", b"heix", b"hevc", b"hevx",
-            b"mif1", b"msf1", b"heim", b"heis",
-        }:
-            return "image/heic"
-    # TIFF: II*\0 (little-endian) or MM\0* (big-endian)
-    if raw[:4] in {b"II*\x00", b"MM\x00*"}:
-        return "image/tiff"
-    # ICO: 00 00 01 00 (reserved=0, type=1=icon)
-    if raw[:4] == b"\x00\x00\x01\x00":
-        return "image/x-icon"
-    # SVG: text-based, look for an <svg tag near the start (skip BOM/whitespace)
-    head = raw[:512].lstrip().lower()
-    if head.startswith(b"<?xml") or head.startswith(b"<svg"):
-        if b"<svg" in head:
-            return "image/svg+xml"
+    # HEIC/HEIF: ftypheic / ftypheix / ftypmif1 / ftypmsf1 etc.
+    if len(raw) >= 12 and raw[4:8] == b"ftyp" and raw[8:12] in {
+        b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1", b"heim", b"heis",
+    }:
+        return "image/heic"
    return None


-# Formats every major vision provider (Anthropic, OpenAI, Gemini, Bedrock)
-# accepts natively. Anything outside this set has to be transcoded to PNG
-# before we declare media_type, otherwise the provider returns HTTP 400
-# ("Could not process image" / "Unsupported image media type") and the
-# whole turn fails with no salvage path.
-#
-# Discord (and a few other chat platforms) freely accept attachments in
-# formats outside this set -- AVIF screenshots from Chromium, HEIC from
-# iPhones, TIFF from scanners, BMP from old Windows tools, ICO -- so users
-# do hit this in practice. SVG is vector and Pillow cannot rasterize it;
-# it is skipped (logged) rather than transcoded.
-_UNIVERSALLY_SUPPORTED_MIMES = frozenset({
-    "image/png", "image/jpeg", "image/gif", "image/webp",
-})
-
-
-def _transcode_to_png(raw: bytes) -> Optional[bytes]:
-    """Decode arbitrary image bytes with Pillow and re-encode as PNG.
-
-    Returns None if Pillow isn't installed or can't decode the input
-    (rare formats, corrupted bytes, missing optional decoder plugin for
-    HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping
-    the image so the rest of the turn still works.
-
-    HEIC/HEIF and AVIF need optional Pillow plugins; we try to register
-    them on demand and swallow ImportError so a missing plugin just
-    looks like 'Pillow can't decode this' rather than crashing.
-    """
-    try:
-        from PIL import Image
-    except ImportError:
-        logger.info(
-            "image_routing: Pillow not installed; cannot transcode "
-            "non-standard image format to PNG. Install with `pip install Pillow` "
-            "(and `pillow-heif` / `pillow-avif-plugin` for those formats)."
-        )
-        return None
-    # Optional plugin registration. Silent on failure: an unsupported
-    # format will just fall through to Image.open raising below.
-    try:
-        import pillow_heif  # type: ignore
-
-        pillow_heif.register_heif_opener()
-    except Exception:
-        pass
-    try:
-        import pillow_avif  # type: ignore  # noqa: F401  -- registers AVIF on import
-    except Exception:
-        pass
-    try:
-        from io import BytesIO
-
-        with Image.open(BytesIO(raw)) as im:
-            # Pick an output mode PNG can serialise. Anything other than
-            # the standard set gets normalised to RGBA so transparency is
-            # preserved where the source had it.
-            if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}:
-                im = im.convert("RGBA")
-            buf = BytesIO()
-            im.save(buf, format="PNG", optimize=False)
-            return buf.getvalue()
-    except Exception as exc:
-        logger.info(
-            "image_routing: Pillow could not transcode image to PNG -- %s", exc
-        )
-        return None
-
-
 def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str:
    """Return image MIME type for *path*.

@@ -605,18 +431,8 @@ def _file_to_data_url(path: Path) -> Optional[str]:
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

-    Format compatibility IS handled here: if the sniffed MIME isn't one
-    of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF,
-    HEIC, BMP, TIFF, or ICO that some providers reject outright), we
-    transcode to PNG with Pillow before declaring media_type. This fixes
-    the user-visible "Could not process image" HTTP 400 from Anthropic on
-    Discord-attached AVIF/HEIC/BMP files.
-
-    Returns None if the file can't be read OR if the format isn't
-    universally supported AND Pillow can't transcode it (Pillow missing,
-    HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The
-    caller reports those paths in ``skipped`` and the rest of the turn
-    proceeds.
+    Returns None only if the file can't be read (missing, permission
+    denied, etc.); the caller reports those paths in ``skipped``.
    """
    try:
        raw = path.read_bytes()
@@ -624,22 +440,6 @@ def _file_to_data_url(path: Path) -> Optional[str]:
        logger.warning("image_routing: failed to read %s — %s", path, exc)
        return None
    mime = _guess_mime(path, raw=raw)
-    if mime not in _UNIVERSALLY_SUPPORTED_MIMES:
-        transcoded = _transcode_to_png(raw)
-        if transcoded is None:
-            logger.warning(
-                "image_routing: %s is %s which is not accepted by all major "
-                "vision providers and could not be transcoded to PNG; "
-                "skipping this attachment.",
-                path, mime,
-            )
-            return None
-        logger.info(
-            "image_routing: transcoded %s (%s) -> image/png for provider compatibility",
-            path.name, mime,
-        )
-        raw = transcoded
-        mime = "image/png"
    b64 = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{b64}"

--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -8,7 +8,6 @@ iteration.

 from __future__ import annotations

-import hashlib
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any
@@ -26,112 +25,20 @@ logger = logging.getLogger(__name__)
 # opening dozens of sockets at once.
 _MAX_REFERENCE_WORKERS = 8

-# Per-tool-result character budget for the advisory reference view. Tool
-# results can be huge (a full diff, a 5000-line file dump); replaying them
-# verbatim per reference per tool-loop step would blow the reference model's
-# context window and cost. We keep the agent's *actions* (tool calls) in full —
-# they are cheap, high-signal, and tell the reference what the agent did — but
-# preview each tool *result* head+tail so the reference still sees what came
-# back without replaying megabytes. The acting aggregator always gets the full,
-# untrimmed transcript; this budget only shapes the advisory copy.
-_REFERENCE_TOOL_RESULT_BUDGET = 4000
-
-# System prompt prepended to every reference-model call. References are
-# advisory — they do NOT act, call tools, or own the task. Without this
-# framing a reference receives the bare trimmed conversation and assumes it is
-# the acting agent: it then refuses ("I can't access repositories / URLs from
-# here") or tries to call tools it doesn't have. The prompt reframes the model
-# as an analyst whose job is to reason about the presented state and hand its
-# best thinking to the aggregator/orchestrator that will actually act.
-_REFERENCE_SYSTEM_PROMPT = (
-    "You are a reference advisor in a Mixture of Agents (MoA) process. You are "
-    "NOT the acting agent and you do NOT execute anything: you cannot call "
-    "tools, run commands, browse, or access files, repositories, or URLs, and "
-    "you should not try to or apologize for being unable to. A separate "
-    "aggregator/orchestrator model holds those capabilities and will take the "
-    "actual actions.\n\n"
-    "The conversation below is the current state of a task handled by that "
-    "acting agent. Your job is to give your most intelligent analysis of that "
-    "state: understand the goal, reason about the problem, and advise on what "
-    "to do next. Surface the best approach, concrete next steps and tool-use "
-    "strategy, likely pitfalls and risks, and anything the acting agent may "
-    "have missed or gotten wrong. Assume any referenced files, URLs, or "
-    "systems exist and reason about them from the context given rather than "
-    "asking for access.\n\n"
-    "Respond with your advice directly — no preamble, no disclaimers about "
-    "tools or access. Your response is private guidance handed to the "
-    "aggregator, not an answer shown to the user."
-)
-
-

 def _slot_label(slot: dict[str, str]) -> str:
    return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}"


-def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
-    """Resolve a reference/aggregator slot to real runtime call kwargs.
-
-    A MoA slot is just a model selection — it must be called the same way any
-    model is called elsewhere, not through a bare ``call_llm(provider=...,
-    model=...)`` that leaves base_url/api_key/api_mode unresolved and lets the
-    auxiliary auto-detector guess. We route the slot's provider through
-    ``resolve_runtime_provider`` (the canonical provider→api_mode/base_url/
-    api_key resolver the CLI, gateway, and delegate_task all use), so the slot
-    gets its provider's real API surface — e.g. MiniMax → anthropic_messages,
-    GPT-5/o-series → max_completion_tokens, custom endpoints → their base_url.
-
-    Returns the kwargs to pass through to ``call_llm`` (provider/model plus the
-    resolved base_url/api_key when available). Falls back to the bare
-    provider/model on any resolution error so a misconfigured slot still
-    attempts the call rather than aborting the whole MoA turn.
-    """
-    provider = str(slot.get("provider") or "").strip()
-    model = str(slot.get("model") or "").strip()
-    out: dict[str, Any] = {"provider": provider, "model": model}
-    try:
-        from hermes_cli.runtime_provider import resolve_runtime_provider
-
-        rt = resolve_runtime_provider(requested=provider, target_model=model)
-        resolved_provider = str(rt.get("provider") or provider).strip().lower()
-        # call_llm treats an explicit base_url as a custom endpoint. That is
-        # correct for ordinary OpenAI-compatible targets, but wrong for OAuth /
-        # provider-backed targets whose provider branch adds auth refresh,
-        # request metadata, or request-shape adapters. Keep those providers
-        # identified by name.
-        if resolved_provider in {"nous", "openai-codex", "xai-oauth"}:
-            return out
-        # Pass the resolved endpoint through so call_llm builds the request for
-        # the provider's actual API surface instead of auto-detecting. base_url
-        # routes call_llm to the right adapter (incl. anthropic_messages mode);
-        # api_key is the resolved credential for that provider.
-        if rt.get("base_url"):
-            out["base_url"] = rt["base_url"]
-        if rt.get("api_key"):
-            out["api_key"] = rt["api_key"]
-    except Exception as exc:  # pragma: no cover - defensive
-        logger.debug("MoA slot runtime resolution failed for %s: %s", _slot_label(slot), exc)
-    return out
-
-
 def _run_reference(
    slot: dict[str, str],
    ref_messages: list[dict[str, Any]],
    *,
-    temperature: float | None = None,
-    max_tokens: int | None = None,
+    temperature: float,
+    max_tokens: int,
 ) -> tuple[str, str]:
    """Call one reference model and return ``(label, text)``.

-    The slot is resolved to its provider's real runtime (via ``_slot_runtime``)
-    and called through the same ``call_llm`` request-building path any model
-    uses, so per-model wire-format handling (anthropic_messages,
-    max_completion_tokens, fixed/forbidden temperature) applies identically to
-    a reference as it would if that model were the acting model. MoA imposes no
-    cap of its own (``max_tokens`` defaults to ``None`` → omitted → the model's
-    real maximum); ``temperature`` is only the user's configured preset value,
-    which call_llm may still override per model.
-
    Never raises: a failed reference becomes a labelled note so the aggregator
    can still act with partial context. Designed to run inside a thread pool —
    ``call_llm`` is synchronous/blocking, so threads (not asyncio) are the right
@@ -139,17 +46,13 @@ def _run_reference(
    """
    label = _slot_label(slot)
    try:
-        # Prepend the advisory-role system prompt so the reference understands
-        # it is analyzing state for an aggregator, not acting on the task. The
-        # trimmed view (_reference_messages) already strips the agent's own
-        # system prompt, so this is the only system message the reference sees.
-        messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
        response = call_llm(
            task="moa_reference",
-            messages=messages,
+            provider=slot["provider"],
+            model=slot["model"],
+            messages=ref_messages,
            temperature=temperature,
            max_tokens=max_tokens,
-            **_slot_runtime(slot),
        )
        return label, _extract_text(response) or "(empty response)"
    except Exception as exc:
@@ -161,8 +64,8 @@ def _run_references_parallel(
    reference_models: list[dict[str, str]],
    ref_messages: list[dict[str, Any]],
    *,
-    temperature: float | None = None,
-    max_tokens: int | None = None,
+    temperature: float,
+    max_tokens: int,
 ) -> list[tuple[str, str]]:
    """Fan out all reference models in parallel, returning outputs in order.

@@ -203,140 +106,40 @@ def _run_references_parallel(
    return [r for r in results if r is not None]


-def _truncate_tool_result(text: str, budget: int = _REFERENCE_TOOL_RESULT_BUDGET) -> str:
-    """Head+tail preview of a tool result for the advisory view.
-
-    Keeps the first and last halves of the budget with a ``[... N chars
-    omitted ...]`` marker between them, so a reference sees both how the result
-    started and how it ended without replaying the whole payload.
-    """
-    if not text or len(text) <= budget:
-        return text
-    half = budget // 2
-    omitted = len(text) - 2 * half
-    return f"{text[:half]}\n[... {omitted} chars omitted ...]\n{text[-half:]}"
-
-
-def _render_tool_calls(tool_calls: Any) -> str:
-    """Render an assistant turn's tool_calls as readable text lines.
-
-    The advisory view cannot carry real ``tool_calls`` payloads (strict
-    providers reject tool_calls the reference never produced), so the agent's
-    actions are flattened to text the reference can read and reason about.
-    """
-    lines: list[str] = []
-    for tc in tool_calls or []:
-        fn = (tc.get("function") or {}) if isinstance(tc, dict) else {}
-        name = fn.get("name") or (tc.get("name") if isinstance(tc, dict) else "") or "tool"
-        args = fn.get("arguments")
-        if isinstance(args, str):
-            args_text = args
-        elif args is not None:
-            try:
-                import json
-
-                args_text = json.dumps(args, ensure_ascii=False)
-            except Exception:
-                args_text = str(args)
-        else:
-            args_text = ""
-        lines.append(f"[called tool: {name}({args_text})]" if args_text else f"[called tool: {name}]")
-    return "\n".join(lines)
-
-
 def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    """Build an advisory view of the conversation for reference models.
+    """Build an advisory-safe view of the conversation for reference models.

-    A reference gives an INFORMED judgement on the current state, so it must
-    see what the agent actually did — its tool calls AND the tool results that
-    came back — not just the agent's narration. We therefore preserve the whole
-    conversation flow, but flatten it into clean user/assistant *text* turns:
-
-      - system prompt: dropped (8K of Hermes boilerplate, not advisory signal).
-      - assistant turns: kept; any ``tool_calls`` are rendered inline as
-        ``[called tool: name(args)]`` text lines appended to the turn's text.
-      - ``tool``-role results: NOT dropped. Each is folded (head+tail preview,
-        see ``_truncate_tool_result``) into the *preceding* assistant turn as a
-        ``[tool result: ...]`` block, so the reference sees what came back.
-
-    This emits ZERO ``tool``-role messages and ZERO ``tool_calls`` arrays — only
-    plain user/assistant text — so strict providers (Mistral, Fireworks) that
-    reject orphan tool messages / unproduced tool_calls don't 400, while the
-    reference still has the full picture.
-
-    The view MUST end with a ``user`` turn. Anthropic (and OpenRouter→Anthropic)
-    interpret a trailing assistant turn as an assistant *prefill* to continue,
-    and no-prefill models (e.g. Claude Opus 4.8) reject it with
-    ``400 ... must end with a user message``. Rather than DELETE the agent's
-    latest context to satisfy that (which would blind the reference to the
-    current state), we APPEND a synthetic user turn asking the reference to
-    judge the state above. End-on-user is satisfied and no context is lost.
-
-    The acting aggregator always receives the full, untrimmed transcript; this
-    function only shapes the disposable advisory copy.
+    Reference calls are advisory: they never call tools and never emit the
+    ``tool_calls`` the main model did. Replaying the full transcript verbatim
+    (a) re-bills the ~8K-token Hermes system prompt per reference per
+    iteration and (b) risks 400s from strict providers (Mistral, Fireworks)
+    that reject orphan ``tool`` messages or ``tool_calls`` the reference never
+    produced. We keep only the user/assistant *text* turns, dropping the
+    system prompt, any ``tool``-role messages, and any ``tool_calls`` payloads.
    """
-    advisory_instruction = (
-        "[The conversation above is the current state of the task. Give your "
-        "most intelligent judgement: what is going on, what should happen next, "
-        "what risks or mistakes you see, and how the acting agent should "
-        "proceed.]"
-    )
-
-    rendered: list[dict[str, Any]] = []
-    last_user_content: str | None = None
+    trimmed: list[dict[str, Any]] = []
    for msg in messages:
        role = msg.get("role")
-        content = msg.get("content")
-        text = content if isinstance(content, str) else ""
-
-        if role == "system":
+        if role not in ("user", "assistant"):
+            # Drop system prompt and tool-result messages.
            continue
-        if role == "user":
-            if text.strip():
-                last_user_content = text
-            rendered.append({"role": "user", "content": text})
-        elif role == "assistant":
-            parts: list[str] = []
-            if text.strip():
-                parts.append(text.strip())
-            calls_text = _render_tool_calls(msg.get("tool_calls"))
-            if calls_text:
-                parts.append(calls_text)
-            # Empty assistant turns (no text, no calls) carry nothing advisory.
-            if parts:
-                rendered.append({"role": "assistant", "content": "\n".join(parts)})
-        elif role == "tool":
-            # Fold the tool result into the preceding assistant turn as text so
-            # the reference sees what came back, without emitting a tool-role
-            # message a reference never produced.
-            result_text = _truncate_tool_result(text)
-            block = f"[tool result: {result_text}]"
-            if rendered and rendered[-1].get("role") == "assistant":
-                rendered[-1]["content"] = rendered[-1]["content"] + "\n" + block
-            else:
-                # No assistant turn to attach to (e.g. a leading tool result);
-                # keep it as advisory context on its own assistant-role line.
-                rendered.append({"role": "assistant", "content": block})
-        # Any other role is ignored.
-
-    # End on a user turn: append a synthetic advisory request rather than
-    # deleting the agent's latest assistant context. This satisfies Anthropic's
-    # no-trailing-assistant-prefill rule while preserving full state.
-    if rendered and rendered[-1].get("role") == "assistant":
-        rendered.append({"role": "user", "content": advisory_instruction})
-    elif rendered and rendered[-1].get("role") == "user":
-        # Already ends on a user turn (fresh user prompt, no agent action yet).
-        # Leave it — the reference answers that prompt directly.
-        pass
-
-    if not rendered:
-        # Degenerate case: nothing rendered. Fall back to the latest user turn.
-        if last_user_content is not None:
-            return [{"role": "user", "content": last_user_content}]
+        content = msg.get("content")
+        if not isinstance(content, str):
+            # Skip any turn whose non-string content is empty (None, []); other non-text content becomes "" below.
+            if not content:
+                continue
+        text = content if isinstance(content, str) else ""
+        if role == "assistant" and not text.strip():
+            # Assistant turn that was purely tool calls — nothing advisory.
+            continue
+        trimmed.append({"role": role, "content": text})
+    if not trimmed:
+        # Degenerate case (e.g. first turn was stripped): fall back to a
+        # minimal user turn so the reference still has something to answer.
        for msg in reversed(messages):
            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                return [{"role": "user", "content": msg["content"]}]
-    return rendered
+    return trimmed



@@ -366,18 +169,12 @@ def aggregate_moa_context(
    aggregator: dict[str, str],
    temperature: float = 0.6,
    aggregator_temperature: float = 0.4,
-    max_tokens: int | None = None,
+    max_tokens: int = 4096,
 ) -> str:
    """Run configured reference models and synthesize their advice.

    Failures are returned as model-specific notes instead of aborting the normal
    agent loop; the main model can still act with partial context.
-
-    ``max_tokens`` is ``None`` by default: MoA does not cap reference or
-    aggregator output, so each model uses its own maximum. ``call_llm`` omits
-    the parameter entirely when it is ``None`` (see its docstring), which also
-    sidesteps providers that reject ``max_tokens`` outright. A hardcoded cap
-    here previously truncated long aggregator syntheses.
    """
    reference_outputs: list[tuple[str, str]] = []
    ref_messages = _reference_messages(api_messages)
@@ -406,10 +203,11 @@ def aggregate_moa_context(
    try:
        response = call_llm(
            task="moa_aggregator",
+            provider=aggregator["provider"],
+            model=aggregator["model"],
            messages=[{"role": "user", "content": synth_prompt}],
            temperature=aggregator_temperature,
            max_tokens=max_tokens,
-            **_slot_runtime(aggregator),
        )
        synthesis = _extract_text(response)
    except Exception as exc:
@@ -432,38 +230,8 @@ def aggregate_moa_context(
 class MoAChatCompletions:
    """OpenAI-chat-compatible facade where the aggregator is the acting model."""

-    def __init__(self, preset_name: str, reference_callback: Any = None):
+    def __init__(self, preset_name: str):
        self.preset_name = preset_name or "default"
-        # Optional display hook. Called as reference outputs become available so
-        # frontends can show each reference model's answer as a labelled block
-        # before the aggregator acts. Signature:
-        #   reference_callback(event, **kwargs)
-        # where event is one of:
-        #   "moa.reference"   kwargs: index, count, label, text
-        #   "moa.aggregating" kwargs: aggregator (label), ref_count
-        # Never raises into the model call — display is best-effort.
-        self.reference_callback = reference_callback
-        # State-scoped reference cache. The agent loop calls create() once per
-        # tool-loop iteration; references should re-run whenever the task STATE
-        # advances — i.e. on every new user message AND every new tool result —
-        # so each reference judges the latest state. The advisory view
-        # (_reference_messages) now renders tool calls + results as text, so its
-        # signature changes on every new tool response; the cache key is that
-        # signature, so a new tool result is a cache MISS (references re-run)
-        # while a redundant create() call with identical state is a HIT (no
-        # re-run, no re-emit). This gives "fire on every user/tool response"
-        # for free, without re-firing on a pure no-op re-call.
-        self._ref_cache_key: tuple | None = None
-        self._ref_cache_outputs: list[tuple[str, str]] = []
-
-    def _emit(self, event: str, **kwargs: Any) -> None:
-        cb = self.reference_callback
-        if cb is None:
-            return
-        try:
-            cb(event, **kwargs)
-        except Exception as exc:  # pragma: no cover - display must never break the turn
-            logger.debug("MoA reference_callback failed for %s: %s", event, exc)

    def create(self, **api_kwargs: Any) -> Any:
        from hermes_cli.config import load_config
@@ -473,10 +241,7 @@ class MoAChatCompletions:
        messages = list(api_kwargs.get("messages") or [])
        reference_models = preset.get("reference_models") or []
        aggregator = preset.get("aggregator") or {}
-        # MoA does not cap reference or aggregator output: each model uses its
-        # own maximum. Passing max_tokens=None makes call_llm omit the parameter
-        # (it never caps by default), so a long aggregator synthesis is never
-        # truncated and providers that reject max_tokens don't 400.
+        max_tokens = int(preset.get("max_tokens", api_kwargs.get("max_tokens") or 4096) or 4096)
        temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
        aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)

@@ -488,52 +253,12 @@ class MoAChatCompletions:

        reference_outputs: list[tuple[str, str]] = []
        ref_messages = _reference_messages(messages)
-
-        # Turn-scoped cache: only run + display references when the advisory
-        # view changed (i.e. a new user turn). Within one turn the agent loop
-        # calls create() once per tool iteration with the same advisory view;
-        # reuse the cached outputs and skip both the re-run and the re-emit.
-        _sig = hashlib.sha256(
-            "\u0000".join(
-                f"{m.get('role')}:{m.get('content')}" for m in ref_messages
-            ).encode("utf-8", "replace")
-        ).hexdigest()
-        _cache_key = (self.preset_name, _sig, tuple(_slot_label(s) for s in reference_models))
-        _refs_from_cache = _cache_key == self._ref_cache_key and bool(self._ref_cache_outputs)
-
-        if _refs_from_cache:
-            reference_outputs = list(self._ref_cache_outputs)
-        else:
-            reference_outputs = _run_references_parallel(
-                reference_models,
-                ref_messages,
-                temperature=temperature,
-                max_tokens=None,
-            )
-            self._ref_cache_key = _cache_key
-            self._ref_cache_outputs = list(reference_outputs)
-
-            # Surface each reference model's answer to the display BEFORE the
-            # aggregator acts — once per turn (only on the iteration that
-            # actually ran them). The user sees one labelled block per
-            # reference (rendered like a thinking block) so the MoA process is
-            # visible rather than a silent pause. Best-effort: never blocks the
-            # turn.
-            _ref_count = len(reference_outputs)
-            for _idx, (_label, _text) in enumerate(reference_outputs, start=1):
-                self._emit(
-                    "moa.reference",
-                    index=_idx,
-                    count=_ref_count,
-                    label=_label,
-                    text=_text,
-                )
-            if _ref_count:
-                self._emit(
-                    "moa.aggregating",
-                    aggregator=_slot_label(aggregator),
-                    ref_count=_ref_count,
-                )
+        reference_outputs = _run_references_parallel(
+            reference_models,
+            ref_messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )

        agg_messages = [dict(m) for m in messages]
        if reference_outputs:
@@ -561,26 +286,21 @@ class MoAChatCompletions:
            raise RuntimeError("MoA aggregator cannot be another MoA preset")
        agg_kwargs = dict(api_kwargs)
        agg_kwargs["messages"] = agg_messages
-        # The aggregator is the acting model. Resolve its slot to the provider's
-        # real runtime (base_url/api_key/api_mode) and call it through the same
-        # request-building path any model uses — so per-model wire-format
-        # handling (anthropic_messages, max_completion_tokens, fixed/forbidden
-        # temperature) applies identically to it. MoA imposes no output cap:
-        # max_tokens is passed through from the caller (normally None → omitted
-        # → the model's real maximum). The preset's old hardcoded 4096 default
-        # is gone — it truncated long syntheses.
+        agg_kwargs["model"] = aggregator.get("model")
+        agg_kwargs["temperature"] = aggregator_temperature
        return call_llm(
            task="moa_aggregator",
+            provider=aggregator.get("provider"),
+            model=aggregator.get("model"),
            messages=agg_messages,
            temperature=aggregator_temperature,
            max_tokens=agg_kwargs.get("max_tokens"),
            tools=agg_kwargs.get("tools"),
            extra_body=agg_kwargs.get("extra_body"),
-            **_slot_runtime(aggregator),
        )


 class MoAClient:
-    def __init__(self, preset_name: str, reference_callback: Any = None):
+    def __init__(self, preset_name: str):
        self.chat = type("_MoAChat", (), {})()
-        self.chat.completions = MoAChatCompletions(preset_name, reference_callback=reference_callback)
+        self.chat.completions = MoAChatCompletions(preset_name)
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -478,16 +478,6 @@ def _infer_provider_from_url(base_url: str) -> Optional[str]:
    return None


-def _lmstudio_server_root(base_url: str) -> str:
-    """Return the LM Studio server root for native ``/api/v1`` endpoints."""
-    root = _normalize_base_url(base_url).rstrip("/")
-    for suffix in ("/api/v1", "/api", "/v1"):
-        if root.endswith(suffix):
-            root = root[: -len(suffix)].rstrip("/")
-            break
-    return root
-
-
 def _is_known_provider_base_url(base_url: str) -> bool:
    return _infer_provider_from_url(base_url) is not None

@@ -559,7 +549,6 @@ def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
    server_url = normalized
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]
-    lmstudio_url = _lmstudio_server_root(base_url)

    headers = _auth_headers(api_key)

@@ -567,7 +556,7 @@ def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
        with httpx.Client(timeout=2.0, headers=headers) as client:
            # LM Studio exposes /api/v1/models — check first (most specific)
            try:
-                r = client.get(f"{lmstudio_url}/api/v1/models")
+                r = client.get(f"{server_url}/api/v1/models")
                if r.status_code == 200:
                    return "lm-studio"
            except Exception:
@@ -785,7 +774,7 @@ def fetch_endpoint_model_metadata(
    if is_local_endpoint(normalized):
        try:
            if detect_local_server_type(normalized, api_key=api_key) == "lm-studio":
-                server_url = _lmstudio_server_root(normalized)
+                server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized
                response = requests.get(
                    server_url.rstrip("/") + "/api/v1/models",
                    headers=headers,
@@ -1199,56 +1188,6 @@ def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Option
    return None


-def query_ollama_supports_vision(model: str, base_url: str, api_key: str = "") -> Optional[bool]:
-    """Return True/False when Ollama ``/api/show`` reports vision support.
-
-    Uses the ``capabilities`` field on Ollama 0.6.0+ and falls back to
-    ``model_info.*.vision.block_count`` on older servers. Returns None when
-    the server is unreachable, not Ollama, or the model is unknown.
-    """
-    import httpx
-
-    bare_model = _strip_provider_prefix(model)
-    if not bare_model or not base_url:
-        return None
-
-    try:
-        if detect_local_server_type(base_url, api_key=api_key) != "ollama":
-            return None
-    except Exception:
-        return None
-
-    server_url = base_url.rstrip("/")
-    if server_url.endswith("/v1"):
-        server_url = server_url[:-3]
-
-    headers = _auth_headers(api_key)
-
-    try:
-        with httpx.Client(timeout=3.0, headers=headers) as client:
-            resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
-            if resp.status_code != 200:
-                return None
-            data = resp.json()
-    except Exception:
-        return None
-
-    caps = data.get("capabilities")
-    if isinstance(caps, list):
-        if any(str(cap).lower() == "vision" for cap in caps):
-            return True
-        if caps:
-            return False
-
-    model_info = data.get("model_info")
-    if isinstance(model_info, dict):
-        for key in model_info:
-            if "vision.block_count" in str(key).lower():
-                return True
-
-    return None
-
-
 def _query_ollama_api_show(model: str, base_url: str, api_key: str = "") -> Optional[int]:
    """Query an Ollama server's native ``/api/show`` for context length.

@@ -1358,7 +1297,6 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") ->
    server_url = base_url.rstrip("/")
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]
-    lmstudio_url = _lmstudio_server_root(base_url)

    headers = _auth_headers(api_key)

@@ -1402,7 +1340,7 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") ->
            # Use _model_id_matches for fuzzy matching: LM Studio stores models as
            # "publisher/slug" but users configure only "slug" after "local:" prefix.
            if server_type == "lm-studio":
-                resp = client.get(f"{lmstudio_url}/api/v1/models")
+                resp = client.get(f"{server_url}/api/v1/models")
                if resp.status_code == 200:
                    data = resp.json()
                    for m in data.get("models", []):
@@ -1708,34 +1646,6 @@ def get_model_context_length(
    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
        return config_context_length

-    # 0a. MoA virtual provider — ``model`` is a preset name, not a real model,
-    # and ``base_url`` is the local virtual endpoint, so every probe below would
-    # miss and fall through to the 256K default. The aggregator is the acting
-    # model, so resolve the context window from the aggregator slot's real
-    # provider+model instead. References are advisory-only and never bound the
-    # acting context, so they're ignored here.
-    if (provider or "").strip().lower() == "moa":
-        try:
-            from hermes_cli.config import load_config
-            from hermes_cli.moa_config import resolve_moa_preset
-            from hermes_cli.runtime_provider import resolve_runtime_provider
-
-            preset = resolve_moa_preset(load_config().get("moa") or {}, model)
-            agg = preset.get("aggregator") or {}
-            agg_provider = str(agg.get("provider") or "").strip()
-            agg_model = str(agg.get("model") or "").strip()
-            if agg_model and agg_provider and agg_provider.lower() != "moa":
-                rt = resolve_runtime_provider(requested=agg_provider, target_model=agg_model)
-                return get_model_context_length(
-                    agg_model,
-                    base_url=rt.get("base_url", "") or "",
-                    api_key=rt.get("api_key", "") or "",
-                    provider=agg_provider,
-                )
-        except Exception:
-            logger.debug("MoA aggregator context-length resolution failed", exc_info=True)
-        # Fall through to the generic default if aggregator resolution failed.
-
    # 0b. custom_providers per-model override — check before any probe.
    # This closes the gap where /model switch and display paths used to fall
    # back to 128K despite the user having a per-model context_length set.
--- a/agent/process_bootstrap.py
+++ b/agent/process_bootstrap.py
@@ -26,7 +26,7 @@ from __future__ import annotations
 import os
 import sys
 import urllib.request
-from typing import Any, Optional
+from typing import Optional

 from utils import base_url_hostname, normalize_proxy_url

@@ -142,46 +142,6 @@ def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
    return proxy


-def build_keepalive_http_client(
-    base_url: str = "",
-    *,
-    async_mode: bool = False,
-) -> Optional[Any]:
-    """Build an httpx client for OpenAI SDK calls with env-only proxy policy.
-
-    Uses explicit ``HTTPS_PROXY`` / ``NO_PROXY`` env vars via
-    ``_get_proxy_for_base_url``. A custom transport disables httpx's default
-    ``trust_env`` path, so macOS system proxy settings from
-    ``urllib.request.getproxies()`` (which omit the ExceptionsList) are not
-    applied. Mirrors ``AIAgent._build_keepalive_http_client``.
-    """
-    try:
-        import httpx
-        import socket
-
-        if "api.githubcopilot.com" in str(base_url or "").lower():
-            client_cls = httpx.AsyncClient if async_mode else httpx.Client
-            return client_cls()
-
-        sock_opts = [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)]
-        if hasattr(socket, "TCP_KEEPIDLE"):
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 30))
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 10))
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3))
-        elif hasattr(socket, "TCP_KEEPALIVE"):
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPALIVE, 30))
-
-        proxy = _get_proxy_for_base_url(base_url)
-        transport_cls = httpx.AsyncHTTPTransport if async_mode else httpx.HTTPTransport
-        client_cls = httpx.AsyncClient if async_mode else httpx.Client
-        return client_cls(
-            transport=transport_cls(socket_options=sock_opts),
-            proxy=proxy,
-        )
-    except Exception:
-        return None
-
-
 def _install_safe_stdio() -> None:
    """Wrap stdout/stderr so best-effort console output cannot crash the agent."""
    for stream_name in ("stdout", "stderr"):
@@ -204,5 +164,4 @@ __all__ = [
    "_install_safe_stdio",
    "_get_proxy_from_env",
    "_get_proxy_for_base_url",
-    "build_keepalive_http_client",
 ]
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -88,15 +88,12 @@ def _find_hermes_md(cwd: Path) -> Optional[Path]:
    stop_at = _find_git_root(cwd)
    current = cwd.resolve()

-    # When there is no git root, only check cwd itself – walking parents
-    # could pick up a .hermes.md planted in /tmp, /home, etc.
-    search_dirs = [current, *current.parents] if stop_at else [current]
-
-    for directory in search_dirs:
+    for directory in [current, *current.parents]:
        for name in _HERMES_MD_NAMES:
            candidate = directory / name
            if candidate.is_file():
                return candidate
+        # Stop walking at the git root (or filesystem root).
        if stop_at and directory == stop_at:
            break
    return None
@@ -620,12 +617,7 @@ DEVELOPER_ROLE_MODELS = ("gpt-5", "codex")
 PLATFORM_HINTS = {
    "whatsapp": (
        "You are on a text messaging communication platform, WhatsApp. "
-        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
-        "`code`, ```code blocks```, [links](url)) is auto-converted to "
-        "WhatsApp's native syntax (*bold*, _italic_, ~strike~, monospace) — "
-        "feel free to write in markdown, and use bullet lists ('- item') "
-        "freely. Tables are NOT supported — prefer bullet lists or labeled "
-        "key:value pairs. "
+        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. The file "
        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
@@ -690,11 +682,7 @@ PLATFORM_HINTS = {
    ),
    "signal": (
        "You are on a text messaging communication platform, Signal. "
-        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
-        "`code`, ```code blocks```) is auto-converted to Signal's native "
-        "rich formatting — feel free to write in markdown, and use bullet "
-        "lists ('- item') freely (they render as • bullets). Tables are NOT "
-        "supported — prefer bullet lists or labeled key:value pairs. "
+        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio as attachments, and other "
@@ -929,7 +917,8 @@ def _probe_remote_backend(env_type: str) -> str | None:
    try:
        # Import locally: tools/ imports are heavy and only relevant when a
        # non-local backend is actually configured.
-        from tools.terminal_tool import _create_environment, _get_env_config  # type: ignore
+        from tools.terminal_tool import _get_env_config  # type: ignore
+        from tools.environments import get_environment  # type: ignore
    except Exception as e:
        logger.debug("Backend probe unavailable (import failed): %s", e)
        _BACKEND_PROBE_CACHE[cache_key] = ""
@@ -937,59 +926,7 @@ def _probe_remote_backend(env_type: str) -> str | None:

    try:
        config = _get_env_config()
-        # Build the environment the same way tools/terminal_tool.py does for a
-        # live command: select the backend image, then assemble ssh/container
-        # config from the env-derived dict. (There is no `get_environment`
-        # factory — the real entry point is `_create_environment`.)
-        if env_type == "docker":
-            image = config.get("docker_image", "")
-        elif env_type == "singularity":
-            image = config.get("singularity_image", "")
-        elif env_type == "modal":
-            image = config.get("modal_image", "")
-        elif env_type == "daytona":
-            image = config.get("daytona_image", "")
-        else:
-            image = ""
-
-        ssh_config = None
-        if env_type == "ssh":
-            ssh_config = {
-                "host": config.get("ssh_host", ""),
-                "user": config.get("ssh_user", ""),
-                "port": config.get("ssh_port", 22),
-                "key": config.get("ssh_key", ""),
-                "persistent": config.get("ssh_persistent", False),
-            }
-
-        container_config = None
-        if env_type in {"docker", "singularity", "modal", "daytona"}:
-            container_config = {
-                "container_cpu": config.get("container_cpu", 1),
-                "container_memory": config.get("container_memory", 5120),
-                "container_disk": config.get("container_disk", 51200),
-                "container_persistent": config.get("container_persistent", True),
-                "modal_mode": config.get("modal_mode", "auto"),
-                "docker_volumes": config.get("docker_volumes", []),
-                "docker_mount_cwd_to_workspace": config.get("docker_mount_cwd_to_workspace", False),
-                "docker_forward_env": config.get("docker_forward_env", []),
-                "docker_env": config.get("docker_env", {}),
-                "docker_run_as_host_user": config.get("docker_run_as_host_user", False),
-                "docker_extra_args": config.get("docker_extra_args", []),
-                "docker_persist_across_processes": config.get("docker_persist_across_processes", True),
-                "docker_orphan_reaper": config.get("docker_orphan_reaper", True),
-            }
-
-        env = _create_environment(
-            env_type=env_type,
-            image=image,
-            cwd=config.get("cwd", ""),
-            timeout=config.get("timeout", 180),
-            ssh_config=ssh_config,
-            container_config=container_config,
-            task_id="prompt-backend-probe",
-            host_cwd=config.get("host_cwd"),
-        )
+        env = get_environment(config)
        # Single-line POSIX probe — works on any Unixy backend. Wrapped in
        # `2>/dev/null` so a missing binary doesn't pollute the output.
        probe_cmd = (
--- a/agent/reasoning_timeouts.py
+++ b/agent/reasoning_timeouts.py
@@ -1,216 +0,0 @@
-"""Per-reasoning-model stale-timeout floor for known reasoning models.
-
-Reasoning models (those that emit extended thinking blocks before their
-first content token) routinely exceed Hermes's default chat-model
-stale detectors:
-
-* Stream stale detector:   ``HERMES_STREAM_STALE_TIMEOUT``     default 180s
-                           ``agent/chat_completion_helpers.py:2544``
-* Non-stream stale detector: ``HERMES_API_CALL_STALE_TIMEOUT``  default 90s
-                           ``run_agent.py:1140``
-
-For NVIDIA Nemotron 3 Ultra on the hosted NIM gateway the empirical
-upstream idle kill is ~120s (first-party reproduction at
-NVIDIA/NemoClaw#4846 — TTFB ~31s, stream dies at 120s). The same
-failure mode exists on OpenAI o1/o3, Anthropic Opus 4.x thinking,
-DeepSeek R1, Qwen QwQ, xAI Grok reasoning — every cloud reasoning
-model hits upstream-proxies / load-balancers with idle timeouts
-shorter than the model's thinking phase. Result: the stale detector
-kills the connection mid-think, surfacing as
-``BrokenPipeError``/``RemoteProtocolError`` on the next read.
-
-This module provides a floor that the existing stale-detector scaling
-blocks consult via :func:`get_reasoning_stale_timeout_floor` and
-apply as ``max(default, floor)``. It is a FLOOR:
-
-* Never overrides explicit user config (``providers.<id>.models.<model>.stale_timeout_seconds``
-  or ``request_timeout_seconds`` already wins — this code never runs
-  in that branch).
-* Never lowers an existing threshold.
-* Has zero effect on non-reasoning models — they are not in the
-  allowlist and the resolver returns ``None``.
-
-Matching uses start-anchored regex on the slug-only component of
-the model name (after stripping any aggregator prefix like
-``openai/``, ``x-ai/``, ``anthropic/``).  The right-anchor matches
-end-of-string or a ``-``/``.``/``_`` slug separator, so ``qwen3-235b``
-matches the ``qwen3`` family entry (a future model slug would be
-``qwen3-235b-instruct`` and would also match) but ``some-other-qwen3``
-does NOT match ``qwen3`` (the ``-qwen3`` is not at start of slug).
-
-The ``o1`` case is the most delicate: a model named
-``llama-4-70b-o1-preview`` is a hypothetical community derivative that
-should NOT trigger the reasoning-model floor for the user (the user
-chose a non-OpenAI model, not a reasoning model).  The start-of-slug
-anchor naturally excludes this — the matched ``o1-preview`` is at
-position 11 of the slug, not at position 0.  The previous substring-
-with-trailing-hyphen design would have over-matched here, which is
-why start-of-slug anchoring is the right shape.
-
-Fixes #52217.
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Optional
-
-
-# (slug, floor_seconds).  Each slug is matched as a discrete
-# word-boundary component via the wrapper regex in ``_match_any``
-# below.  Order is irrelevant — the first regex match wins.
-_REASONING_STALE_TIMEOUT_FLOORS: tuple[tuple[str, int], ...] = (
-    # NVIDIA Nemotron — reasoning models behind hosted NIM with
-    # documented 60-180s upstream idle kill (NVIDIA/NemoClaw#4846:
-    # 120s measured).
-    ("nemotron-3-ultra", 600),
-    ("nemotron-3-super", 600),
-    ("nemotron-3-nano",  300),
-    # DeepSeek — R1 reasoning model on hosted NIM / DeepSeek direct.
-    ("deepseek-r1", 600),
-    ("deepseek-reasoner", 600),
-    # Qwen — QwQ reasoning + Qwen3 thinking variants.  QwQ-32B
-    # preview is the stable slug; ``qwen3`` covers the family of
-    # thinking-mode Qwen3 models (qwen3-235b-a22b, qwen3-32b, etc.)
-    # without over-matching every Qwen3 instruct variant — the
-    # right-anchor requires the slug to be at the start of the
-    # remaining model name, so ``qwen3-235b-instruct`` (instruct is
-    # NOT a thinking variant) would still match.  Acceptable
-    # trade-off: instruct variants of qwen3 get the 180s floor
-    # even though they don't reason.  The cost is a slightly longer
-    # wait on a hung provider; the alternative (matching only
-    # ``qwen3-.*-thinking``) breaks the moment NVIDIA or Alibaba
-    # ships a slightly different naming shape.
-    ("qwq-32b", 300),
-    ("qwen3", 180),
-    # OpenAI o-series — known multi-minute TTFB.  Each variant
-    # enumerated explicitly so bare ``o1`` doesn't over-match
-    # ``olmo-1`` or hypothetical future community derivatives.
-    ("o1", 600),
-    ("o1-mini", 600),
-    ("o1-pro", 600),
-    ("o1-preview", 600),
-    ("o3", 600),
-    ("o3-pro", 600),
-    ("o3-mini", 300),
-    ("o4-mini", 300),
-    # Anthropic Claude 4.x thinking variants.  Anchored at
-    # ``claude-opus-4`` so non-thinking Claude 3.x or future
-    # non-reasoning Claude variants don't match.
-    ("claude-opus-4", 240),
-    ("claude-sonnet-4.5", 180),
-    ("claude-sonnet-4.6", 180),
-    # xAI Grok reasoning variants.  Explicit reasoning-only keys
-    # plus one for the ``non-reasoning`` variant so users picking
-    # the fast variant don't get the 300s floor.  Bare ``grok-3``,
-    # ``grok-4`` etc. don't match — only the explicit reasoning /
-    # non-reasoning pairs.
-    ("grok-4-fast-reasoning", 300),
-    ("grok-4.20-reasoning", 300),
-    ("grok-4-fast-non-reasoning", 180),
-)
-
-
-# Pre-compile each pattern.  Wrapper = start-of-slug + slug + end-or-
-# separator, where ``start-of-slug`` means start-of-string OR
-# immediately after the last ``/`` (aggregator separator) and
-# ``end-or-separator`` means end-of-string OR a ``-``/``.``/``_``.
-#
-# Why start-of-slug and not start-of-string: aggregator prefixes
-# like ``openai/`` should not affect matching — the slug identity is
-# the part after the last ``/``.  Stripping the aggregator prefix in
-# :func:`get_reasoning_stale_timeout_floor` before regex matching
-# gives the wrapper a clean start-of-string anchor.
-#
-# Why end-or-separator on the right: ``openai/o3-mini`` must match
-# the ``o3-mini`` slug (the right anchor is end-of-string).  And
-# ``openai/o3-mini-2025-01-31`` must also match ``o3-mini`` (the right
-# anchor is the ``-`` separator).  But ``openai/o3-mini-fork`` should
-# NOT match ``o3-mini`` if we wanted to exclude forks — though the
-# pattern ``o3-mini-fork`` would be matched as a derivative anyway,
-# so we accept that community forks inheriting the same prefix are
-# treated as reasoning models (a reasonable default — the upstream
-# gateway timing is the same).
-_PATTERN_CACHE: dict[str, re.Pattern[str]] = {}
-
-
-def _get_pattern(slug: str) -> re.Pattern[str]:
-    compiled = _PATTERN_CACHE.get(slug)
-    if compiled is None:
-        compiled = re.compile(
-            r"^"
-            + re.escape(slug)
-            + r"(?:$|[\-._])"
-        )
-        _PATTERN_CACHE[slug] = compiled
-    return compiled
-
-
-def _match_any(model_lower: str) -> Optional[float]:
-    """Return the floor for the first matching slug, else None.
-
-    Each table entry is matched as a start-of-slug prefix with the
-    slug-separator-or-end-of-string right-anchor.  Table iteration
-    order is irrelevant: longest slug wins (so ``o3-mini`` beats
-    ``o3`` on a model like ``openai/o3-mini``).
-    """
-    # Sort by slug length descending so longer / more-specific slugs
-    # win on shared prefixes (o3-mini beats o3).
-    sorted_floors = sorted(
-        _REASONING_STALE_TIMEOUT_FLOORS, key=lambda kv: -len(kv[0])
-    )
-    for slug, floor in sorted_floors:
-        if _get_pattern(slug).search(model_lower):
-            return float(floor)
-    return None
-
-
-def get_reasoning_stale_timeout_floor(model: object) -> Optional[float]:
-    """Return the stale-timeout floor (seconds) for a known reasoning model.
-
-    Returns ``None`` when the model is not in the allowlist or the
-    argument is empty / not a string.  Matching uses
-    word-boundary-anchored regex on the lowercased model name, so
-    ``openai/o3-mini`` matches the ``o3-mini`` slug but
-    ``olmo-1`` does NOT match ``o1`` (the ``o1`` substring is not
-    at a word boundary inside ``olmo-1``).
-
-    Aggregator prefixes (``openai/``, ``x-ai/``, ``anthropic/`` etc.)
-    are preserved through matching — the ``/`` is itself a word
-    boundary, so ``openai/o3-mini`` matches ``o3-mini`` because the
-    ``/`` before ``o3-mini`` satisfies the left-anchor alternation.
-
-    This is a FLOOR — callers must apply it as ``max(default, floor)``
-    and only when no explicit user-configured per-model
-    ``stale_timeout_seconds`` exists.
-
-    >>> get_reasoning_stale_timeout_floor("nvidia/nemotron-3-ultra-550b-a55b")
-    600.0
-    >>> get_reasoning_stale_timeout_floor("openai/o3-mini")
-    300.0
-    >>> get_reasoning_stale_timeout_floor("deepseek/deepseek-r1")
-    600.0
-    >>> get_reasoning_stale_timeout_floor("qwen/qwen3-235b-a22b-thinking")
-    180.0
-    >>> get_reasoning_stale_timeout_floor("x-ai/grok-4-fast-reasoning")
-    300.0
-    >>> get_reasoning_stale_timeout_floor("anthropic/claude-opus-4-6")
-    240.0
-    >>> get_reasoning_stale_timeout_floor("gpt-4o") is None
-    True
-    >>> get_reasoning_stale_timeout_floor("olmo-1") is None
-    True
-    >>> get_reasoning_stale_timeout_floor(None) is None
-    True
-    """
-    if not model or not isinstance(model, str):
-        return None
-    name = model.strip().lower()
-    if not name:
-        return None
-    # Strip aggregator prefix (everything before and including the
-    # last ``/``).  The wrapper regex anchors at start-of-string, so
-    # the slug identity is the bare model name.
-    if "/" in name:
-        name = name.rsplit("/", 1)[1]
-    return _match_any(name)
--- a/agent/redact.py
+++ b/agent/redact.py
@@ -10,7 +10,6 @@ the first 6 and last 4 characters for debuggability.
 import logging
 import os
 import re
-import shlex

 logger = logging.getLogger(__name__)

@@ -108,60 +107,12 @@ _PREFIX_PATTERNS = [
    r"ntn_[A-Za-z0-9]{10,}",            # Notion internal integration token
 ]

-# ENV assignment patterns: KEY=value where KEY contains a secret-like name.
-# Uppercase keys tolerate spaces around "=" (e.g. ``FOO_SECRET = bar``) because
-# an all-caps key is almost never prose/code.
+# ENV assignment patterns: KEY=value where KEY contains a secret-like name
 _SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
 _ENV_ASSIGN_RE = re.compile(
    rf"([A-Z0-9_]{{0,50}}{_SECRET_ENV_NAMES}[A-Z0-9_]{{0,50}})\s*=\s*(['\"]?)(\S+)\2",
 )

-# Lowercase / dotted / hyphenated config keys from config files
-# (application.properties, .env, YAML-ish dumps): ``spring.datasource.password=secret``,
-# ``app.api.key=xyz``, ``password=secret``. The uppercase _ENV_ASSIGN_RE above
-# never matched these, so config-file passwords leaked verbatim (issue #16413).
-#
-# These run only in a config-file context, NOT in prose, code, or URLs — three
-# carve-outs preserved from the original design (#4367 + the documented
-# web-URL passthrough below):
-#   1. The value is bounded by ``[^\s&]`` (stops at whitespace AND ``&``) so
-#      form-urlencoded bodies are handled pair-by-pair (by _redact_form_body),
-#      not greedily swallowed.
-#   2. _CFG_DOTTED_RE only matches when the key is NAMESPACED (contains a dot),
-#      which is unambiguously a config key — never a prose word.
-#   3. _CFG_ANCHORED_RE matches a bare secret-word key only at line start
-#      (optionally after ``export``), so conversational ``I have password=foo``
-#      mid-sentence is left alone.
-# The colon-form URL guard (skip when ``://`` present) lives at the call site.
-_SECRET_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential|auth)"
-_CFG_VALUE = r"(['\"]?)([^\s&]+?)\2(?=[\s&]|$)"
-# Namespaced (dotted) key: the secret word may sit anywhere in a dotted path.
-_CFG_DOTTED_RE = re.compile(
-    rf"((?:[A-Za-z0-9_\-]+\.)+[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*"
-    rf"|[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*\.[A-Za-z0-9_.\-]+)"
-    rf"={_CFG_VALUE}",
-    re.IGNORECASE,
-)
-# Line-anchored bare key: ``password=…`` / ``export api_key=…`` at start of line.
-_CFG_ANCHORED_RE = re.compile(
-    rf"(^[ \t]*(?:export[ \t]+)?[A-Za-z0-9_\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_\-]*)={_CFG_VALUE}",
-    re.IGNORECASE | re.MULTILINE,
-)
-
-# Unquoted YAML / colon config (e.g. ``password: secret``,
-# ``spring.datasource.password: hunter2``). The secret keyword must be part of
-# the KEY (anchored to the start of the line/indent), and the value is a single
-# whitespace-free token — so prose like ``note: secret meeting`` (keyword in the
-# value) and ``error: token expired`` are left alone. Bare ``auth`` is excluded
-# from the key set so ``Authorization:`` / ``author:`` don't match (the former
-# is masked by _AUTH_HEADER_RE); ``auth_token``/``auth-token`` still match via
-# the ``token`` keyword. Quoted values defer to _JSON_FIELD_RE via the lookahead.
-_YAML_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential)"
-_YAML_ASSIGN_RE = re.compile(
-    rf"(^[ \t]*[A-Za-z0-9_.\-]*{_YAML_CFG_NAMES}[A-Za-z0-9_.\-]*)(:[ \t]*)(?!['\"])([^\s&]+)",
-    re.IGNORECASE | re.MULTILINE,
-)
-
 # JSON field patterns: "apiKey": "value", "token": "value", etc.
 _JSON_KEY_NAMES = r"(?:api_?[Kk]ey|token|secret|password|access_token|refresh_token|auth_token|bearer|secret_value|raw_secret|secret_input|key_material)"
 _JSON_FIELD_RE = re.compile(
@@ -174,15 +125,8 @@ _JSON_FIELD_RE = re.compile(
 # while the header name and scheme word are preserved for debuggability. The
 # previous rule only matched ``Bearer``, so ``Basic <base64 user:pass>`` and
 # ``token <pat>`` leaked verbatim into logs/transcripts.
-#
-# The credential class excludes quote characters (``"`` / ``'``): a token sitting
-# flush against a closing quote (``"Authorization: Bearer sk-..."``) must not pull
-# that quote into the match, or masking turns value corruption into *syntax*
-# corruption — the closing quote vanishes and the command/string no longer parses
-# (unterminated quote → shell EOF / Python SyntaxError). Real credentials never
-# contain ``"`` or ``'``, so excluding them is safe. See #43083.
 _AUTH_HEADER_RE = re.compile(
-    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?([^\s\"']+)",
+    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?(\S+)",
    re.IGNORECASE,
 )

@@ -210,37 +154,9 @@ _PRIVATE_KEY_RE = re.compile(
 )

 # Database connection strings: protocol://user:PASSWORD@host
-# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password.
-# The userinfo and password groups forbid whitespace ([^:\s]+ / [^@\s]+) so the
-# match can never span a line break. A real DSN password never contains
-# whitespace; without this bound the greedy [^@]+ would scan past the end of a
-# code line to the next stray "@" (e.g. a Python decorator), swallowing
-# intervening lines and corrupting tool OUTPUT for any source containing a
-# postgresql:// f-string template. See issue #33801.
+# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password
 _DB_CONNSTR_RE = re.compile(
-    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:\s]+:)([^@\s]+)(@)",
-    re.IGNORECASE,
-)
-
-# Bare-token credential in a web/transport URL: ``scheme://TOKEN@host``.
-# This is the ``git remote set-url origin https://PASSWORD@github.com/...``
-# shape from issue #6396 — a single opaque credential in the userinfo position
-# with NO ``user:pass`` colon. It is unambiguously a secret: legitimate
-# round-trip URLs (OAuth callbacks, magic links, pre-signed shares — see the
-# "Web-URL redaction is intentionally OFF" note in redact_sensitive_text) carry
-# their tokens in the QUERY STRING, never in bare userinfo. The colon form
-# ``user:pass@`` is deliberately left to pass through (commit "pass web URLs
-# through unchanged", #34029) and is NOT matched here — the token class forbids
-# ``:``. DB schemes are handled by _DB_CONNSTR_RE above and excluded here.
-#
-# Guards against false positives:
-#   - 8+ char floor skips short usernames (git, admin, root, deploy, ubuntu).
-#   - The token class ``[^\s:@/]`` cannot cross ``/``, so an ``@`` sitting in a
-#     path or query (e.g. ``?q=user@example.com``) is never treated as userinfo.
-_URL_BARE_TOKEN_RE = re.compile(
-    r"((?:https?|wss?|git|ssh|ftp|ftps|sftp)://)"  # scheme
-    r"([^\s:@/]{8,})"                               # bare token (no colon/slash/@), 8+ chars
-    r"(@[^\s]+)",                                   # @host...
+    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:]+:)([^@]+)(@)",
    re.IGNORECASE,
 )

@@ -424,40 +340,7 @@ def _redact_form_body(text: str) -> str:
    return _redact_query_string(text.strip())


-def _mask_token_nonreusable(token: str) -> str:
-    """Redact a prefix-matched credential to a NON-REUSABLE sentinel.
-
-    Unlike :func:`_mask_token` (which keeps head/tail chars — fine for logs
-    that are never fed back into a config), this emits a marker that:
-
-    * cannot be mistaken for a usable-but-truncated key, so an agent that
-      reads it from a config file and writes it back does NOT corrupt the
-      stored credential into a dead 13-char string (issue #35519); and
-    * still does not leak the secret material (no head/tail chars).
-
-    The vendor prefix label is preserved for debuggability so the agent can
-    still tell *which* credential is present (e.g. a GitHub PAT vs an OpenAI
-    key) without seeing any of its bytes.
-    """
-    if not token:
-        return "«redacted-secret»"
-    # Preserve only the recognizable vendor prefix label (e.g. "ghp_", "sk-"),
-    # never any of the random secret body.
-    label = ""
-    for sub in _PREFIX_SUBSTRINGS:
-        if token.startswith(sub):
-            label = sub
-            break
-    return f"«redacted:{label}…»" if label else "«redacted-secret»"
-
-
-def redact_sensitive_text(
-    text: str,
-    *,
-    force: bool = False,
-    code_file: bool = False,
-    file_read: bool = False,
-) -> str:
+def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = False) -> str:
    """Apply all redaction patterns to a block of text.

    Safe to call on any string -- non-matching text passes through unchanged.
@@ -470,17 +353,6 @@ def redact_sensitive_text(
    constants, "apiKey": "test" fixtures). Prefix patterns, auth headers,
    private keys, DB connstrings, JWTs, and URL secrets are still redacted.

-    Set file_read=True for file *content* returned to the agent (read_file /
-    search_files / cat). Secrets are STILL redacted — they are never exposed —
-    but prefix-matched credentials are replaced with a non-reusable sentinel
-    (``«redacted:ghp_…»``) instead of a head/tail-preserving mask
-    (``ghp_S1...Pn2T``). The old mask looked like a real-but-truncated key, so
-    an agent reading it from config.yaml and writing it back silently corrupted
-    the stored credential into a dead 13-char value → 401 (issue #35519). The
-    sentinel is syntactically invalid as a token, so it can't be mistaken for a
-    usable key or written back as one. Implies code_file=True (config/data
-    files shouldn't trigger the source-code ENV/JSON false-positive paths).
-
    Performance: each regex pattern is gated behind a cheap substring
    pre-check (e.g. ``"=" in text`` for ENV assignments, ``"://" in text``
    for URLs, ``"eyJ" in text`` for JWTs). On a typical hermes log line
@@ -499,15 +371,9 @@ def redact_sensitive_text(
    if not (force or _REDACT_ENABLED):
        return text

-    # file_read content shouldn't hit the source-code ENV/JSON false-positive
-    # paths either (it's config/data, not log lines).
-    if file_read:
-        code_file = True
-
    # Known prefixes (sk-, ghp_, etc.) — gate on substring presence
    if _has_known_prefix_substring(text):
-        _prefix_sub = _mask_token_nonreusable if file_read else _mask_token
-        text = _PREFIX_RE.sub(lambda m: _prefix_sub(m.group(1)), text)
+        text = _PREFIX_RE.sub(lambda m: _mask_token(m.group(1)), text)

    # ENV assignments: OPENAI_API_KEY=***  (skip for code files — false positives)
    if not code_file:
@@ -516,13 +382,6 @@ def redact_sensitive_text(
                name, quote, value = m.group(1), m.group(2), m.group(3)
                return f"{name}={quote}{_mask_token(value)}{quote}"
            text = _ENV_ASSIGN_RE.sub(_redact_env, text)
-            # Lowercase/dotted config keys (issue #16413). Skip URLs entirely —
-            # web-URL query params are intentionally passed through (see note
-            # near the bottom of this function); _DB_CONNSTR_RE still guards
-            # connection-string passwords.
-            if "://" not in text:
-                text = _CFG_DOTTED_RE.sub(_redact_env, text)
-                text = _CFG_ANCHORED_RE.sub(_redact_env, text)

        # JSON fields: "apiKey": "***"  (skip for code files — false positives)
        if ":" in text and '"' in text:
@@ -531,15 +390,6 @@ def redact_sensitive_text(
                return f'{key}: "{_mask_token(value)}"'
            text = _JSON_FIELD_RE.sub(_redact_json, text)

-        # Unquoted YAML / colon config: password: ***  (after JSON so quoted
-        # values are handled there; the lookahead in _YAML_ASSIGN_RE skips
-        # quotes). Skip URLs — web-URL query params pass through by design.
-        if ":" in text and "://" not in text:
-            def _redact_yaml(m):
-                key, sep, value = m.group(1), m.group(2), m.group(3)
-                return f"{key}{sep}{_mask_token(value)}"
-            text = _YAML_ASSIGN_RE.sub(_redact_yaml, text)
-
    # Authorization headers — _AUTH_HEADER_RE matches any scheme after
    # "[Proxy-]Authorization:" case-insensitively, so "uthorization" is the
    # cheapest substring gate that covers every casing without a casefold().
@@ -569,32 +419,9 @@ def redact_sensitive_text(
    if "BEGIN" in text and "-----" in text:
        text = _PRIVATE_KEY_RE.sub("[REDACTED PRIVATE KEY]", text)

-    # Database connection string passwords. With code_file=True, a password
-    # group that is a pure ``{...}`` brace expression is an f-string template
-    # reference (e.g. f"postgresql://{user}:{pass}@{host}"), not a literal
-    # credential — preserve it. Literal passwords are still redacted. The regex
-    # forbids whitespace in the password group, so a single-line template's
-    # group(2) is exactly the brace expression. See issue #33801.
+    # Database connection string passwords
    if "://" in text:
-        if code_file:
-            def _redact_db(m):
-                pw = m.group(2)
-                if pw.startswith("{") and pw.endswith("}"):
-                    return m.group(0)
-                return f"{m.group(1)}***{m.group(3)}"
-            text = _DB_CONNSTR_RE.sub(_redact_db, text)
-        else:
-            text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
-
-        # Bare-token userinfo in web/transport URLs: ``scheme://TOKEN@host``.
-        # The git-remote-with-embedded-password shape from #6396. Only the
-        # colon-less bare-token form is redacted — ``user:pass@`` and
-        # query-string tokens are left to pass through (see the web-URL note
-        # below). See _URL_BARE_TOKEN_RE for the false-positive guards.
-        text = _URL_BARE_TOKEN_RE.sub(
-            lambda m: f"{m.group(1)}{_mask_token(m.group(2))}{m.group(3)}",
-            text,
-        )
+        text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)

    # JWT tokens (eyJ... — base64-encoded JSON headers)
    if "eyJ" in text:
@@ -607,12 +434,7 @@ def redact_sensitive_text(
    # blanket-redacting param values by name breaks those skills mid-flow.
    # Known credential shapes (sk-, ghp_, JWTs, etc.) inside URLs are still
    # caught by _PREFIX_RE and _JWT_RE above. DB connection-string passwords
-    # are still caught by _DB_CONNSTR_RE. The ONE userinfo case still redacted
-    # is the colon-less bare-token form ``scheme://TOKEN@host`` (#6396, handled
-    # by _URL_BARE_TOKEN_RE in the ``://`` block above): a bare credential in
-    # userinfo is never a round-trip workflow token (those live in the query
-    # string), so masking it can't break a skill. The ``user:pass@`` form is
-    # left to pass through per #34029.
+    # are still caught by _DB_CONNSTR_RE.

    # Form-urlencoded bodies (only triggers on clean k=v&k=v inputs).
    if "&" in text and "=" in text:
@@ -630,66 +452,6 @@ def redact_sensitive_text(
    return text


-# Commands whose stdout is an environment-variable dump (KEY=value lines),
-# NOT source code. For these, terminal-output redaction must run the
-# ENV-assignment pass (code_file=False) so opaque tokens with no recognized
-# vendor prefix (e.g. ``MY_SERVICE_TOKEN=abc123randomstring``) are still
-# masked. For all other commands, code_file=True is used to avoid mangling
-# legitimate source/config dumps (``MAX_TOKENS=100``, ``"apiKey": "x"``
-# fixtures, ``postgresql://{user}`` f-string templates). See issue #43025.
-_ENV_DUMP_COMMANDS = frozenset({"env", "printenv", "set", "export", "declare"})
-
-
-def is_env_dump_command(command: str | None) -> bool:
-    """Return True if ``command`` dumps environment variables to stdout.
-
-    Detects ``env`` / ``printenv`` / ``set`` / ``export`` / ``declare`` as the
-    first token of any segment in a pipeline or sequence (``;`` / ``&&`` /
-    ``||`` / ``|``). Conservative: a parse failure or anything unrecognized
-    returns False (callers then fall back to the safer code_file=True path,
-    which still masks prefix-shaped keys).
-    """
-    if not command or not isinstance(command, str):
-        return False
-    # Split on shell separators, then inspect the first token of each segment.
-    segments = re.split(r"[|;&]+", command)
-    for seg in segments:
-        seg = seg.strip()
-        if not seg:
-            continue
-        try:
-            tokens = shlex.split(seg)
-        except ValueError:
-            tokens = seg.split()
-        if tokens and tokens[0] in _ENV_DUMP_COMMANDS:
-            return True
-    return False
-
-
-def redact_terminal_output(
-    output: str, command: str | None = None, *, force: bool = False
-) -> str:
-    """Redact secrets from terminal/process stdout.
-
-    Single redaction policy for ALL terminal-output surfaces — foreground
-    ``terminal`` results AND background ``process(action=poll/log/wait)``
-    output — so they can't diverge. Picks ``code_file`` based on whether
-    ``command`` is an environment dump:
-
-    - env-dump command (``env``/``printenv``/``set``/``export``/``declare``)
-      → ``code_file=False`` so the ENV-assignment pass masks opaque tokens.
-    - anything else (or unknown command) → ``code_file=True`` to avoid
-      false positives on source/config dumps.
-
-    ``force=True`` bypasses the global ``security.redact_secrets`` preference
-    for safety boundaries that must never emit raw credentials.
-    """
-    if not output:
-        return output
-    code_file = not is_env_dump_command(command or "")
-    return redact_sensitive_text(output, force=force, code_file=code_file)
-
-
 # Substrings used to gate ``_PREFIX_RE`` execution. If none of these appear in
 # the input string, the prefix regex cannot match anything, so we skip it.
 # False positives are fine (they just run the regex, which then matches
--- a/agent/replay_cleanup.py
+++ b/agent/replay_cleanup.py
@@ -1,140 +0,0 @@
-"""Replay-history sanitization shared across resume code paths.
-
-When a session's last turn dies mid-tool-loop — the process is killed by a
-restart/shutdown command, a stale-timeout fires, or an interrupt lands before
-the tool result is written — the persisted transcript can end with a dangling
-``assistant(tool_calls)`` (no matching ``tool`` answer) or an interrupted
-``assistant→tool`` block.  On resume the model sees that broken tail and
-re-issues the unanswered call, producing an endless "thinking"/reboot loop
-(#49201, #29086).
-
-These pure helpers strip those tails before the history is replayed to the
-model.  They were originally local to ``gateway/run.py`` (which fixed the
-messaging-gateway path) and are extracted here so every resume surface — the
-messaging gateway AND the TUI/WebUI gateway — shares the same cleanup instead
-of the WebUI path silently skipping it.
-"""
-
-from __future__ import annotations
-
-import logging
-from typing import Any, Dict, List
-
-logger = logging.getLogger(__name__)
-
-
-def is_interrupted_tool_result(content: Any) -> bool:
-    """Return True if a tool result indicates the tool was interrupted."""
-    if not isinstance(content, str):
-        return False
-    lowered = content.lower()
-    if "[command interrupted]" in lowered:
-        return True
-    if "exit_code" in lowered and ("130" in lowered or "-1" in lowered):
-        return "interrupt" in lowered
-    return False
-
-
-def strip_interrupted_tool_tails(
-    agent_history: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """Strip interrupted assistant→tool sequences from replay history.
-
-    Older interrupted gateway turns can be followed by a queued real user
-    message, so the interrupted assistant/tool block is not necessarily the
-    final tail by the time we rebuild replay history.  Remove any contiguous
-    assistant(tool_calls) + tool-result block that contains an interrupted tool
-    result, while preserving successful tool-call sequences intact.
-    """
-    if not agent_history:
-        return agent_history
-
-    cleaned: List[Dict[str, Any]] = []
-    i = 0
-    n = len(agent_history)
-    while i < n:
-        msg = agent_history[i]
-        if msg.get("role") == "assistant" and "tool_calls" in msg:
-            j = i + 1
-            tool_results: List[Dict[str, Any]] = []
-            while j < n and agent_history[j].get("role") == "tool":
-                tool_results.append(agent_history[j])
-                j += 1
-            if tool_results and any(
-                is_interrupted_tool_result(m.get("content", ""))
-                for m in tool_results
-            ):
-                logger.debug(
-                    "Stripping interrupted assistant→tool replay block "
-                    "(indices %d–%d, tool_results=%d)",
-                    i, j - 1, len(tool_results),
-                )
-                i = j
-                continue
-        if msg.get("role") == "tool" and is_interrupted_tool_result(msg.get("content", "")):
-            logger.debug("Stripping orphan interrupted tool result from replay history")
-            i += 1
-            continue
-        cleaned.append(msg)
-        i += 1
-
-    return cleaned
-
-
-def strip_dangling_tool_call_tail(
-    agent_history: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
-
-    When a tool call itself kills the gateway process (``docker restart``,
-    ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
-    is terminated by SIGKILL *mid-call* — before the tool result is ever
-    written and before the orderly shutdown rewind
-    (``_drop_trailing_empty_response_scaffolding``) can run.  The last thing
-    persisted is the ``assistant`` message that issued the ``tool_calls``,
-    with zero matching ``tool`` rows.
-
-    On resume the model sees an unanswered tool call at the tail and naturally
-    re-issues it — which restarts the gateway again, producing the infinite
-    reboot loop in #49201.  ``strip_interrupted_tool_tails`` does not catch
-    this because there is no tool result to inspect for an interrupt marker.
-
-    This strips that dangling tail at the source so there is nothing for the
-    model to re-execute.  It only acts when the tail is an
-    ``assistant(tool_calls)`` whose calls have NO corresponding ``tool``
-    results — a completed assistant→tool pair (any tool answers present) is
-    left untouched so genuine mid-progress tool loops still resume.
-    """
-    if not agent_history:
-        return agent_history
-
-    last = agent_history[-1]
-    if not (
-        isinstance(last, dict)
-        and last.get("role") == "assistant"
-        and last.get("tool_calls")
-    ):
-        return agent_history
-
-    logger.debug(
-        "Stripping dangling unanswered assistant(tool_calls) tail "
-        "(%d call(s)) — process likely killed mid-tool-call by a "
-        "restart/shutdown command (#49201)",
-        len(last.get("tool_calls") or []),
-    )
-    return agent_history[:-1]
-
-
-def sanitize_replay_history(
-    agent_history: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """Apply both replay-tail strippers in the canonical order.
-
-    Convenience entry point for resume code paths: removes interrupted
-    assistant→tool blocks anywhere in the history, then removes a dangling
-    unanswered ``assistant(tool_calls)`` tail.  Returns the same list object
-    when there is nothing to strip.
-    """
-    if not agent_history:
-        return agent_history
-    return strip_dangling_tool_call_tail(strip_interrupted_tool_tails(agent_history))
--- a/agent/shell_hooks.py
+++ b/agent/shell_hooks.py
@@ -122,8 +122,6 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple

-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
-
 try:
    import fcntl  # POSIX only; Windows falls back to best-effort without flock.
 except ImportError:  # pragma: no cover
@@ -443,7 +441,6 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
        return result

    t0 = time.monotonic()
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        proc = subprocess.run(
            argv,
@@ -452,7 +449,6 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
            timeout=spec.timeout,
            text=True,
            shell=False,
-            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        result["timed_out"] = True
--- a/agent/skill_preprocessing.py
+++ b/agent/skill_preprocessing.py
@@ -5,8 +5,6 @@ import re
 import subprocess
 from pathlib import Path

-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
-
 logger = logging.getLogger(__name__)

 # Matches ${HERMES_SKILL_DIR} / ${HERMES_SESSION_ID} tokens in SKILL.md.
@@ -68,7 +66,6 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
    Failures return a short ``[inline-shell error: ...]`` marker instead of
    raising, so one bad snippet can't wreck the whole skill message.
    """
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        completed = subprocess.run(
            ["bash", "-c", command],
@@ -78,7 +75,6 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
            timeout=max(1, int(timeout)),
            check=False,
            stdin=subprocess.DEVNULL,
-            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"[inline-shell timeout after {timeout}s: {command}]"
--- a/agent/skill_utils.py
+++ b/agent/skill_utils.py
@@ -507,34 +507,6 @@ def get_all_skills_dirs() -> List[Path]:
    return dirs


-def _resolve_for_skill_ownership(path) -> Path:
-    path_obj = path if isinstance(path, Path) else Path(str(path))
-    try:
-        return path_obj.expanduser().resolve()
-    except (OSError, RuntimeError):
-        return path_obj.expanduser().absolute()
-
-
-def is_external_skill_path(path) -> bool:
-    """Return True when ``path`` lives under a configured external skills dir.
-
-    ``skills.external_dirs`` are externally owned: Hermes can discover and view
-    their skills, and foreground user-directed tool calls may still edit them,
-    but autonomous lifecycle maintenance must treat them as read-only. This
-    helper centralizes the ownership boundary so curator/reporting/tool paths do
-    not each need to re-interpret the config.
-    """
-    candidate = _resolve_for_skill_ownership(path)
-    for root in get_external_skills_dirs():
-        resolved_root = _resolve_for_skill_ownership(root)
-        try:
-            candidate.relative_to(resolved_root)
-            return True
-        except ValueError:
-            continue
-    return False
-
-
 # ── Condition extraction ──────────────────────────────────────────────────


--- a/agent/thinking_timeout_guidance.py
+++ b/agent/thinking_timeout_guidance.py
@@ -1,136 +0,0 @@
-"""Thinking-timeout detection and user-facing guidance for reasoning models.
-
-When a known reasoning model (NVIDIA Nemotron 3 Ultra, OpenAI o1/o3,
-Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ, xAI Grok reasoning)
-hits a transport-layer error before the first content token arrives, the
-upstream proxy has almost certainly idle-killed a long thinking stream —
-not a true context overflow or a configuration error.  The user needs
-distinct guidance for this case:
-
-    "The model's thinking phase exceeded the upstream proxy's idle
-     timeout before the first content token arrived.  This is a known
-     issue with reasoning models behind cloud gateways (NVIDIA NIM,
-     OpenAI, Anthropic, DeepSeek).  Workarounds in priority order:
-     1. Set `providers.<provider>.models.<model>.stale_timeout_seconds: 900`
-        in `~/.hermes/config.yaml` to extend the per-call timeout...
-     2. Lower `reasoning_budget` or set `reasoning_effort: medium`...
-     3. Use a smaller / faster reasoning model..."
-
-The existing `_is_stream_drop` guidance at
-``agent/conversation_loop.py:3464-3486`` fires for large-file-write
-stream drops ("try execute_code with Python's open() for large files")
-which is the WRONG advice for the thinking-timeout case.  This module
-provides the detection and the message as standalone helpers so the
-detection logic is unit-testable without driving the full retry loop,
-and the message text can be regression-tested for spelling and accuracy.
-
-Part 2 of Fixes #52310.
-"""
-
-from __future__ import annotations
-
-from typing import Optional
-
-
-# Substring set that identifies a transport-layer failure on the
-# response stream.  Same shape as the existing
-# ``_SERVER_DISCONNECT_PATTERNS`` in ``agent/error_classifier.py:394``
-# but extended to also catch the OSS-level error signature
-# (``broken pipe`` / ``errno 32``) that the upstream kill surfaces
-# to the OpenAI SDK wrapper.
-_THINKING_TIMEOUT_SUBSTRINGS: tuple[str, ...] = (
-    "broken pipe",
-    "errno 32",
-    "remote protocol",
-    "connection reset",
-    "connection lost",
-    "peer closed",
-    "server disconnected",
-)
-
-
-def is_thinking_timeout(classified: object, model: str, error_msg: str) -> bool:
-    """Return True when a reasoning model's thinking phase hit a transport kill.
-
-    Args:
-        classified: a :class:`agent.error_classifier.ClassifiedError` instance
-            (duck-typed here to avoid an import cycle in unit tests).
-        model: the model slug at failure time (e.g.
-            ``"nvidia/nemotron-3-ultra-550b-a55b"``).
-        error_msg: lowercased string representation of the underlying
-            exception (typically ``str(api_error).lower()``).
-
-    Returns True when ALL conditions hold:
-        1. ``classified.reason == FailoverReason.timeout`` (the classifier
-           override at ``agent/error_classifier.py:720-738`` ensures this
-           is the case for reasoning models even on large sessions).
-        2. ``api_error`` has no ``.status_code`` attribute set (transport
-           disconnect, not an HTTP error).
-        3. ``model`` is in the reasoning-model allowlist (reuses
-           ``agent.reasoning_timeouts.get_reasoning_stale_timeout_floor``).
-        4. ``error_msg`` contains one of the transport-kill substrings.
-
-    Non-reasoning models always return False.  Non-transport errors
-    (billing / rate_limit / auth / context_overflow / format_error)
-    always return False.  HTTP-status errors always return False.
-    """
-    # Import here (not at module top) to keep this helper cheap to
-    # import even from callers that don't need it.  ``agent.reasoning_timeouts``
-    # is small and dependency-free.
-    from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
-
-    # Condition 1: classifier says timeout.  Use a string/value check
-    # rather than importing FailoverReason so this module has zero
-    # import cycles from the error_classifier package.
-    reason = getattr(classified, "reason", None)
-    reason_value = getattr(reason, "value", None)
-    if reason_value != "timeout":
-        return False
-
-    # Condition 2: no HTTP status code (transport, not API error).
-    # Caller is expected to gate on ``getattr(api_error, "status_code", None) is None``
-    # before calling this helper; the surface here is just the post-gate
-    # boolean so the caller can pass an already-prepped error_msg.
-
-    # Condition 3: reasoning model allowlist.
-    if get_reasoning_stale_timeout_floor(model) is None:
-        return False
-
-    # Condition 4: transport-kill substring in the error message.
-    error_msg_lower = (error_msg or "").lower()
-    return any(p in error_msg_lower for p in _THINKING_TIMEOUT_SUBSTRINGS)
-
-
-def build_thinking_timeout_guidance(
-    provider: str, model: str, model_label: Optional[str] = None,
-) -> str:
-    """Return the user-facing guidance string appended to ``_final_response``.
-
-    Args:
-        provider: provider slug (e.g. ``"nvidia"``, ``"openai"``).
-        model: bare model slug the user would put in their config
-            (e.g. ``"nemotron-3-ultra-550b-a55b"`` if the user uses
-            NVIDIA direct, or the full ``"nvidia/nemotron-3-ultra-550b-a55b"``
-            if they go through an aggregator).  Used verbatim in the
-            config snippet so the user can copy-paste.
-        model_label: optional short label for the model name in the
-            prose (e.g. ``"Nemotron 3 Ultra"``).  Falls back to the
-            slug if not provided.
-    """
-    label = model_label or model
-    return (
-        "\n\nThe model's thinking phase exceeded the upstream proxy's "
-        "idle timeout before the first content token arrived. This is a "
-        f"known issue with reasoning models (like {label}) behind cloud "
-        "gateways (NVIDIA NIM, OpenAI, Anthropic, DeepSeek). Workarounds "
-        "in priority order:\n"
-        f"1. Set `providers.{provider}.models.{model}.stale_timeout_seconds: 900` "
-        "in `~/.hermes/config.yaml` to extend the per-call timeout. "
-        "(Hermes's built-in floor is 600s for known reasoning models — "
-        "if you still see this after raising, the upstream cap is even "
-        "shorter.)\n"
-        "2. Lower `reasoning_budget` or set `reasoning_effort: medium` on this "
-        "model if the provider supports it.\n"
-        "3. Use a smaller / faster reasoning model if the task doesn't "
-        "require deep thinking."
-    )
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@@ -26,7 +26,6 @@ from agent.display import (
    build_tool_preview as _build_tool_preview,
    get_cute_tool_message as _get_cute_tool_message_impl,
    get_tool_emoji as _get_tool_emoji,
-    redact_tool_args_for_display as _redact_tool_args_for_display,
    _detect_tool_failure,
 )
 from agent.tool_guardrails import ToolGuardrailDecision
@@ -470,11 +469,10 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
    if not agent.quiet_mode and getattr(agent, "tool_progress_mode", "all") != "off":
        print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
        for i, (tc, name, args, middleware_trace, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
-            display_args = _redact_tool_args_for_display(name, args) or args
-            args_str = json.dumps(display_args, ensure_ascii=False)
+            args_str = json.dumps(args, ensure_ascii=False)
            if agent.verbose_logging:
-                print(f"  📞 Tool {i}: {name}({list(display_args.keys())})")
-                print(agent._wrap_verbose("Args: ", json.dumps(display_args, indent=2, ensure_ascii=False)))
+                print(f"  📞 Tool {i}: {name}({list(args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
@@ -484,9 +482,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            continue
        if agent.tool_progress_callback:
            try:
-                display_args = _redact_tool_args_for_display(name, args) or args
-                preview = _build_tool_preview(name, display_args)
-                agent.tool_progress_callback("tool.started", name, preview, display_args)
+                preview = _build_tool_preview(name, args)
+                agent.tool_progress_callback("tool.started", name, preview, args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

@@ -495,8 +492,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            continue
        if agent.tool_start_callback:
            try:
-                display_args = _redact_tool_args_for_display(name, args) or args
-                agent.tool_start_callback(tc.id, name, display_args)
+                agent.tool_start_callback(tc.id, name, args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

@@ -796,8 +792,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe

        if not blocked and agent.tool_complete_callback:
            try:
-                display_args = _redact_tool_args_for_display(name, args) or args
-                agent.tool_complete_callback(tc.id, name, display_args, function_result)
+                agent.tool_complete_callback(tc.id, name, args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

@@ -959,11 +954,10 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            agent._iters_since_skill = 0

        if not agent.quiet_mode and getattr(agent, "tool_progress_mode", "all") != "off":
-            display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-            args_str = json.dumps(display_args, ensure_ascii=False)
+            args_str = json.dumps(function_args, ensure_ascii=False)
            if agent.verbose_logging:
-                print(f"  📞 Tool {i}: {function_name}({list(display_args.keys())})")
-                print(agent._wrap_verbose("Args: ", json.dumps(display_args, indent=2, ensure_ascii=False)))
+                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
@@ -984,16 +978,14 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe

        if not _execution_blocked and agent.tool_progress_callback:
            try:
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args)
-                agent.tool_progress_callback("tool.started", function_name, preview, display_args)
+                preview = _build_tool_preview(function_name, function_args)
+                agent.tool_progress_callback("tool.started", function_name, preview, function_args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

        if not _execution_blocked and agent.tool_start_callback:
            try:
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                agent.tool_start_callback(tool_call.id, function_name, display_args)
+                agent.tool_start_callback(tool_call.id, function_name, function_args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

@@ -1223,8 +1215,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args) or function_name
+                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _ce_result = None
@@ -1257,8 +1248,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args) or function_name
+                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _mem_result = None
@@ -1289,8 +1279,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args) or function_name
+                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _spinner_result = None
@@ -1452,8 +1441,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe

        if not _execution_blocked and agent.tool_complete_callback:
            try:
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                agent.tool_complete_callback(tool_call.id, function_name, display_args, function_result)
+                agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

--- a/agent/transports/codex_event_projector.py
+++ b/agent/transports/codex_event_projector.py
@@ -217,7 +217,9 @@ class CodexEventProjector:
    def _project_mcp_tool_call(self, item: dict, item_id: str) -> ProjectionResult:
        server = item.get("server") or "mcp"
        tool = item.get("tool") or "unknown"
-        call_id = _deterministic_call_id(f"mcp_{server}_{tool}", item_id)
+        # Mirror the native MCP tool-name convention (mcp__server__tool) so the
+        # deterministic call_id input stays consistent with registration names.
+        call_id = _deterministic_call_id(f"mcp__{server}__{tool}", item_id)
        args = item.get("arguments") or {}
        if not isinstance(args, dict):
            args = {"arguments": args}
--- a/agent/turn_context.py
+++ b/agent/turn_context.py
@@ -28,7 +28,6 @@ import uuid
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional

-from agent.conversation_compression import conversation_history_after_compression
 from agent.iteration_budget import IterationBudget
 from agent.model_metadata import (
    estimate_messages_tokens_rough,
@@ -401,9 +400,7 @@ def build_turn_context(
                    _orig_len, len(messages), _orig_tokens, _preflight_tokens
                ):
                    break  # Cannot compress further: neither rows nor tokens moved
-                conversation_history = conversation_history_after_compression(
-                    agent, messages
-                )
+                conversation_history = None
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
                agent._last_content_with_tools = None
--- a/agent/turn_finalizer.py
+++ b/agent/turn_finalizer.py
@@ -289,14 +289,7 @@ def finalize_turn(
                    and len(_stripped) <= 24
                    and _stripped[-1:] not in {".", "!", "?", "。", "！", "？", "`", ")"}
                )
-                _is_partial_stream_recovery = (
-                    str(_turn_exit_reason) == "partial_stream_recovery"
-                )
-                if (
-                    _is_empty_terminal
-                    or _is_partial_fragment
-                    or _is_partial_stream_recovery
-                ):
+                if _is_empty_terminal or _is_partial_fragment:
                    _explanation = agent._format_turn_completion_explanation(
                        _turn_exit_reason
                    )
--- a/agent/turn_retry_state.py
+++ b/agent/turn_retry_state.py
@@ -67,11 +67,6 @@ class TurnRetryState:
    # ── Restart signals (read by the outer loop after the attempt) ───────
    restart_with_compressed_messages: bool = False
    restart_with_length_continuation: bool = False
-    # Set when a content-filter stream stall (e.g. MiniMax "new_sensitive")
-    # has been escalated to the fallback chain: the partial-stream content
-    # was rolled back off ``messages`` and the loop should re-issue the API
-    # call against the newly-activated provider (#32421).
-    restart_with_rebuilt_messages: bool = False

    def __iter__(self):
        # Convenience for debugging / tests: iterate (name, value) pairs.
--- a/agent/verification_stop.py
+++ b/agent/verification_stop.py
@@ -15,135 +15,9 @@ from typing import Any, Iterable

 _MAX_CHANGED_PATHS_IN_NUDGE = 8

-# Non-code file extensions whose edits carry no verifiable runtime behavior:
-# documentation, prose, and data/markup that no test/build exercises. When a
-# turn touches ONLY these, verify-on-stop has nothing to check, so the nudge is
-# suppressed (this is fix "C" for the doc/markdown/skill false-positive — a
-# SKILL.md or README edit must never demand a /tmp verification script). A turn
-# that edits any non-listed path (a real source/code/config file) still nudges.
-_NON_CODE_VERIFY_EXTENSIONS = frozenset(
-    {
-        ".md",
-        ".markdown",
-        ".mdx",
-        ".rst",
-        ".txt",
-        ".text",
-        ".adoc",
-        ".asciidoc",
-        ".org",
-        ".log",
-        ".csv",
-        ".tsv",
-    }
-)
-
-# Filenames (case-insensitive, extension-less or otherwise) that are pure prose
-# even without a recognized doc extension.
-_NON_CODE_VERIFY_FILENAMES = frozenset(
-    {
-        "license",
-        "licence",
-        "notice",
-        "authors",
-        "contributors",
-        "changelog",
-        "codeowners",
-    }
-)
-
-
-def _is_non_code_path(raw: str) -> bool:
-    """Return True when a changed path is documentation/prose with nothing to verify."""
-    try:
-        p = Path(str(raw))
-    except Exception:
-        return False
-    suffix = p.suffix.lower()
-    if suffix in _NON_CODE_VERIFY_EXTENSIONS:
-        return True
-    if not suffix and p.name.lower() in _NON_CODE_VERIFY_FILENAMES:
-        return True
-    return False
-
-
-def _filter_verifiable_paths(paths: Iterable[str]) -> list[str]:
-    """Drop documentation/prose paths; keep paths that could have verifiable behavior."""
-    return [p for p in paths if p and not _is_non_code_path(p)]
-
-
-# Session identities (platform or source) that are NOT human conversational
-# messaging surfaces: interactive coding surfaces (CLI, TUI, desktop, codex,
-# local, gateway) and programmatic callers (API server, webhooks, tools).
-# Verify-on-stop stays ON by default for these. Any other resolved gateway
-# platform is a conversational messaging surface (Telegram, Discord, WhatsApp,
-# Signal, Slack, etc.) where the verification narrative would reach a human as
-# chat noise, so it defaults OFF. Mirrors LOCAL_SESSION_SOURCE_IDS in
-# apps/desktop/src/lib/session-source.ts; keep roughly in sync when adding a
-# local or programmatic surface. Default-deny by design: an unrecognized
-# identity is treated as messaging (OFF) so a new chat platform never leaks the
-# verification receipt before this set is updated.
-_NON_MESSAGING_SESSION_SURFACES = frozenset(
-    {
-        "",
-        "cli",
-        "codex",
-        "desktop",
-        "gateway",
-        "local",
-        "tui",
-        "tool",
-        "api_server",
-        "webhook",
-        "msgraph_webhook",
-    }
-)
-
-
-def _session_is_messaging_surface() -> bool:
-    """Return whether this turn is delivered over a human messaging channel.
-
-    The gateway binds the platform value (e.g. ``telegram``) to
-    ``HERMES_SESSION_PLATFORM``; the CLI and TUI set ``HERMES_SESSION_SOURCE``
-    (e.g. ``cli``, ``tui``) instead. Both are consulted via the session-context
-    helper (with an ``os.environ`` fallback), alongside the ``HERMES_PLATFORM``
-    override, matching the sibling platform resolution in
-    ``agent/skill_commands.py`` and ``agent/prompt_builder.py``. A turn is a
-    messaging surface when a resolved identity is present and is not a known
-    non-messaging surface.
-    """
-    try:
-        from gateway.session_context import get_session_env
-
-        platform = (
-            os.getenv("HERMES_PLATFORM")
-            or get_session_env("HERMES_SESSION_PLATFORM", "")
-        )
-        source = get_session_env("HERMES_SESSION_SOURCE", "")
-    except Exception:
-        platform = os.getenv("HERMES_PLATFORM", "") or os.environ.get(
-            "HERMES_SESSION_PLATFORM", ""
-        )
-        source = os.environ.get("HERMES_SESSION_SOURCE", "")
-    for identity in (platform, source):
-        identity = str(identity or "").strip().lower()
-        if identity and identity not in _NON_MESSAGING_SESSION_SURFACES:
-            return True
-    return False
-

 def verify_on_stop_enabled(config: dict[str, Any] | None = None) -> bool:
-    """Return whether edit -> verify-before-finish behavior is enabled.
-
-    Precedence: an explicit ``HERMES_VERIFY_ON_STOP`` env var wins, then an
-    explicit ``agent.verify_on_stop`` config value. The config default is
-    ``False`` (see ``DEFAULT_CONFIG``) — verify-on-stop is OFF unless the user
-    opts in. The legacy ``"auto"`` sentinel is still honored for anyone who
-    sets it explicitly: it resolves to ON for interactive coding surfaces
-    (CLI, TUI, desktop) and programmatic callers, and OFF for conversational
-    messaging surfaces (Telegram, Discord, etc.). A missing/unknown value
-    falls back to OFF.
-    """
+    """Return whether edit -> verify-before-finish behavior is enabled."""
    env = os.environ.get("HERMES_VERIFY_ON_STOP")
    if env is not None:
        return env.strip().lower() not in {"0", "false", "no", "off"}
@@ -155,20 +29,9 @@ def verify_on_stop_enabled(config: dict[str, Any] | None = None) -> bool:
        except Exception:
            config = {}
    agent_cfg = (config or {}).get("agent") if isinstance(config, dict) else None
-    cfg_val = agent_cfg.get("verify_on_stop") if isinstance(agent_cfg, dict) else None
-    if isinstance(cfg_val, bool):
-        return cfg_val
-    if isinstance(cfg_val, str):
-        token = cfg_val.strip().lower()
-        if token in {"1", "true", "yes", "on"}:
-            return True
-        if token in {"0", "false", "no", "off"}:
-            return False
-        if token == "auto":
-            # Explicit opt-in to the legacy surface-aware behavior.
-            return not _session_is_messaging_surface()
-    # Missing or unknown value -> OFF (the new default).
-    return False
+    if isinstance(agent_cfg, dict) and "verify_on_stop" in agent_cfg:
+        return bool(agent_cfg.get("verify_on_stop"))
+    return True


 def _candidate_cwds(paths: Iterable[str]) -> list[Path]:
@@ -251,10 +114,7 @@ def build_verify_on_stop_nudge(
    max_attempts: int = 2,
 ) -> str | None:
    """Return a synthetic follow-up when edited code lacks fresh verification."""
-    # Drop documentation/prose paths (markdown, skills, README, LICENSE, ...) —
-    # they carry no verifiable behavior, so a turn that touched only those has
-    # nothing to verify and must not nudge.
-    paths = sorted({str(p) for p in _filter_verifiable_paths(changed_paths)})
+    paths = sorted({str(p) for p in changed_paths if p})
    if not paths or attempts >= max_attempts:
        return None

--- a/apps/desktop/README.md
+++ b/apps/desktop/README.md
@@ -85,7 +85,7 @@ Installers are built and uploaded to GitHub Releases manually. macOS/Windows sig

 ### How it works

-The packaged app ships the Electron shell and a native React chat surface. On first launch it can install the Hermes Agent runtime into `HERMES_HOME` (`~/.hermes`, or `%LOCALAPPDATA%\hermes` on Windows) — the **same layout a CLI install uses**, so the two are interchangeable. Backend resolution first honours `HERMES_DESKTOP_HERMES_ROOT`, then a completed managed install, then a probed `hermes` on `PATH` (unless `HERMES_DESKTOP_IGNORE_EXISTING=1` is set), and finally an explicit `HERMES_DESKTOP_HERMES` command override for packagers/troubleshooting. The renderer (React, in `src/`) talks to a headless backend the app launches for you — a `hermes serve` process that serves the `tui_gateway` JSON-RPC/WebSocket API — through the framework-agnostic client in [`apps/shared`](../shared/) (the same client the web dashboard consumes), and reuses the agent runtime rather than embedding `hermes --tui`. The app is **self-contained**: it runs its own `hermes serve` backend and never opens or requires the web dashboard UI. (For backward compatibility, a runtime that predates the `serve` command automatically falls back to a headless `dashboard --no-open` — see `electron/backend-command.cjs` — so mid-upgrade installs never break.) The install, backend-resolution, and self-update logic all live in `electron/main.cjs`.
+The packaged app ships the Electron shell and a native React chat surface. On first launch it can install the Hermes Agent runtime into `HERMES_HOME` (`~/.hermes`, or `%LOCALAPPDATA%\hermes` on Windows) — the **same layout a CLI install uses**, so the two are interchangeable. Backend resolution first honours `HERMES_DESKTOP_HERMES_ROOT`, then a completed managed install, then a probed `hermes` on `PATH` (unless `HERMES_DESKTOP_IGNORE_EXISTING=1` is set), and finally an explicit `HERMES_DESKTOP_HERMES` command override for packagers/troubleshooting. The renderer (React, in `src/`) talks to a `hermes dashboard` backend over the `tui_gateway`/dashboard APIs and reuses the agent runtime rather than embedding `hermes --tui`. The install, backend-resolution, and self-update logic all live in `electron/main.cjs`.

 ### Verification

--- a/apps/desktop/electron/backend-command.cjs
+++ b/apps/desktop/electron/backend-command.cjs
@@ -1,51 +0,0 @@
-'use strict'
-
-// Backend subcommand routing for the desktop-managed Hermes process.
-//
-// The desktop app launches its own headless backend via `hermes serve` — it
-// must NEVER depend on or launch the browser `dashboard`. But `serve` is a
-// newer subcommand: a runtime that predates it (an older managed install the
-// app hasn't updated yet, or an older `hermes` resolved from PATH) only knows
-// `dashboard --no-open`. To avoid bricking those users mid-upgrade we detect
-// whether the resolved runtime understands `serve` and, only when it does not,
-// fall back to the legacy `dashboard --no-open` invocation. Both produce the
-// exact same headless gateway; `serve` is just the decoupled name.
-//
-// These helpers are pure so they can be unit-tested without Electron.
-
-/**
- * Build the canonical headless backend argv (always `serve`).
- * @param {string} [profile] optional Hermes profile to pin via `--profile`.
- */
-function serveBackendArgs(profile) {
-  const head = profile ? ['--profile', profile] : []
-  return [...head, 'serve', '--host', '127.0.0.1', '--port', '0']
-}
-
-/**
- * Rewrite a resolved backend argv from `serve` to the legacy
- * `dashboard --no-open` form, preserving every other argument (incl. a leading
- * `-m hermes_cli.main` and any `--profile <name>`). Returns a copy; if there is
- * no `serve` token the argv is returned unchanged.
- */
-function dashboardFallbackArgs(args) {
-  const i = args.indexOf('serve')
-  if (i === -1) return args.slice()
-  return [...args.slice(0, i), 'dashboard', '--no-open', ...args.slice(i + 1)]
-}
-
-/**
- * True when a runtime's `hermes_cli/subcommands/dashboard.py` source registers
- * the `serve` subcommand. Matches `add_parser("serve"` / `add_parser('serve'`
- * specifically so the substring "server" (e.g. "start_server", "web server")
- * never produces a false positive.
- */
-function sourceDeclaresServe(dashboardPySource) {
-  return /add_parser\(\s*["']serve["']/.test(String(dashboardPySource || ''))
-}
-
-module.exports = {
-  serveBackendArgs,
-  dashboardFallbackArgs,
-  sourceDeclaresServe,
-}
--- a/apps/desktop/electron/backend-command.test.cjs
+++ b/apps/desktop/electron/backend-command.test.cjs
@@ -1,83 +0,0 @@
-'use strict'
-
-const test = require('node:test')
-const assert = require('node:assert/strict')
-
-const {
-  serveBackendArgs,
-  dashboardFallbackArgs,
-  sourceDeclaresServe,
-} = require('./backend-command.cjs')
-
-test('serveBackendArgs builds a headless serve invocation', () => {
-  assert.deepEqual(serveBackendArgs(), [
-    'serve',
-    '--host',
-    '127.0.0.1',
-    '--port',
-    '0',
-  ])
-})
-
-test('serveBackendArgs pins a profile when provided', () => {
-  assert.deepEqual(serveBackendArgs('worker'), [
-    '--profile',
-    'worker',
-    'serve',
-    '--host',
-    '127.0.0.1',
-    '--port',
-    '0',
-  ])
-})
-
-test('dashboardFallbackArgs rewrites serve -> dashboard --no-open, keeping the -m prefix', () => {
-  const serve = ['-m', 'hermes_cli.main', 'serve', '--host', '127.0.0.1', '--port', '0']
-  assert.deepEqual(dashboardFallbackArgs(serve), [
-    '-m',
-    'hermes_cli.main',
-    'dashboard',
-    '--no-open',
-    '--host',
-    '127.0.0.1',
-    '--port',
-    '0',
-  ])
-})
-
-test('dashboardFallbackArgs preserves a --profile flag ahead of serve', () => {
-  const serve = ['-m', 'hermes_cli.main', '--profile', 'worker', 'serve', '--host', '127.0.0.1', '--port', '0']
-  assert.deepEqual(dashboardFallbackArgs(serve), [
-    '-m',
-    'hermes_cli.main',
-    '--profile',
-    'worker',
-    'dashboard',
-    '--no-open',
-    '--host',
-    '127.0.0.1',
-    '--port',
-    '0',
-  ])
-})
-
-test('dashboardFallbackArgs is a no-op (copy) when there is no serve token', () => {
-  const args = ['-m', 'hermes_cli.main', 'dashboard', '--no-open']
-  const out = dashboardFallbackArgs(args)
-  assert.deepEqual(out, args)
-  assert.notEqual(out, args, 'should return a copy, not the same reference')
-})
-
-test('sourceDeclaresServe detects the serve subparser registration', () => {
-  assert.equal(sourceDeclaresServe('subparsers.add_parser("serve", help="...")'), true)
-  assert.equal(sourceDeclaresServe("subparsers.add_parser('serve')"), true)
-  assert.equal(sourceDeclaresServe('subparsers.add_parser(\n        "serve",\n)'), true)
-})
-
-test('sourceDeclaresServe does not false-positive on the substring "server"', () => {
-  const oldSource = `
-    dashboard_parser = subparsers.add_parser("dashboard", help="Start the web UI dashboard")
-    from hermes_cli.web_server import start_server  # web server
-  `
-  assert.equal(sourceDeclaresServe(oldSource), false)
-})
--- a/apps/desktop/electron/backend-env.cjs
+++ b/apps/desktop/electron/backend-env.cjs
@@ -61,7 +61,10 @@ function buildDesktopBackendPath({
  const venvBin = venvRoot ? pathModule.join(venvRoot, platform === 'win32' ? 'Scripts' : 'bin') : null
  const saneEntries = platform === 'win32' ? [] : POSIX_SANE_PATH_ENTRIES

-  return appendUniquePathEntries([hermesNodeBin, venvBin, currentPath, saneEntries], { delimiter })
+  return appendUniquePathEntries(
+    [hermesNodeBin, venvBin, currentPath, saneEntries],
+    { delimiter }
+  )
 }

 function normalizeHermesHomeRoot(hermesHome, { pathModule = pathModuleForPlatform(process.platform) } = {}) {
--- a/apps/desktop/electron/backend-env.test.cjs
+++ b/apps/desktop/electron/backend-env.test.cjs
@@ -76,7 +76,10 @@ test('normalizeHermesHomeRoot maps profile homes back to the global Hermes root'
    normalizeHermesHomeRoot('C:\\Users\\test\\AppData\\Local\\hermes\\profiles\\oracle', { pathModule: path.win32 }),
    'C:\\Users\\test\\AppData\\Local\\hermes'
  )
-  assert.equal(normalizeHermesHomeRoot('/Users/test/.hermes', { pathModule: path.posix }), '/Users/test/.hermes')
+  assert.equal(
+    normalizeHermesHomeRoot('/Users/test/.hermes', { pathModule: path.posix }),
+    '/Users/test/.hermes'
+  )
 })

 test('Windows PATH casing and delimiter are preserved without POSIX sane entries', () => {
@@ -101,5 +104,8 @@ test('Windows PATH casing and delimiter are preserved without POSIX sane entries
 })

 test('appendUniquePathEntries drops empty entries and keeps first occurrence', () => {
-  assert.equal(appendUniquePathEntries([':/a::/b', ['/a', '/c']], { delimiter: ':' }), '/a:/b:/c')
+  assert.equal(
+    appendUniquePathEntries([':/a::/b', ['/a', '/c']], { delimiter: ':' }),
+    '/a:/b:/c'
+  )
 })
--- a/apps/desktop/electron/backend-probes.cjs
+++ b/apps/desktop/electron/backend-probes.cjs
@@ -37,18 +37,7 @@ const { execFileSync } = require('node:child_process')
 const PROBE_TIMEOUT_MS = 5000

 /**
- * Return the Python snippet used to verify Hermes can import far enough to
- * launch the CLI. Kept exported for tests so dependency regressions are
- * caught without needing a real broken venv fixture.
- *
- * @returns {string}
- */
-function hermesRuntimeImportProbe() {
-  return 'import yaml; import hermes_cli.config'
-}
-
-/**
- * Return true iff the Hermes runtime import probe exits 0.
+ * Return true iff `python -c "import hermes_cli"` exits 0.
 *
 * Used to gate the "fallback to system Python with hermes_cli installed"
 * rung of resolveHermesBackend. Without this, a system Python 3.11-3.13
@@ -57,20 +46,13 @@ function hermesRuntimeImportProbe() {
 * site-packages -- and the resolver returns a backend that immediately
 * dies on spawn.
 *
- * The probe intentionally imports hermes_cli.config, not just the top-level
- * package: a broken/empty Windows launcher venv can still see the source tree
- * through PYTHONPATH but lack PyYAML, then die on the first real CLI import.
- *
 * @param {string} pythonPath - Absolute path to a python.exe / python.
- * @param {object} [opts]
- * @param {object} [opts.env] - Additional environment for the probe.
 * @returns {boolean}
 */
-function canImportHermesCli(pythonPath, opts = {}) {
+function canImportHermesCli(pythonPath) {
  if (!pythonPath) return false
  try {
-    execFileSync(pythonPath, ['-c', hermesRuntimeImportProbe()], {
-      env: { ...process.env, ...(opts.env || {}) },
+    execFileSync(pythonPath, ['-c', 'import hermes_cli'], {
      stdio: 'ignore',
      timeout: PROBE_TIMEOUT_MS,
      windowsHide: true
@@ -119,7 +101,6 @@ function verifyHermesCli(hermesCommand, opts = {}) {

 module.exports = {
  canImportHermesCli,
-  hermesRuntimeImportProbe,
  verifyHermesCli,
  PROBE_TIMEOUT_MS
 }
--- a/apps/desktop/electron/backend-probes.test.cjs
+++ b/apps/desktop/electron/backend-probes.test.cjs
@@ -11,7 +11,7 @@ const fs = require('node:fs')
 const os = require('node:os')
 const path = require('node:path')

-const { canImportHermesCli, hermesRuntimeImportProbe, verifyHermesCli } = require('./backend-probes.cjs')
+const { canImportHermesCli, verifyHermesCli } = require('./backend-probes.cjs')

 // Resolve the host's own Node binary -- guaranteed to be on disk and
 // runnable. We use it as both a stand-in for "a python that doesn't
@@ -40,12 +40,6 @@ test('canImportHermesCli returns false when binary does not exist', () => {
  assert.equal(canImportHermesCli(ghost), false)
 })

-test('hermes runtime import probe checks config dependencies', () => {
-  const probe = hermesRuntimeImportProbe()
-  assert.match(probe, /\bimport yaml\b/)
-  assert.match(probe, /\bimport hermes_cli\.config\b/)
-})
-
 test('verifyHermesCli returns false when command is falsy', () => {
  assert.equal(verifyHermesCli(''), false)
  assert.equal(verifyHermesCli(null), false)
--- a/apps/desktop/electron/backend-ready.cjs
+++ b/apps/desktop/electron/backend-ready.cjs
@@ -167,5 +167,5 @@ module.exports = {
  readDashboardReadyFile,
  resolvePortAnnounceTimeoutMs,
  DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
-  MIN_PORT_ANNOUNCE_TIMEOUT_MS
+  MIN_PORT_ANNOUNCE_TIMEOUT_MS,
 }
--- a/apps/desktop/electron/backend-ready.test.cjs
+++ b/apps/desktop/electron/backend-ready.test.cjs
@@ -25,7 +25,7 @@ const {
  waitForDashboardReadyFile,
  resolvePortAnnounceTimeoutMs,
  DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
-  MIN_PORT_ANNOUNCE_TIMEOUT_MS
+  MIN_PORT_ANNOUNCE_TIMEOUT_MS,
 } = require('./backend-ready.cjs')

 // A minimal stand-in for a spawned child process: an EventEmitter with a
--- a/apps/desktop/electron/bootstrap-runner.cjs
+++ b/apps/desktop/electron/bootstrap-runner.cjs
@@ -179,13 +179,7 @@ function downloadInstallScript(commit, destPath) {
  })
 }

-async function resolveInstallScript({
-  installStamp,
-  sourceRepoRoot,
-  hermesHome,
-  emit,
-  _download = downloadInstallScript
-}) {
+async function resolveInstallScript({ installStamp, sourceRepoRoot, hermesHome, emit, _download = downloadInstallScript }) {
  // 1. Dev shortcut: prefer a local checkout's installer so we can iterate
  //    without pushing. SOURCE_REPO_ROOT comes from main.cjs (path.resolve
  //    of APP_ROOT/../..).
@@ -299,19 +293,15 @@ function spawnPowerShell(scriptPath, args, { emit, stageName, abortSignal, herme
    const ps = process.platform === 'win32' ? resolveWindowsPowerShell() : 'pwsh'
    const fullArgs = ['-NoProfile', '-ExecutionPolicy', 'Bypass', '-File', scriptPath, ...args]

-    const child = spawn(
-      ps,
-      fullArgs,
-      hiddenWindowsChildOptions({
-        stdio: ['ignore', 'pipe', 'pipe'],
-        env: {
-          ...process.env,
-          // Pass HERMES_HOME through so install.ps1 respects the caller's
-          // choice rather than re-computing the default.
-          HERMES_HOME: hermesHome || process.env.HERMES_HOME || ''
-        }
-      })
-    )
+    const child = spawn(ps, fullArgs, hiddenWindowsChildOptions({
+      stdio: ['ignore', 'pipe', 'pipe'],
+      env: {
+        ...process.env,
+        // Pass HERMES_HOME through so install.ps1 respects the caller's
+        // choice rather than re-computing the default.
+        HERMES_HOME: hermesHome || process.env.HERMES_HOME || ''
+      }
+    }))

    let stdout = ''
    let stderr = ''
--- a/apps/desktop/electron/connection-config.cjs
+++ b/apps/desktop/electron/connection-config.cjs
@@ -37,20 +37,6 @@
 const AT_COOKIE_VARIANTS = ['__Host-hermes_session_at', '__Secure-hermes_session_at', 'hermes_session_at']
 const RT_COOKIE_VARIANTS = ['__Host-hermes_session_rt', '__Secure-hermes_session_rt', 'hermes_session_rt']

-// The Nous portal (NAS) does NOT use Hermes gateway session cookies — it is a
-// Privy-authed Next.js app. NAS `auth()` (src/server/auth/session.ts) reads the
-// `privy-token` access-token cookie (with `privy-id-token` alongside), which is
-// also exactly what the `/api/agents` cookie-auth path validates. So portal
-// sign-in / discovery liveness must look for the Privy cookie, NOT the gateway
-// cookies above. `privy-token` is the access token (the required signal);
-// variants cover the secured-prefix forms and the older `privy-session` name.
-const PRIVY_SESSION_COOKIE_VARIANTS = [
-  '__Host-privy-token',
-  '__Secure-privy-token',
-  'privy-token',
-  'privy-session'
-]
-
 function normalizeRemoteBaseUrl(rawUrl) {
  const value = String(rawUrl || '').trim()

@@ -156,30 +142,19 @@ function normAuthMode(mode) {
  return mode === 'oauth' ? 'oauth' : 'token'
 }

-// True for connection modes that resolve to a REMOTE backend. 'cloud' is a
-// Hermes Cloud connection (cloud-auto-discovery Q3/Q6): it carries a
-// remote-shaped block and reuses the entire remote connect/probe/reconnect
-// path, so every resolution site treats it exactly like 'remote'. The only
-// places that distinguish cloud from remote are the settings UI (which card to
-// show) and config persistence (remembering the provenance). Centralized here
-// so no resolution site forgets the third arm.
-function modeIsRemoteLike(mode) {
-  return mode === 'remote' || mode === 'cloud'
-}
-
 /**
 * Select a profile's explicit remote override from a connection config, or null
 * when it has none (so the caller falls back to env → global remote → local).
 *
 * The config may carry a `profiles` map keyed by name; an entry counts as an
- * override only with a remote-like `mode` (remote or cloud) and a non-empty
- * `url`. Pure: `token` is the raw stored secret; main.cjs decrypts it. Returns
+ * override only with `mode === 'remote'` and a non-empty `url`. Pure: `token`
+ * is the raw stored secret; main.cjs decrypts it. Returns
 * `{ url, authMode, token } | null`.
 */
 function profileRemoteOverride(config, profile) {
  const key = connectionScopeKey(profile)
  const entry = key ? config?.profiles?.[key] : null
-  if (!entry || typeof entry !== 'object' || !modeIsRemoteLike(entry.mode)) {
+  if (!entry || typeof entry !== 'object' || entry.mode !== 'remote') {
    return null
  }

@@ -286,34 +261,23 @@ function cookiesHaveSession(cookies) {
 */
 function cookiesHaveLiveSession(cookies) {
  if (!Array.isArray(cookies)) return false
-  return cookies.some(c => c && c.value && (AT_COOKIE_VARIANTS.includes(c.name) || RT_COOKIE_VARIANTS.includes(c.name)))
-}
-
-/**
- * True if the cookie jar holds a live Nous PORTAL (Privy) session — a non-empty
- * `privy-token` (access-token) cookie, or a variant. This is the portal
- * analogue of `cookiesHaveLiveSession`: the portal authenticates via Privy, not
- * the Hermes gateway session cookies, so cloud sign-in / discovery liveness
- * must check THIS, not the gateway helpers. (NAS `auth()` and the `/api/agents`
- * cookie path both key off `privy-token`.)
- */
-function cookiesHavePrivySession(cookies) {
-  if (!Array.isArray(cookies)) return false
-  return cookies.some(c => c && c.value && PRIVY_SESSION_COOKIE_VARIANTS.includes(c.name))
+  return cookies.some(
+    c =>
+      c &&
+      c.value &&
+      (AT_COOKIE_VARIANTS.includes(c.name) || RT_COOKIE_VARIANTS.includes(c.name))
+  )
 }

 module.exports = {
  AT_COOKIE_VARIANTS,
  RT_COOKIE_VARIANTS,
-  PRIVY_SESSION_COOKIE_VARIANTS,
  authModeFromStatus,
  buildGatewayWsUrl,
  buildGatewayWsUrlWithTicket,
  connectionScopeKey,
  cookiesHaveSession,
  cookiesHaveLiveSession,
-  cookiesHavePrivySession,
-  modeIsRemoteLike,
  normAuthMode,
  normalizeRemoteBaseUrl,
  pathWithGlobalRemoteProfile,
--- a/apps/desktop/electron/connection-config.test.cjs
+++ b/apps/desktop/electron/connection-config.test.cjs
@@ -22,8 +22,6 @@ const {
  connectionScopeKey,
  cookiesHaveSession,
  cookiesHaveLiveSession,
-  cookiesHavePrivySession,
-  modeIsRemoteLike,
  normAuthMode,
  normalizeRemoteBaseUrl,
  pathWithGlobalRemoteProfile,
@@ -49,19 +47,6 @@ test('normAuthMode coerces to token unless explicitly oauth', () => {
  assert.equal(normAuthMode('weird'), 'token')
 })

-// --- modeIsRemoteLike ---
-
-test('modeIsRemoteLike is true for remote and cloud, false otherwise', () => {
-  // cloud resolves to a remote backend under the hood (Q6), so every resolution
-  // site treats it like remote.
-  assert.equal(modeIsRemoteLike('remote'), true)
-  assert.equal(modeIsRemoteLike('cloud'), true)
-  assert.equal(modeIsRemoteLike('local'), false)
-  assert.equal(modeIsRemoteLike(undefined), false)
-  assert.equal(modeIsRemoteLike(null), false)
-  assert.equal(modeIsRemoteLike('weird'), false)
-})
-
 // --- profileRemoteOverride ---

 test('profileRemoteOverride returns null when no profile is given', () => {
@@ -100,21 +85,6 @@ test('profileRemoteOverride preserves an explicit oauth auth mode', () => {
  assert.equal(profileRemoteOverride(config, 'coder').authMode, 'oauth')
 })

-test('profileRemoteOverride treats a cloud entry as a remote override', () => {
-  // A 'cloud' per-profile entry resolves to the same remote backend a 'remote'
-  // entry would (Q6) — the override must be returned, not dropped.
-  const config = {
-    profiles: {
-      coder: { mode: 'cloud', url: 'https://agent-1.agents.nousresearch.com', authMode: 'oauth' }
-    }
-  }
-  assert.deepEqual(profileRemoteOverride(config, 'coder'), {
-    url: 'https://agent-1.agents.nousresearch.com',
-    authMode: 'oauth',
-    token: undefined
-  })
-})
-
 test('profileRemoteOverride tolerates a missing/!object profiles map', () => {
  assert.equal(profileRemoteOverride({}, 'coder'), null)
  assert.equal(profileRemoteOverride({ profiles: null }, 'coder'), null)
@@ -361,35 +331,6 @@ test('cookiesHaveLiveSession is false for unrelated cookies and non-arrays', ()
  assert.equal(cookiesHaveLiveSession([]), false)
 })

-// --- cookiesHavePrivySession (Nous portal / Privy auth, NOT gateway cookies) ---
-
-test('cookiesHavePrivySession detects the privy-token access cookie', () => {
-  assert.equal(cookiesHavePrivySession([{ name: 'privy-token', value: 'jwt' }]), true)
-})
-
-test('cookiesHavePrivySession detects __Host-/__Secure- prefixes and the legacy privy-session name', () => {
-  assert.equal(cookiesHavePrivySession([{ name: '__Host-privy-token', value: 'x' }]), true)
-  assert.equal(cookiesHavePrivySession([{ name: '__Secure-privy-token', value: 'x' }]), true)
-  assert.equal(cookiesHavePrivySession([{ name: 'privy-session', value: 'x' }]), true)
-})
-
-test('cookiesHavePrivySession is false for an empty value', () => {
-  assert.equal(cookiesHavePrivySession([{ name: 'privy-token', value: '' }]), false)
-})
-
-test('cookiesHavePrivySession does NOT treat hermes gateway cookies as a portal session', () => {
-  // The whole point of Q7: a gateway session cookie is NOT a portal sign-in.
-  assert.equal(cookiesHavePrivySession([{ name: 'hermes_session_at', value: 'x' }]), false)
-  assert.equal(cookiesHavePrivySession([{ name: '__Host-hermes_session_rt', value: 'x' }]), false)
-})
-
-test('cookiesHavePrivySession is false for unrelated cookies and non-arrays', () => {
-  assert.equal(cookiesHavePrivySession([{ name: 'other', value: 'x' }]), false)
-  assert.equal(cookiesHavePrivySession(null), false)
-  assert.equal(cookiesHavePrivySession(undefined), false)
-  assert.equal(cookiesHavePrivySession([]), false)
-})
-
 // --- tokenPreview ---

 test('tokenPreview returns null for empty', () => {
--- a/apps/desktop/electron/desktop-uninstall.cjs
+++ b/apps/desktop/electron/desktop-uninstall.cjs
@@ -138,7 +138,10 @@ function buildPosixCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot,
  if (pythonPath) {
    lines.push(`export PYTHONPATH=${q(pythonPath)}\${PYTHONPATH:+:$PYTHONPATH}`)
  }
-  lines.push(`cd ${q(agentRoot)} 2>/dev/null || true`, `${q(pythonExe)} ${uninstallArgs.map(q).join(' ')} || true`)
+  lines.push(
+    `cd ${q(agentRoot)} 2>/dev/null || true`,
+    `${q(pythonExe)} ${uninstallArgs.map(q).join(' ')} || true`
+  )
  if (appPath) {
    lines.push(`rm -rf ${q(appPath)} || true`)
  }
@@ -166,15 +169,7 @@ function buildPosixCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot,
 * Removal: even after the desktop PID is gone, Windows releases directory
 * handles lazily, so a single `rmdir /s /q` can half-fail — retry up to 10x.
 */
-function buildWindowsCleanupScript({
-  desktopPid,
-  pythonExe,
-  pythonPath,
-  agentRoot,
-  uninstallArgs,
-  appPath,
-  hermesHome
-}) {
+function buildWindowsCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot, uninstallArgs, appPath, hermesHome }) {
  const pid = Number(desktopPid) || 0
  // cmd.exe has no string escaping inside quotes; strip embedded quotes (paths
  // under %LOCALAPPDATA% never contain them). `&`/`^` in a path would still be
--- a/apps/desktop/electron/desktop-uninstall.test.cjs
+++ b/apps/desktop/electron/desktop-uninstall.test.cjs
@@ -101,7 +101,10 @@ test('resolveRemovableAppPath uses APPIMAGE on Linux when set', () => {
 })

 test('resolveRemovableAppPath finds the unpacked dir on Linux', () => {
-  assert.equal(resolveRemovableAppPath('/opt/hermes/linux-unpacked/hermes', 'linux', {}), '/opt/hermes/linux-unpacked')
+  assert.equal(
+    resolveRemovableAppPath('/opt/hermes/linux-unpacked/hermes', 'linux', {}),
+    '/opt/hermes/linux-unpacked'
+  )
  // A system-package install (/usr/bin) → null, left to apt/dnf.
  assert.equal(resolveRemovableAppPath('/usr/bin/hermes', 'linux', {}), null)
 })
--- a/apps/desktop/electron/embed-referer.cjs
+++ b/apps/desktop/electron/embed-referer.cjs
@@ -1,48 +0,0 @@
-'use strict'
-
-const { session } = require('electron')
-
-const EMBED_SESSION_PARTITION = 'persist:hermes-embed'
-const EMBED_REFERER = 'https://www.youtube.com/'
-const YOUTUBE_REFERER_HOST_RE =
-  /(^|\.)(youtube\.com|youtube-nocookie\.com|googlevideo\.com|ytimg\.com|youtubei\.googleapis\.com)$/i
-
-function installEmbedRefererForSession(embedSession) {
-  if (!embedSession) {
-    return
-  }
-
-  embedSession.webRequest.onBeforeSendHeaders((details, callback) => {
-    let host = ''
-
-    try {
-      host = new URL(details.url).hostname
-    } catch {
-      host = ''
-    }
-
-    if (!YOUTUBE_REFERER_HOST_RE.test(host)) {
-      callback({ requestHeaders: details.requestHeaders })
-      return
-    }
-
-    const headers = { ...details.requestHeaders }
-
-    if (!headers.Referer && !headers.referer) {
-      headers.Referer = EMBED_REFERER
-    }
-
-    callback({ requestHeaders: headers })
-  })
-}
-
-/** Stamp Referer on YouTube requests in the embed webview partition only. */
-function installEmbedReferer() {
-  try {
-    installEmbedRefererForSession(session.fromPartition(EMBED_SESSION_PARTITION))
-  } catch {
-    // Non-fatal: embeds still render; YouTube may show referer errors.
-  }
-}
-
-module.exports = { installEmbedReferer }
--- a/apps/desktop/electron/fs-read-dir.cjs
+++ b/apps/desktop/electron/fs-read-dir.cjs
@@ -92,7 +92,9 @@ async function readDirForIpc(dirPath, options = {}) {
  try {
    const dirents = await fsImpl.promises.readdir(resolved, { withFileTypes: true })
    const visibleDirents = dirents.filter(dirent => !FS_READDIR_HIDDEN.has(dirent.name))
-    const entries = await mapWithStatConcurrency(visibleDirents, dirent => entryForDirent(dirent, resolved, fsImpl))
+    const entries = await mapWithStatConcurrency(visibleDirents, dirent =>
+      entryForDirent(dirent, resolved, fsImpl)
+    )

    entries.sort((a, b) => Number(b.isDirectory) - Number(a.isDirectory) || a.name.localeCompare(b.name))

--- a/apps/desktop/electron/fs-read-dir.test.cjs
+++ b/apps/desktop/electron/fs-read-dir.test.cjs
@@ -349,10 +349,7 @@ test('readDirForIpc bounds concurrent stats while preserving complete sorted out
  assert.equal(result.error, undefined)
  assert.equal(result.entries.length, names.length)
  assert.equal(statCalls.length, names.length)
-  assert.equal(
-    statCalls.some(fullPath => fullPath.endsWith(`${path.sep}node_modules`)),
-    false
-  )
+  assert.equal(statCalls.some(fullPath => fullPath.endsWith(`${path.sep}node_modules`)), false)
  assert.ok(peak > 1, `expected concurrent stats, observed peak ${peak}`)
  assert.ok(peak <= 16, `expected at most 16 concurrent stats, observed peak ${peak}`)
  assert.deepEqual(
@@ -360,5 +357,8 @@ test('readDirForIpc bounds concurrent stats while preserving complete sorted out
    expectedNames
  )
  assert.equal(result.entries.find(entry => entry.name === failedName)?.isDirectory, false)
-  assert.equal(result.entries.filter(entry => entry.isDirectory).length, successfulDirectoryNames.size)
+  assert.equal(
+    result.entries.filter(entry => entry.isDirectory).length,
+    successfulDirectoryNames.size
+  )
 })
--- a/apps/desktop/electron/git-repo-scan.cjs
+++ b/apps/desktop/electron/git-repo-scan.cjs
@@ -86,8 +86,10 @@ async function scanGitRepos(roots, options = {}) {
    await mapLimit(subdirs, MAX_CONCURRENCY, sub => walk(sub, depth + 1))
  }

-  await mapLimit(searchRoots.map(root => String(root || '').trim()).filter(Boolean), MAX_CONCURRENCY, root =>
-    walk(root, 0)
+  await mapLimit(
+    searchRoots.map(root => String(root || '').trim()).filter(Boolean),
+    MAX_CONCURRENCY,
+    root => walk(root, 0)
  )

  return [...found.entries()].map(([root, label]) => ({ label, root }))
--- a/apps/desktop/electron/git-review-ops.cjs
+++ b/apps/desktop/electron/git-review-ops.cjs
@@ -10,26 +10,7 @@ const { execFile } = require('node:child_process')
 const fs = require('node:fs/promises')
 const path = require('node:path')

-// `simple-git` is a pure-JS runtime dep that workspace dedup hoists into the
-// repo-root node_modules.  Packaged builds set `files:` in package.json, which
-// excludes node_modules from the asar, so the normal require() fails at launch
-// (issue #52735: "Cannot find module 'simple-git'").  We ship the dep's
-// closure under resources/native-deps/vendor/node_modules/ via extraResources
-// + scripts/stage-native-deps.cjs, and resolve from there when the hoisted
-// require() isn't reachable.  The `vendor/` nesting matters: electron-builder
-// drops a node_modules dir at the root of an extraResources copy but keeps a
-// nested one.  Dev mode never hits the fallback -- Node's normal lookup finds
-// the hoisted copy.
-let simpleGit
-try {
-  simpleGit = require('simple-git')
-} catch {
-  const resourcesPath = process.resourcesPath
-  if (!resourcesPath) {
-    throw new Error("git-review IPC: 'simple-git' not found and no resourcesPath to fall back to")
-  }
-  simpleGit = require(path.join(resourcesPath, 'native-deps', 'vendor', 'node_modules', 'simple-git'))
-}
+const simpleGit = require('simple-git')

 const { resolveRequestedPathForIpc } = require('./hardening.cjs')

@@ -207,12 +188,7 @@ async function defaultBranchName(git) {

  // Prefer a local trunk, then a remote-only one (returns the clean name either
  // way) so "branch off main" works even before main is checked out locally.
-  for (const ref of [
-    'refs/heads/main',
-    'refs/heads/master',
-    'refs/remotes/origin/main',
-    'refs/remotes/origin/master'
-  ]) {
+  for (const ref of ['refs/heads/main', 'refs/heads/master', 'refs/remotes/origin/main', 'refs/remotes/origin/master']) {
    try {
      await git.raw(['rev-parse', '--verify', '--quiet', ref])

--- a/apps/desktop/electron/git-worktree-ops.cjs
+++ b/apps/desktop/electron/git-worktree-ops.cjs
@@ -45,10 +45,7 @@ function parseWorktrees(out) {
    } else if (!cur) {
      continue
    } else if (line.startsWith('branch ')) {
-      cur.branch = line
-        .slice(7)
-        .trim()
-        .replace(/^refs\/heads\//, '')
+      cur.branch = line.slice(7).trim().replace(/^refs\/heads\//, '')
    } else if (line === 'detached') {
      cur.detached = true
    } else if (line === 'bare') {
@@ -125,9 +122,10 @@ async function gitLine(gitBin, args, cwd) {
 }

 async function defaultBranch(gitBin, cwd) {
-  const remote = (
-    await gitLine(gitBin, ['symbolic-ref', '--quiet', '--short', 'refs/remotes/origin/HEAD'], cwd)
-  ).replace(/^origin\//, '')
+  const remote = (await gitLine(gitBin, ['symbolic-ref', '--quiet', '--short', 'refs/remotes/origin/HEAD'], cwd)).replace(
+    /^origin\//,
+    ''
+  )

  if (remote) {
    return remote
@@ -179,16 +177,7 @@ async function ensureGitRepo(gitBin, dir) {
    // Inline identity so the seed commit lands even with no global git config.
    await runGit(
      gitBin,
-      [
-        '-c',
-        'user.email=hermes@localhost',
-        '-c',
-        'user.name=Hermes',
-        'commit',
-        '--allow-empty',
-        '-m',
-        'Initial commit'
-      ],
+      ['-c', 'user.email=hermes@localhost', '-c', 'user.name=Hermes', 'commit', '--allow-empty', '-m', 'Initial commit'],
      dir
    )
  }
--- a/apps/desktop/electron/hardening.cjs
+++ b/apps/desktop/electron/hardening.cjs
@@ -186,10 +186,7 @@ async function statForIpc(fsImpl, resolvedPath, purpose, typeLabel) {
    if (code === 'ENOENT' || code === 'ENOTDIR') {
      throw ipcPathError(code || 'ENOENT', `${purpose} failed: ${typeLabel} does not exist.`)
    }
-    throw ipcPathError(
-      code || 'read-error',
-      `${purpose} failed: ${error instanceof Error ? error.message : String(error)}`
-    )
+    throw ipcPathError(code || 'read-error', `${purpose} failed: ${error instanceof Error ? error.message : String(error)}`)
  }
 }

@@ -204,10 +201,7 @@ async function realpathForIpc(fsImpl, resolvedPath, purpose) {
    return realPath
  } catch (error) {
    const code = error && typeof error === 'object' ? error.code : ''
-    throw ipcPathError(
-      code || 'read-error',
-      `${purpose} failed: ${error instanceof Error ? error.message : String(error)}`
-    )
+    throw ipcPathError(code || 'read-error', `${purpose} failed: ${error instanceof Error ? error.message : String(error)}`)
  }
 }

--- a/apps/desktop/electron/main.cjs
+++ b/apps/desktop/electron/main.cjs
--- a/apps/desktop/electron/oauth-net-request.test.cjs
+++ b/apps/desktop/electron/oauth-net-request.test.cjs
@@ -30,8 +30,5 @@ test('setJsonRequestHeaders does not set Electron-restricted Content-Length', ()
  setJsonRequestHeaders(request)

  assert.deepEqual(headers, [['Content-Type', 'application/json']])
-  assert.equal(
-    headers.some(([name]) => name.toLowerCase() === 'content-length'),
-    false
-  )
+  assert.equal(headers.some(([name]) => name.toLowerCase() === 'content-length'), false)
 })
--- a/apps/desktop/electron/preload.cjs
+++ b/apps/desktop/electron/preload.cjs
@@ -41,16 +41,6 @@ contextBridge.exposeInMainWorld('hermesDesktop', {
  probeConnectionConfig: remoteUrl => ipcRenderer.invoke('hermes:connection-config:probe', remoteUrl),
  oauthLoginConnectionConfig: remoteUrl => ipcRenderer.invoke('hermes:connection-config:oauth-login', remoteUrl),
  oauthLogoutConnectionConfig: remoteUrl => ipcRenderer.invoke('hermes:connection-config:oauth-logout', remoteUrl),
-  // Hermes Cloud: one portal login powers discovery + silent per-agent sign-in
-  // (cloud-auto-discovery Phase 3).
-  cloud: {
-    status: () => ipcRenderer.invoke('hermes:cloud:status'),
-    betaEnabled: () => ipcRenderer.invoke('hermes:cloud:beta-enabled'),
-    login: () => ipcRenderer.invoke('hermes:cloud:login'),
-    logout: () => ipcRenderer.invoke('hermes:cloud:logout'),
-    discover: org => ipcRenderer.invoke('hermes:cloud:discover', org),
-    agentSignIn: dashboardUrl => ipcRenderer.invoke('hermes:cloud:agent-sign-in', dashboardUrl)
-  },
  profile: {
    get: () => ipcRenderer.invoke('hermes:profile:get'),
    set: name => ipcRenderer.invoke('hermes:profile:set', name)
--- a/apps/desktop/electron/titlebar-overlay-width.cjs
+++ b/apps/desktop/electron/titlebar-overlay-width.cjs
@@ -1,24 +0,0 @@
-'use strict'
-
-const OVERLAY_FALLBACK_WIDTH = 144
-
-/**
- * Static pre-layout reservation (px) for the right-side native window-controls
- * overlay (min/max/close). Only a FALLBACK — once laid out the renderer reads
- * the exact width from navigator.windowControlsOverlay
- * (use-window-controls-overlay-width.ts) and uses this value only when the WCO
- * API is unavailable.
- *
- * macOS uses traffic lights positioned via trafficLightPosition, not a WCO
- * overlay, so it reserves nothing here. Every other desktop platform now paints
- * the Electron overlay (Windows, WSLg, and plain Linux KDE/GNOME), so they all
- * reserve the fallback width.
- *
- * @param {{ isWindows?: boolean, isWsl?: boolean, isMac?: boolean }} opts
- */
-function nativeOverlayWidth({ isWindows = false, isWsl = false, isMac = false } = {}) {
-  if (isMac) return 0
-  return OVERLAY_FALLBACK_WIDTH
-}
-
-module.exports = { OVERLAY_FALLBACK_WIDTH, nativeOverlayWidth }
--- a/apps/desktop/electron/titlebar-overlay-width.test.cjs
+++ b/apps/desktop/electron/titlebar-overlay-width.test.cjs
@@ -1,36 +0,0 @@
-const assert = require('node:assert/strict')
-const test = require('node:test')
-
-const { OVERLAY_FALLBACK_WIDTH, nativeOverlayWidth } = require('./titlebar-overlay-width.cjs')
-
-// This static reservation is only the pre-layout FALLBACK. Once laid out the
-// renderer reads the exact width from navigator.windowControlsOverlay
-// (use-window-controls-overlay-width.ts) and uses these values only when the WCO
-// API is unavailable.
-
-test('Windows reserves the overlay fallback width', () => {
-  assert.equal(nativeOverlayWidth({ isWindows: true }), OVERLAY_FALLBACK_WIDTH)
-})
-
-test('WSLg paints the same WCO, so it reserves the same fallback width', () => {
-  // The original bug: WSL fell through to 0, so the right tools sat under the
-  // controls and the title overran into them.
-  assert.equal(nativeOverlayWidth({ isWsl: true }), OVERLAY_FALLBACK_WIDTH)
-})
-
-test('plain Linux paints the WCO too, so it reserves the fallback width', () => {
-  // Regression #53185: re-enabling the overlay on plain Linux (KDE/GNOME)
-  // without reserving its width left the native min/max/close buttons painting
-  // on top of the app's right-edge titlebar tools.
-  assert.equal(nativeOverlayWidth({ isWindows: false, isWsl: false }), OVERLAY_FALLBACK_WIDTH)
-  assert.equal(nativeOverlayWidth(), OVERLAY_FALLBACK_WIDTH)
-  assert.equal(nativeOverlayWidth({}), OVERLAY_FALLBACK_WIDTH)
-})
-
-test('macOS uses traffic lights, not a WCO overlay, so it reserves nothing', () => {
-  assert.equal(nativeOverlayWidth({ isMac: true }), 0)
-})
-
-test('the fallback width is a sane positive pixel value', () => {
-  assert.ok(Number.isInteger(OVERLAY_FALLBACK_WIDTH) && OVERLAY_FALLBACK_WIDTH > 0)
-})
--- a/apps/desktop/electron/update-count.test.cjs
+++ b/apps/desktop/electron/update-count.test.cjs
@@ -7,81 +7,45 @@ const { resolveBehindCount, shouldCountCommits } = require('./update-count.cjs')
 // unconditionally, so a shallow checkout with no merge-base surfaced the bogus
 // rev-list count (e.g. 12104). This asserts the new shallow/no-merge-base branch.
 test('shallow checkout with no merge-base does NOT trust the bogus rev-list count', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '12104',
-      currentSha: 'aaa',
-      targetSha: 'bbb',
-      isShallow: true,
-      hasMergeBase: false
-    }),
-    1
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '12104', currentSha: 'aaa', targetSha: 'bbb',
+    isShallow: true, hasMergeBase: false,
+  }), 1)
 })

 test('shallow checkout with no merge-base but identical SHA reports up-to-date', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '12104',
-      currentSha: 'abc',
-      targetSha: 'abc',
-      isShallow: true,
-      hasMergeBase: false
-    }),
-    0
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '12104', currentSha: 'abc', targetSha: 'abc',
+    isShallow: true, hasMergeBase: false,
+  }), 0)
 })

 test('shallow checkout WITH a merge-base keeps the exact count (reliable)', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '3',
-      currentSha: 'aaa',
-      targetSha: 'bbb',
-      isShallow: true,
-      hasMergeBase: true
-    }),
-    3
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '3', currentSha: 'aaa', targetSha: 'bbb',
+    isShallow: true, hasMergeBase: true,
+  }), 3)
 })

 test('full (non-shallow) clone keeps the exact count path unchanged', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '7',
-      currentSha: 'aaa',
-      targetSha: 'bbb',
-      isShallow: false,
-      hasMergeBase: true
-    }),
-    7
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '7', currentSha: 'aaa', targetSha: 'bbb',
+    isShallow: false, hasMergeBase: true,
+  }), 7)
 })

 test('up-to-date full clone reports 0', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '0',
-      currentSha: 'x',
-      targetSha: 'x',
-      isShallow: false,
-      hasMergeBase: true
-    }),
-    0
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '0', currentSha: 'x', targetSha: 'x',
+    isShallow: false, hasMergeBase: true,
+  }), 0)
 })

 test('non-numeric count falls back to 0 (defensive, unchanged behaviour)', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '',
-      currentSha: 'aaa',
-      targetSha: 'bbb',
-      isShallow: false,
-      hasMergeBase: true
-    }),
-    0
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '', currentSha: 'aaa', targetSha: 'bbb',
+    isShallow: false, hasMergeBase: true,
+  }), 0)
 })

 // shouldCountCommits gates the expensive `rev-list --count` in checkUpdates().
@@ -104,24 +68,12 @@ test('full (non-shallow) clone always runs the count', () => {
 // The skip path produces an empty countStr; resolveBehindCount must NOT trust
 // it and must fall through to the SHA compare (mirrors the live call site).
 test('skipped-count path resolves via SHA compare, never via empty countStr', () => {
-  assert.equal(
-    resolveBehindCount({
-      countStr: '',
-      currentSha: 'aaa',
-      targetSha: 'bbb',
-      isShallow: true,
-      hasMergeBase: false
-    }),
-    1
-  )
-  assert.equal(
-    resolveBehindCount({
-      countStr: '',
-      currentSha: 'same',
-      targetSha: 'same',
-      isShallow: true,
-      hasMergeBase: false
-    }),
-    0
-  )
+  assert.equal(resolveBehindCount({
+    countStr: '', currentSha: 'aaa', targetSha: 'bbb',
+    isShallow: true, hasMergeBase: false,
+  }), 1)
+  assert.equal(resolveBehindCount({
+    countStr: '', currentSha: 'same', targetSha: 'same',
+    isShallow: true, hasMergeBase: false,
+  }), 0)
 })
--- a/apps/desktop/electron/update-relaunch.test.cjs
+++ b/apps/desktop/electron/update-relaunch.test.cjs
@@ -62,10 +62,7 @@ test('resolveUnpackedRelease is null for AppImage / .deb / .rpm / dev / unresolv
  assert.equal(resolveUnpackedRelease('/usr/lib/hermes/hermes', ROOT, 'linux'), null)
  assert.equal(resolveUnpackedRelease('/opt/Hermes/hermes', ROOT, 'linux'), null)
  // dev electron
-  assert.equal(
-    resolveUnpackedRelease('/home/u/.hermes/hermes-agent/node_modules/electron/dist/electron', ROOT, 'linux'),
-    null
-  )
+  assert.equal(resolveUnpackedRelease('/home/u/.hermes/hermes-agent/node_modules/electron/dist/electron', ROOT, 'linux'), null)
  // empty / missing
  assert.equal(resolveUnpackedRelease('', ROOT, 'linux'), null)
  assert.equal(resolveUnpackedRelease(path.join(UNPACKED, 'hermes'), '', 'linux'), null)
--- a/apps/desktop/electron/update-remote.cjs
+++ b/apps/desktop/electron/update-remote.cjs
@@ -39,9 +39,7 @@ function canonicalGitHubRemote(url) {
 }

 function isSshRemote(url) {
-  const value = String(url || '')
-    .trim()
-    .toLowerCase()
+  const value = String(url || '').trim().toLowerCase()
  return value.startsWith('git@') || value.startsWith('ssh://')
 }

--- a/apps/desktop/electron/vscode-marketplace.cjs
+++ b/apps/desktop/electron/vscode-marketplace.cjs
@@ -26,11 +26,7 @@ const REQUEST_TIMEOUT_MS = 20_000
 const ID_RE = /^[\w-]+\.[\w-]+$/

 /** Minimal HTTPS helper with redirect-following, timeout, and a size cap. */
-function request(
-  url,
-  { method = 'GET', headers = {}, body = null, maxBytes = MAX_VSIX_BYTES } = {},
-  redirectsLeft = MAX_REDIRECTS
-) {
+function request(url, { method = 'GET', headers = {}, body = null, maxBytes = MAX_VSIX_BYTES } = {}, redirectsLeft = MAX_REDIRECTS) {
  return new Promise((resolve, reject) => {
    const req = https.request(url, { method, headers }, res => {
      const status = res.statusCode ?? 0
@@ -46,13 +42,7 @@ function request(
        const next = new URL(res.headers.location, url).toString()
        res.resume()
        // Redirects to the CDN are plain GETs (drop the POST body).
-        resolve(
-          request(
-            next,
-            { method: 'GET', headers: { 'User-Agent': headers['User-Agent'] }, maxBytes },
-            redirectsLeft - 1
-          )
-        )
+        resolve(request(next, { method: 'GET', headers: { 'User-Agent': headers['User-Agent'] }, maxBytes }, redirectsLeft - 1))

        return
      }
--- a/apps/desktop/electron/window-state.test.cjs
+++ b/apps/desktop/electron/window-state.test.cjs
@@ -26,16 +26,7 @@ const LAPTOP = [{ workArea: { x: 0, y: 0, width: 1366, height: 728 } }]
 // ─── sanitizeWindowState ───────────────────────────────────────────────────

 test('sanitizeWindowState rejects missing/garbage input', () => {
-  for (const bad of [
-    null,
-    undefined,
-    'nope',
-    42,
-    {},
-    { width: 'x', height: 800 },
-    { width: NaN, height: 800 },
-    { width: 1000 }
-  ]) {
+  for (const bad of [null, undefined, 'nope', 42, {}, { width: 'x', height: 800 }, { width: NaN, height: 800 }, { width: 1000 }]) {
    assert.equal(sanitizeWindowState(bad), null)
  }
 })
@@ -121,13 +112,9 @@ test('computeWindowOptions does not clamp when displays are unknown', () => {
 test('debounce coalesces a burst into one trailing run', t => {
  t.mock.timers.enable({ apis: ['setTimeout'] })
  let calls = 0
-  const d = debounce(() => {
-    calls += 1
-  }, 250)
+  const d = debounce(() => { calls += 1 }, 250)

-  d()
-  d()
-  d()
+  d(); d(); d()
  assert.equal(calls, 0)
  t.mock.timers.tick(249)
  assert.equal(calls, 0)
@@ -138,9 +125,7 @@ test('debounce coalesces a burst into one trailing run', t => {
 test('debounce.flush runs now and cancels the pending timer', t => {
  t.mock.timers.enable({ apis: ['setTimeout'] })
  let calls = 0
-  const d = debounce(() => {
-    calls += 1
-  }, 250)
+  const d = debounce(() => { calls += 1 }, 250)

  d()
  d.flush()
--- a/apps/desktop/electron/windows-child-process.test.cjs
+++ b/apps/desktop/electron/windows-child-process.test.cjs
@@ -13,7 +13,7 @@ function readElectronFile(name) {

 function requireHiddenChildOptions(source, needle) {
  const match = needle instanceof RegExp ? needle.exec(source) : null
-  const index = needle instanceof RegExp ? (match?.index ?? -1) : source.indexOf(needle)
+  const index = needle instanceof RegExp ? match?.index ?? -1 : source.indexOf(needle)
  assert.notEqual(index, -1, `missing call site: ${needle}`)
  const snippet = source.slice(index, index + 700)
  assert.match(
@@ -38,40 +38,19 @@ test('desktop background child processes opt into hidden Windows consoles', () =
  requireHiddenChildOptions(source, /hermesProcess = spawn\(\s*backend\.command,\s*backend\.args/)
  requireHiddenChildOptions(source, /spawn\(\s*py,\s*\['-m', 'hermes_cli\.main', 'uninstall', '--gui-summary'\]/)

-  assert.match(source, /function unwrapWindowsVenvHermesCommand\(command, backendArgs\)/)
+  assert.match(source, /function unwrapWindowsVenvHermesCommand\(command, dashboardArgs\)/)
+  assert.match(source, /existing Hermes no-console Python at/)
+  assert.match(source, /function getNoConsoleVenvPython\(venvRoot\)/)
+  assert.match(source, /function toNoConsolePython\(pythonPath\)/)
+  assert.match(source, /function applyWindowsNoConsoleSpawnHints\(backend\)/)
+  assert.match(source, /function readVenvHome\(venvRoot\)/)
+  assert.match(source, /path\.join\(venvRoot, 'Scripts', 'pythonw\.exe'\)/)
+  assert.match(source, /backendStartFailure/)
+  assert.match(source, /HERMES_DESKTOP_READY_FILE/)
+  assert.match(source, /readyFile: true/)
  assert.match(source, /function getVenvSitePackagesEntries\(venvRoot\)/)
  assert.match(source, /path\.join\(venvRoot, 'Lib', 'site-packages'\)/)
-  assert.match(source, /args: \['-m', 'hermes_cli\.main', \.\.\.backendArgs\]/)
-})
-
-test('desktop backend launches console python so child consoles are inherited, not pythonw', () => {
-  const source = readElectronFile('main.cjs')
-
-  // The flash fix is structural: the backend runs as a console-subsystem
-  // python.exe under hiddenWindowsChildOptions() (-> CREATE_NO_WINDOW), so it
-  // owns ONE windowless console that every descendant spawn inherits. Launching
-  // it as GUI-subsystem pythonw.exe is what made each child allocate (and flash)
-  // its own console, so the backend command must never be pythonw.
-  assert.doesNotMatch(source, /pythonw\.exe'\)/, 'backend must not be launched via pythonw.exe')
-  assert.doesNotMatch(
-    source,
-    /function getNoConsoleVenvPython\b/,
-    'pythonw-conversion helper should be gone; console python is launched directly'
-  )
-  assert.doesNotMatch(
-    source,
-    /function applyWindowsNoConsoleSpawnHints\b/,
-    'pythonw spawn-hint rewriter should be gone'
-  )
-
-  // Console python restores stdout, so the port is announced on the normal
-  // HERMES_DASHBOARD_READY stdout line — no ready-file side channel is set.
-  assert.doesNotMatch(source, /readyFile: true/, 'no backend should opt into the pythonw ready-file path')
-
-  // Both desktop backend launches must still go through hiddenWindowsChildOptions
-  // so the single backend console is created windowless.
-  requireHiddenChildOptions(source, /spawn\(\s*backend\.command,\s*backend\.args/)
-  requireHiddenChildOptions(source, /hermesProcess = spawn\(\s*backend\.command,\s*backend\.args/)
+  assert.match(source, /args: \['-m', 'hermes_cli\.main', \.\.\.dashboardArgs\]/)
 })

 test('intentional or interactive desktop child processes stay documented', () => {
@@ -89,5 +68,5 @@ test('bootstrap PowerShell runner hides Windows console children', () => {
  const source = readElectronFile('bootstrap-runner.cjs')

  assert.match(source, /function hiddenWindowsChildOptions\(options = \{\}\)/)
-  requireHiddenChildOptions(source, /spawn\(\s*ps,\s*fullArgs/)
+  requireHiddenChildOptions(source, 'spawn(ps, fullArgs')
 })
--- a/apps/desktop/electron/windows-hermes-resolution.test.cjs
+++ b/apps/desktop/electron/windows-hermes-resolution.test.cjs
@@ -1,67 +0,0 @@
-'use strict'
-
-// Regression guards for Windows `hermes` resolution in main.cjs.
-//
-// main.cjs has no module.exports, so these follow the repo's source-assertion
-// test pattern (see windows-child-process.test.cjs). They pin the two Windows
-// resolution bugs that caused desktop reinstall loops:
-//   1. findOnPath() tried the empty extension FIRST, so an extensionless
-//      Git-Bash `hermes` shim shadowed the real hermes.cmd/hermes.exe; the
-//      shim then failed the --version probe and the desktop fell through to a
-//      spurious bootstrap/repair.
-//   2. handOffWindowsBootstrapRecovery() chose --update vs the destructive
-//      --repair by checking ONLY venv\Scripts\hermes.exe (the console-script
-//      shim, written at the END of venv setup and absent in interrupted
-//      states), so it escalated to a full venv recreate even on healthy
-//      installs.
-
-const test = require('node:test')
-const assert = require('node:assert/strict')
-const fs = require('node:fs')
-const path = require('node:path')
-
-function readMain() {
-  return fs.readFileSync(path.join(__dirname, 'main.cjs'), 'utf8').replace(/\r\n/g, '\n')
-}
-
-test('findOnPath tries PATHEXT extensions before the bare (empty) name on Windows', () => {
-  const source = readMain()
-  // Fixed order: PATHEXT first, empty string LAST.
-  assert.match(
-    source,
-    /\(process\.env\.PATHEXT \|\| '\.COM;\.EXE;\.BAT;\.CMD'\)\.split\(';'\)\.filter\(Boolean\), ''\]/,
-    'extensions array must end with the empty string, not start with it'
-  )
-  // The buggy empty-first order must not return.
-  assert.doesNotMatch(
-    source,
-    /\['', \.\.\.\(process\.env\.PATHEXT/,
-    'empty-extension-first order regressed: an extensionless shim can shadow hermes.cmd/.exe'
-  )
-})
-
-test('Windows bootstrap recovery chooses --update when any real-install signal is present', () => {
-  const source = readMain()
-  assert.match(source, /const haveRealInstall =/, 'recovery must compute haveRealInstall')
-  assert.match(
-    source,
-    /fileExists\(venvPython\)/,
-    'recovery must accept the venv interpreter as a real-install signal'
-  )
-  assert.match(
-    source,
-    /\.hermes-bootstrap-complete/,
-    'recovery must accept the bootstrap-complete marker as a real-install signal'
-  )
-  assert.match(
-    source,
-    /updaterArgs = haveRealInstall \? \['--update'/,
-    'updaterArgs must gate on haveRealInstall'
-  )
-  // The old too-narrow check (only venv\Scripts\hermes.exe) must not return.
-  assert.doesNotMatch(
-    source,
-    /updaterArgs = fileExists\(venvHermes\) \?/,
-    'recovery regressed to gating only on the hermes.exe shim, which forces destructive --repair'
-  )
-})
--- a/apps/desktop/electron/windows-user-env.cjs
+++ b/apps/desktop/electron/windows-user-env.cjs
@@ -21,7 +21,8 @@ const { execFileSync } = require('node:child_process')
 // the requested value line isn't present.
 function parseRegQueryValue(stdout, name) {
  if (!stdout || !name) return null
-  const typePattern = /^(\S+)\s+(?:REG_SZ|REG_EXPAND_SZ|REG_MULTI_SZ|REG_DWORD|REG_QWORD|REG_BINARY|REG_NONE)\s+(.*)$/
+  const typePattern =
+    /^(\S+)\s+(?:REG_SZ|REG_EXPAND_SZ|REG_MULTI_SZ|REG_DWORD|REG_QWORD|REG_BINARY|REG_NONE)\s+(.*)$/
  for (const rawLine of String(stdout).split(/\r?\n/)) {
    const line = rawLine.trim()
    const match = line.match(typePattern)
@@ -46,7 +47,10 @@ function expandWindowsEnvRefs(value, env = process.env) {
 // Read a User-scoped env var from HKCU\Environment. Windows-only: returns null
 // off-Windows (without spawning), on any spawn error, when `reg` exits non-zero
 // (the value doesn't exist), or when the value is empty.
-function readWindowsUserEnvVar(name, { platform = process.platform, env = process.env, exec = execFileSync } = {}) {
+function readWindowsUserEnvVar(
+  name,
+  { platform = process.platform, env = process.env, exec = execFileSync } = {}
+) {
  if (platform !== 'win32' || !name) return null
  let stdout
  try {
--- a/apps/desktop/electron/windows-user-env.test.cjs
+++ b/apps/desktop/electron/windows-user-env.test.cjs
@@ -1,12 +1,21 @@
 const assert = require('node:assert/strict')
 const { test } = require('node:test')

-const { expandWindowsEnvRefs, parseRegQueryValue, readWindowsUserEnvVar } = require('./windows-user-env.cjs')
+const {
+  expandWindowsEnvRefs,
+  parseRegQueryValue,
+  readWindowsUserEnvVar
+} = require('./windows-user-env.cjs')

 // ── parseRegQueryValue ─────────────────────────────────────────────────────

 test('parseRegQueryValue extracts a REG_SZ value', () => {
-  const out = ['', 'HKEY_CURRENT_USER\\Environment', '    HERMES_HOME    REG_SZ    F:\\Hermes\\data', ''].join('\r\n')
+  const out = [
+    '',
+    'HKEY_CURRENT_USER\\Environment',
+    '    HERMES_HOME    REG_SZ    F:\\Hermes\\data',
+    ''
+  ].join('\r\n')
  assert.equal(parseRegQueryValue(out, 'HERMES_HOME'), 'F:\\Hermes\\data')
 })

@@ -30,7 +39,10 @@ test('parseRegQueryValue returns null when the value line is absent', () => {
 // ── expandWindowsEnvRefs ───────────────────────────────────────────────────

 test('expandWindowsEnvRefs expands %VAR% case-insensitively', () => {
-  assert.equal(expandWindowsEnvRefs('%UserProfile%\\h', { USERPROFILE: 'C:\\Users\\jeff' }), 'C:\\Users\\jeff\\h')
+  assert.equal(
+    expandWindowsEnvRefs('%UserProfile%\\h', { USERPROFILE: 'C:\\Users\\jeff' }),
+    'C:\\Users\\jeff\\h'
+  )
 })

 test('expandWindowsEnvRefs leaves literal paths and unknown refs intact', () => {
--- a/apps/desktop/electron/workspace-cwd.cjs
+++ b/apps/desktop/electron/workspace-cwd.cjs
@@ -14,7 +14,11 @@ function isPackagedInstallPath(dir, { installRoots, isPackaged }) {
    return false
  }

-  const roots = new Set((installRoots ?? []).filter(Boolean).map(candidate => path.resolve(String(candidate))))
+  const roots = new Set(
+    (installRoots ?? [])
+      .filter(Boolean)
+      .map(candidate => path.resolve(String(candidate)))
+  )

  for (const root of roots) {
    if (resolved === root) {
--- a/apps/desktop/electron/workspace-cwd.test.cjs
+++ b/apps/desktop/electron/workspace-cwd.test.cjs
@@ -13,21 +13,33 @@ const { isPackagedInstallPath } = require('./workspace-cwd.cjs')
 const installRoot = path.resolve('/opt/Hermes')

 test('isPackagedInstallPath returns false when not packaged', () => {
-  assert.equal(isPackagedInstallPath(installRoot, { isPackaged: false, installRoots: [installRoot] }), false)
+  assert.equal(
+    isPackagedInstallPath(installRoot, { isPackaged: false, installRoots: [installRoot] }),
+    false
+  )
 })

 test('isPackagedInstallPath flags the install root itself', () => {
-  assert.equal(isPackagedInstallPath(installRoot, { isPackaged: true, installRoots: [installRoot] }), true)
+  assert.equal(
+    isPackagedInstallPath(installRoot, { isPackaged: true, installRoots: [installRoot] }),
+    true
+  )
 })

 test('isPackagedInstallPath flags paths nested under the install root', () => {
  const nested = path.join(installRoot, 'resources', 'app.asar')

-  assert.equal(isPackagedInstallPath(nested, { isPackaged: true, installRoots: [installRoot] }), true)
+  assert.equal(
+    isPackagedInstallPath(nested, { isPackaged: true, installRoots: [installRoot] }),
+    true
+  )
 })

 test('isPackagedInstallPath ignores paths outside the install root', () => {
  const homeProject = path.resolve('/home/user/projects/demo')

-  assert.equal(isPackagedInstallPath(homeProject, { isPackaged: true, installRoots: [installRoot] }), false)
+  assert.equal(
+    isPackagedInstallPath(homeProject, { isPackaged: true, installRoots: [installRoot] }),
+    false
+  )
 })
--- a/apps/desktop/electron/wsl-clipboard-image.cjs
+++ b/apps/desktop/electron/wsl-clipboard-image.cjs
@@ -1,92 +0,0 @@
-// Pull a Windows-host clipboard image from inside WSL2 via PowerShell (WSLg
-// bridges text but not images). Returns PNG bytes or null; exec injectable.
-
-const { execFileSync } = require('node:child_process')
-
-// STA is mandatory: System.Windows.Forms.Clipboard throws ThreadStateException
-// off a single-threaded apartment. We emit base64 (not raw bytes) so the PNG
-// survives stdout's text decoding intact, and write with [Console]::Out.Write
-// to avoid a trailing newline.
-const PS_SCRIPT = [
-  'Add-Type -AssemblyName System.Windows.Forms,System.Drawing',
-  '$img = [System.Windows.Forms.Clipboard]::GetImage()',
-  'if ($null -eq $img) { exit 0 }',
-  '$ms = New-Object System.IO.MemoryStream',
-  '$img.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png)',
-  '[Console]::Out.Write([System.Convert]::ToBase64String($ms.ToArray()))'
-].join('\n')
-
-// PowerShell's -EncodedCommand takes UTF-16LE base64. Encoding the whole script
-// this way sidesteps every layer of WSL→Windows quoting (spaces, quotes,
-// brackets, newlines) that plain -Command arguments would mangle.
-function encodePowerShellCommand(script) {
-  return Buffer.from(String(script), 'utf16le').toString('base64')
-}
-
-// Locate powershell.exe. The bare name resolves through WSL's Windows-interop
-// PATH on every standard WSL2 setup; the absolute fallback covers a stripped
-// PATH. Returns the first candidate — execFile surfaces ENOENT if it's wrong
-// and we fall back to null.
-function powershellCandidates() {
-  return ['powershell.exe', '/mnt/c/Windows/System32/WindowsPowerShell/v1.0/powershell.exe']
-}
-
-function decodeClipboardImageBase64(stdout) {
-  const b64 = String(stdout || '').trim()
-  if (!b64) return null
-
-  let buffer
-  try {
-    buffer = Buffer.from(b64, 'base64')
-  } catch {
-    return null
-  }
-
-  // Guard against partial / garbage output: require a real PNG signature.
-  const PNG_SIGNATURE = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])
-  if (buffer.length < PNG_SIGNATURE.length || !buffer.subarray(0, PNG_SIGNATURE.length).equals(PNG_SIGNATURE)) {
-    return null
-  }
-
-  return buffer
-}
-
-// Read the Windows clipboard image from inside WSL. Returns a PNG Buffer, or
-// null when there's no image, PowerShell is unreachable, or output is invalid.
-// Linux-only by contract (caller gates on IS_WSL); never throws.
-function readWslWindowsClipboardImage({ exec = execFileSync, candidates = powershellCandidates() } = {}) {
-  const encoded = encodePowerShellCommand(PS_SCRIPT)
-
-  for (const ps of candidates) {
-    try {
-      const stdout = exec(
-        ps,
-        ['-NoProfile', '-NonInteractive', '-STA', '-ExecutionPolicy', 'Bypass', '-EncodedCommand', encoded],
-        {
-          encoding: 'utf8',
-          windowsHide: true,
-          timeout: 8000,
-          // A 4K screenshot base64s to a few MB; give stdout generous headroom.
-          maxBuffer: 64 * 1024 * 1024,
-          // PowerShell writes progress/CLIXML noise to stderr — ignore it.
-          stdio: ['ignore', 'pipe', 'ignore']
-        }
-      )
-      const decoded = decodeClipboardImageBase64(stdout)
-      if (decoded) return decoded
-      // Empty stdout = no image on the clipboard; stop, don't try fallbacks.
-      if (String(stdout || '').trim() === '') return null
-    } catch {
-      // This powershell.exe candidate is missing/failed — try the next one.
-    }
-  }
-
-  return null
-}
-
-module.exports = {
-  decodeClipboardImageBase64,
-  encodePowerShellCommand,
-  powershellCandidates,
-  readWslWindowsClipboardImage
-}
--- a/apps/desktop/electron/wsl-clipboard-image.test.cjs
+++ b/apps/desktop/electron/wsl-clipboard-image.test.cjs
@@ -1,114 +0,0 @@
-const assert = require('node:assert/strict')
-const test = require('node:test')
-
-const {
-  decodeClipboardImageBase64,
-  encodePowerShellCommand,
-  powershellCandidates,
-  readWslWindowsClipboardImage
-} = require('./wsl-clipboard-image.cjs')
-
-const PNG_SIGNATURE = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])
-
-function fakePngBuffer(extraBytes = 16) {
-  return Buffer.concat([PNG_SIGNATURE, Buffer.alloc(extraBytes, 0x42)])
-}
-
-test('encodePowerShellCommand produces UTF-16LE base64 PowerShell can decode', () => {
-  const encoded = encodePowerShellCommand('Write-Output "hi"')
-  const roundTripped = Buffer.from(encoded, 'base64').toString('utf16le')
-  assert.equal(roundTripped, 'Write-Output "hi"')
-})
-
-test('decodeClipboardImageBase64 returns a Buffer for valid PNG base64', () => {
-  const png = fakePngBuffer()
-  const decoded = decodeClipboardImageBase64(png.toString('base64'))
-  assert.ok(Buffer.isBuffer(decoded))
-  assert.ok(decoded.equals(png))
-})
-
-test('decodeClipboardImageBase64 trims surrounding whitespace before decoding', () => {
-  const png = fakePngBuffer()
-  const decoded = decodeClipboardImageBase64(`\n  ${png.toString('base64')}  \r\n`)
-  assert.ok(decoded && decoded.equals(png))
-})
-
-test('decodeClipboardImageBase64 returns null for empty / whitespace input', () => {
-  assert.equal(decodeClipboardImageBase64(''), null)
-  assert.equal(decodeClipboardImageBase64('   \n  '), null)
-  assert.equal(decodeClipboardImageBase64(null), null)
-  assert.equal(decodeClipboardImageBase64(undefined), null)
-})
-
-test('decodeClipboardImageBase64 rejects base64 without a PNG signature', () => {
-  // Valid base64, but the decoded bytes are not a PNG.
-  const notPng = Buffer.from('this is not a png at all').toString('base64')
-  assert.equal(decodeClipboardImageBase64(notPng), null)
-})
-
-test('readWslWindowsClipboardImage decodes the first candidate that returns a PNG', () => {
-  const png = fakePngBuffer()
-  const calls = []
-  const exec = (cmd, args) => {
-    calls.push({ cmd, args })
-    return png.toString('base64')
-  }
-
-  const result = readWslWindowsClipboardImage({ exec, candidates: ['powershell.exe'] })
-  assert.ok(result && result.equals(png))
-  assert.equal(calls.length, 1)
-  assert.equal(calls[0].cmd, 'powershell.exe')
-  // -STA is mandatory for System.Windows.Forms.Clipboard.
-  assert.ok(calls[0].args.includes('-STA'))
-  assert.ok(calls[0].args.includes('-EncodedCommand'))
-})
-
-test('readWslWindowsClipboardImage returns null and stops when stdout is empty (no image)', () => {
-  let count = 0
-  const exec = () => {
-    count += 1
-    return ''
-  }
-
-  const result = readWslWindowsClipboardImage({
-    exec,
-    candidates: ['powershell.exe', '/mnt/c/Windows/System32/WindowsPowerShell/v1.0/powershell.exe']
-  })
-  assert.equal(result, null)
-  // Empty stdout means "no image on the clipboard" — don't probe further candidates.
-  assert.equal(count, 1)
-})
-
-test('readWslWindowsClipboardImage falls through to the next candidate when one throws', () => {
-  const png = fakePngBuffer()
-  const seen = []
-  const exec = cmd => {
-    seen.push(cmd)
-    if (cmd === 'powershell.exe') {
-      throw Object.assign(new Error('not found'), { code: 'ENOENT' })
-    }
-    return png.toString('base64')
-  }
-
-  const result = readWslWindowsClipboardImage({
-    exec,
-    candidates: ['powershell.exe', '/mnt/c/Windows/System32/WindowsPowerShell/v1.0/powershell.exe']
-  })
-  assert.ok(result && result.equals(png))
-  assert.deepEqual(seen, ['powershell.exe', '/mnt/c/Windows/System32/WindowsPowerShell/v1.0/powershell.exe'])
-})
-
-test('readWslWindowsClipboardImage returns null when every candidate throws', () => {
-  const exec = () => {
-    throw new Error('boom')
-  }
-
-  const result = readWslWindowsClipboardImage({ exec, candidates: ['a', 'b'] })
-  assert.equal(result, null)
-})
-
-test('powershellCandidates lists the bare name first, then the absolute fallback', () => {
-  const candidates = powershellCandidates()
-  assert.equal(candidates[0], 'powershell.exe')
-  assert.ok(candidates.some(c => c.endsWith('WindowsPowerShell/v1.0/powershell.exe')))
-})
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@@ -18,7 +18,7 @@
    "profile:main": "wait-on http://127.0.0.1:5174 && cross-env XCURSOR_SIZE=24 HERMES_DESKTOP_DEV_SERVER=http://127.0.0.1:5174 electron --inspect=9229 .",
    "profile:main:cpu": "wait-on http://127.0.0.1:5174 && cross-env XCURSOR_SIZE=24 NODE_OPTIONS=--cpu-prof HERMES_DESKTOP_DEV_SERVER=http://127.0.0.1:5174 electron .",
    "start": "npm run build && electron .",
-    "build": "node scripts/assert-root-install.cjs && node scripts/write-build-stamp.cjs && node scripts/stage-native-deps.cjs && tsc -b && vite build && npm run postbuild",
+    "build": "node scripts/assert-root-install.cjs && node scripts/write-build-stamp.cjs && node scripts/stage-native-deps.cjs && tsc -b && vite build &&  node scripts/bundle-electron-main.mjs && npm run postbuild",
    "postbuild": "node scripts/assert-dist-built.cjs",
    "prebuilder": "node scripts/patch-electron-builder-mac-binary.cjs",
    "builder": "cross-env NODE_OPTIONS=--max-old-space-size=16384 node scripts/run-electron-builder.cjs",
@@ -37,7 +37,7 @@
    "test:desktop:nsis": "node scripts/test-desktop.mjs nsis",
    "test:desktop:existing": "node scripts/test-desktop.mjs existing",
    "test:desktop:fresh": "node scripts/test-desktop.mjs fresh",
-    "test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/git-worktree-ops.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-count.test.cjs electron/update-rebuild.test.cjs electron/update-marker.test.cjs electron/update-relaunch.test.cjs electron/windows-user-env.test.cjs electron/wsl-clipboard-image.test.cjs electron/titlebar-overlay-width.test.cjs electron/window-state.test.cjs electron/windows-hermes-resolution.test.cjs",
+    "test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/git-worktree-ops.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-count.test.cjs electron/update-rebuild.test.cjs electron/update-marker.test.cjs electron/update-relaunch.test.cjs electron/windows-user-env.test.cjs electron/window-state.test.cjs",
    "typecheck": "tsc -p . --noEmit",
    "lint": "eslint src/ electron/",
    "lint:fix": "eslint src/ electron/ --fix",
@@ -51,17 +51,11 @@
    "@assistant-ui/react-streamdown": "^0.1.11",
    "@audiowave/react": "^0.6.2",
    "@chenglou/pretext": "^0.0.6",
-    "@codemirror/commands": "^6.10.4",
-    "@codemirror/language": "^6.12.4",
-    "@codemirror/language-data": "^6.5.2",
-    "@codemirror/state": "^6.7.0",
-    "@codemirror/view": "^6.43.3",
    "@dnd-kit/core": "^6.3.1",
    "@dnd-kit/sortable": "^10.0.0",
    "@dnd-kit/utilities": "^3.2.2",
    "@hermes/shared": "file:../shared",
    "@icons-pack/react-simple-icons": "=13.11.1",
-    "@lezer/highlight": "^1.2.3",
    "@nanostores/react": "^1.1.0",
    "@nous-research/ui": "^0.13.0",
    "@radix-ui/react-slot": "^1.2.4",
@@ -73,7 +67,6 @@
    "@tanstack/react-virtual": "^3.13.24",
    "@vscode/codicons": "^0.0.45",
    "@xterm/addon-fit": "^0.11.0",
-    "@xterm/addon-serialize": "^0.14.0",
    "@xterm/addon-unicode11": "^0.9.0",
    "@xterm/addon-web-links": "^0.12.0",
    "@xterm/addon-webgl": "^0.19.0",
@@ -82,13 +75,11 @@
    "clsx": "^2.1.1",
    "cmdk": "^1.1.1",
    "dnd-core": "^14.0.1",
-    "dompurify": "^3.4.11",
    "hast-util-from-html-isomorphic": "^2.0.0",
    "hast-util-to-text": "^4.0.2",
    "ignore": "^7.0.5",
    "katex": "^0.16.45",
    "leva": "^0.10.1",
-    "mermaid": "^11.15.0",
    "motion": "^12.38.0",
    "nanostores": "^1.3.0",
    "node-pty": "1.1.0",
--- a/apps/desktop/scripts/bundle-electron-main.mjs
+++ b/apps/desktop/scripts/bundle-electron-main.mjs
@@ -0,0 +1,33 @@
+#!/usr/bin/env node
+// bundle-electron-main.mjs — bundles electron/main.cjs into a single
+// self-contained file so the nix build doesn't need to ship node_modules/.
+//
+// `electron` is provided by the runtime; `node-pty` is staged separately
+// via stage-native-deps.cjs.  `preload.cjs` is NOT require()'d by main —
+// Electron loads it via path.join(__dirname, 'preload.cjs') — so it stays
+// as a separate file and doesn't need bundling.
+import { build } from 'esbuild'
+import { resolve, dirname } from 'node:path'
+import { fileURLToPath } from 'node:url'
+import { renameSync } from 'node:fs'
+
+const here = dirname(fileURLToPath(import.meta.url))
+const root = resolve(here, '..')
+const entry = resolve(root, 'electron/main.cjs')
+const tmp = resolve(root, 'electron/main.bundled.cjs')
+
+await build({
+  entryPoints: [entry],
+  bundle: true,
+  platform: 'node',
+  format: 'cjs',
+  target: 'node20',
+  outfile: tmp,
+  external: ['electron', 'node-pty'],
+  logLevel: 'info'
+})
+
+// Overwrite the original with the bundled version.
+renameSync(tmp, entry)
+
+console.log(`bundled ${entry}`)
--- a/apps/desktop/scripts/stage-native-deps.cjs
+++ b/apps/desktop/scripts/stage-native-deps.cjs
@@ -66,31 +66,6 @@ const NATIVE_DEPS = [
  }
 ]

-// Pure-JS runtime dependencies that the packaged electron main require()s but
-// that workspace dedup hoists into the repo-root node_modules -- out of reach
-// of electron-builder's file collector, exactly like node-pty above.  Unlike
-// node-pty there is no native binary to select; we stage each package's whole
-// directory into build/native-deps/vendor/node_modules/<name> so the dep's own
-// internal require()s resolve against a real node_modules tree, and the
-// requiring file (electron/git-review-ops.cjs) falls back to that path via
-// process.resourcesPath when the normal require() fails.  See issue #52735
-// (packaged app crashed at launch on `Cannot find module 'simple-git'`).
-//
-// The closure is resolved at stage time by walking dependencies +
-// optionalDependencies, so a simple-git version bump that pulls in a new
-// transitive dep can't silently re-introduce the crash.
-//
-// Layout note: the closure lands in build/native-deps/vendor/node_modules/,
-// NOT build/native-deps/node_modules/.  electron-builder's file collector
-// hard-drops a `node_modules` directory that sits at the ROOT of an
-// extraResources copy (app-builder-lib/out/util/filter.js: `if (relative ===
-// "node_modules") return false`), but keeps a NESTED one.  Nesting under
-// `vendor/` makes node_modules a subdirectory so it survives packing; the
-// require() fallback in git-review-ops.cjs resolves the matching
-// vendor/node_modules path.
-const JS_DEP_ROOTS = ['simple-git']
-const JS_DEP_STAGE_ROOT = path.join(STAGE_ROOT, 'vendor', 'node_modules')
-
 function rmrf(target) {
  fs.rmSync(target, { recursive: true, force: true })
 }
@@ -173,111 +148,12 @@ function stageOne(spec) {
  console.log(`[stage-native-deps] ${path.relative(APP_ROOT, spec.to)}: ${copied} files`)
 }

-// Resolve a package's directory by name, searching the repo-root node_modules
-// first (where workspace dedup hoists everything) and then the requiring
-// package's own node_modules for any non-hoisted nested copy.
-//
-// We deliberately do NOT use require.resolve(`${name}/package.json`): packages
-// with an "exports" map that doesn't list "./package.json" (e.g. simple-git
-// 3.x) make that subpath unresolvable under Node's exports enforcement
-// (ERR_PACKAGE_PATH_NOT_EXPORTED), which fails on CI even though it happened to
-// work locally.  Instead resolve the package's main entry (exports-aware) and
-// walk up to the directory whose package.json's "name" matches.
-function resolvePkgDir(name, fromDir) {
-  const searchPaths = [fromDir, REPO_ROOT, path.join(REPO_ROOT, 'node_modules')]
-  let entry
-  try {
-    entry = require.resolve(name, { paths: searchPaths })
-  } catch {
-    return null
-  }
-  // Walk up from the resolved entry file to the package root: the first
-  // ancestor dir whose package.json declares this package's name.
-  let dir = path.dirname(entry)
-  while (true) {
-    const pjPath = path.join(dir, 'package.json')
-    try {
-      const pj = JSON.parse(fs.readFileSync(pjPath, 'utf8'))
-      if (pj.name === name) {
-        return dir
-      }
-    } catch {
-      // no package.json here (or unreadable) — keep walking up
-    }
-    const parent = path.dirname(dir)
-    if (parent === dir) {
-      return null
-    }
-    dir = parent
-  }
-}
-
-// Walk dependencies + optionalDependencies from each root package and return
-// the set of resolved package directories in the runtime closure.  Keyed by
-// package name so a dep reached via two paths is staged once.
-function resolveJsClosure(roots) {
-  const closure = new Map() // name -> absolute package dir
-  const stack = roots.map(name => ({ name, fromDir: REPO_ROOT }))
-  while (stack.length) {
-    const { name, fromDir } = stack.pop()
-    if (closure.has(name)) continue
-    const dir = resolvePkgDir(name, fromDir)
-    if (!dir) {
-      throw new Error(
-        `stage-native-deps: could not resolve '${name}' for the simple-git ` +
-          `closure.  Run \`npm install\` at the workspace root first.`
-      )
-    }
-    closure.set(name, dir)
-    let pj
-    try {
-      pj = JSON.parse(fs.readFileSync(path.join(dir, 'package.json'), 'utf8'))
-    } catch {
-      continue
-    }
-    const deps = { ...(pj.dependencies || {}), ...(pj.optionalDependencies || {}) }
-    for (const depName of Object.keys(deps)) {
-      stack.push({ name: depName, fromDir: dir })
-    }
-  }
-  return closure
-}
-
-// Stage the resolved JS dependency closure into build/native-deps/vendor/node_modules/
-// so the packaged app (and the nix output) can require() it from
-// process.resourcesPath when the hoisted-root require() isn't reachable.  Each
-// package is copied whole (minus node_modules/ — the closure is flattened so
-// every dep already has its own top-level entry) into a real node_modules
-// layout, which keeps the deps' own internal require()s working unchanged.
-function stageJsClosure(roots) {
-  const closure = resolveJsClosure(roots)
-  rmrf(JS_DEP_STAGE_ROOT)
-  ensureDir(JS_DEP_STAGE_ROOT)
-  let staged = 0
-  for (const [name, fromDir] of closure) {
-    const dest = path.join(JS_DEP_STAGE_ROOT, name)
-    ensureDir(path.dirname(dest))
-    // Copy the package directory but skip any nested node_modules/ — the
-    // closure is flattened, so nested copies would just bloat the bundle.
-    fs.cpSync(fromDir, dest, {
-      recursive: true,
-      filter: src => path.basename(src) !== 'node_modules'
-    })
-    staged += 1
-  }
-  console.log(
-    `[stage-native-deps] vendor/node_modules/: ${staged} package(s) ` +
-      `(${[...closure.keys()].sort().join(', ')})`
-  )
-}
-
 function main() {
  rmrf(STAGE_ROOT)
  ensureDir(STAGE_ROOT)
  for (const spec of NATIVE_DEPS) {
    stageOne(spec)
  }
-  stageJsClosure(JS_DEP_ROOTS)
 }

 main()
--- a/apps/desktop/src/app/agents/index.tsx
+++ b/apps/desktop/src/app/agents/index.tsx
@@ -3,8 +3,8 @@ import { type ReactNode, useEffect, useMemo, useState } from 'react'

 import { useElapsedSeconds } from '@/components/chat/activity-timer'
 import { ActivityTimerText } from '@/components/chat/activity-timer-text'
-import { Codicon } from '@/components/ui/codicon'
 import { FadeText } from '@/components/ui/fade-text'
+import { Codicon } from '@/components/ui/codicon'
 import { GlyphSpinner } from '@/components/ui/glyph-spinner'
 import { type Translations, useI18n } from '@/i18n'
 import { AlertCircle, CheckCircle2 } from '@/lib/icons'
@@ -19,7 +19,7 @@ import {
  type SubagentStreamEntry
 } from '@/store/subagents'

-import { Panel, PanelEmpty, PanelHeader } from '../overlays/panel'
+import { OverlayView } from '../overlays/overlay-view'

 // Mirrors statusGlyph() in tool-fallback.tsx so subagent rows speak the
 // same visual vocabulary as the chat tool blocks.
@@ -86,16 +86,18 @@ export function AgentsView({ onClose }: AgentsViewProps) {
  const tree = useMemo(() => buildSubagentTree(allSubagents(subagentsBySession)), [subagentsBySession])

  return (
-    <Panel closeLabel={t.agents.close} onClose={onClose}>
-      {tree.length === 0 ? (
-        <PanelEmpty description={t.agents.emptyDesc} icon="hubot" title={t.agents.emptyTitle} />
-      ) : (
-        <>
-          <PanelHeader subtitle={t.agents.subtitle} title={t.agents.title} />
-          <SubagentTree tree={tree} />
-        </>
-      )}
-    </Panel>
+    <OverlayView
+      closeLabel={t.agents.close}
+      contentClassName="px-5 pt-5 pb-4 sm:px-6"
+      onClose={onClose}
+      rootClassName="mx-auto max-w-3xl"
+    >
+      <header className="mb-3 shrink-0">
+        <h2 className="text-sm font-semibold text-foreground">{t.agents.title}</h2>
+        <p className="text-xs text-muted-foreground/80">{t.agents.subtitle}</p>
+      </header>
+      <SubagentTree tree={tree} />
+    </OverlayView>
  )
 }

--- a/apps/desktop/src/app/artifacts/index.tsx
+++ b/apps/desktop/src/app/artifacts/index.tsx
@@ -477,20 +477,17 @@ export function ArtifactsView({ setStatusbarItemGroup: _setStatusbarItemGroup, .
    }
  }, [artifacts])

-  const openArtifact = useCallback(
-    async (href: string) => {
-      try {
-        if (window.hermesDesktop?.openExternal) {
-          await window.hermesDesktop.openExternal(href)
-        } else {
-          window.open(href, '_blank', 'noopener,noreferrer')
-        }
-      } catch (err) {
-        notifyError(err, a.openFailed)
+  const openArtifact = useCallback(async (href: string) => {
+    try {
+      if (window.hermesDesktop?.openExternal) {
+        await window.hermesDesktop.openExternal(href)
+      } else {
+        window.open(href, '_blank', 'noopener,noreferrer')
      }
-    },
-    [a]
-  )
+    } catch (err) {
+      notifyError(err, a.openFailed)
+    }
+  }, [a])

  const markImageFailed = useCallback((id: string) => {
    setFailedImageIds(current => {
@@ -842,8 +839,7 @@ const ARTIFACT_COLUMNS: readonly ArtifactColumn[] = [
  {
    Cell: PrimaryCell,
    bodyClassName: 'p-0',
-    header: (filter, a) =>
-      filter === 'link' ? a.colTitleLink : filter === 'file' ? a.colTitleFile : a.colTitleDefault,
+    header: (filter, a) => (filter === 'link' ? a.colTitleLink : filter === 'file' ? a.colTitleFile : a.colTitleDefault),
    id: 'primary',
    width: filter => (filter === 'link' ? 'w-[50%]' : 'w-[35%]')
  },
--- a/apps/desktop/src/app/chat/composer/attachments.test.tsx
+++ b/apps/desktop/src/app/chat/composer/attachments.test.tsx
@@ -2,9 +2,9 @@ import { cleanup, render, screen } from '@testing-library/react'
 import { afterEach, describe, expect, it } from 'vitest'

 import { I18nProvider } from '@/i18n/context'
-import type { ComposerAttachment } from '@/store/composer'

 import { AttachmentList } from './attachments'
+import type { ComposerAttachment } from '@/store/composer'

 function makeAttachment(id: string, label = 'test.pdf'): ComposerAttachment {
  return { id, kind: 'file', label }
@@ -32,10 +32,7 @@ describe('AttachmentList', () => {

  it('renders empty list without error', () => {
    renderWithI18n(<AttachmentList attachments={[]} />)
-
-    const container =
-      screen.getByTestId?.('composer-attachments') ?? document.querySelector('[data-slot="composer-attachments"]')
-
+    const container = screen.getByTestId?.('composer-attachments') ?? document.querySelector('[data-slot="composer-attachments"]')
    expect(container).toBeDefined()
  })

@@ -58,7 +55,10 @@ describe('AttachmentList', () => {
  })

  it('does not crash when attachments array contains null entries', () => {
-    const attachments = [null as unknown as ComposerAttachment, makeAttachment('a', 'valid.txt')]
+    const attachments = [
+      null as unknown as ComposerAttachment,
+      makeAttachment('a', 'valid.txt')
+    ]

    expect(() => {
      renderWithI18n(<AttachmentList attachments={attachments} />)
--- a/apps/desktop/src/app/chat/composer/context-menu.tsx
+++ b/apps/desktop/src/app/chat/composer/context-menu.tsx
@@ -73,11 +73,7 @@ export function ContextMenu({
          <ContextMenuItem disabled={!onPickImages} icon={ImageIcon} onSelect={onPickImages}>
            {c.images}
          </ContextMenuItem>
-          <ContextMenuItem
-            disabled={!onPasteClipboardImage}
-            icon={Clipboard}
-            onSelect={onPasteClipboardImage ? () => void onPasteClipboardImage() : undefined}
-          >
+          <ContextMenuItem disabled={!onPasteClipboardImage} icon={Clipboard} onSelect={onPasteClipboardImage}>
            {c.pasteImage}
          </ContextMenuItem>
          <ContextMenuItem icon={Link} onSelect={onOpenUrlDialog}>
@@ -171,7 +167,7 @@ interface ContextMenuItemProps {
 interface ContextMenuProps {
  onInsertText: (text: string) => void
  onOpenUrlDialog: () => void
-  onPasteClipboardImage?: (opts?: { silent?: boolean }) => Promise<boolean> | void
+  onPasteClipboardImage?: () => void
  onPickFiles?: () => void
  onPickFolders?: () => void
  onPickImages?: () => void
--- a/apps/desktop/src/app/chat/composer/controls.tsx
+++ b/apps/desktop/src/app/chat/composer/controls.tsx
@@ -4,7 +4,7 @@ import { KbdCombo } from '@/components/ui/kbd'
 import { Tip } from '@/components/ui/tooltip'
 import { useI18n } from '@/i18n'
 import { triggerHaptic } from '@/lib/haptics'
-import { AudioLines, Layers3, Loader2, Square, SteeringWheel, Volume2, VolumeX } from '@/lib/icons'
+import { AudioLines, Layers3, Loader2, Square, SteeringWheel } from '@/lib/icons'
 import { formatCombo } from '@/lib/keybinds/combo'
 import { cn } from '@/lib/utils'

@@ -39,7 +39,6 @@ interface ConversationProps {
 }

 export function ComposerControls({
-  autoSpeak,
  busy,
  busyAction,
  canSteer,
@@ -51,10 +50,8 @@ export function ComposerControls({
  state,
  voiceStatus,
  onDictate,
-  onSteer,
-  onToggleAutoSpeak
+  onSteer
 }: {
-  autoSpeak: boolean
  busy: boolean
  busyAction: 'queue' | 'stop'
  canSteer: boolean
@@ -67,7 +64,6 @@ export function ComposerControls({
  voiceStatus: VoiceStatus
  onDictate: () => void
  onSteer: () => void
-  onToggleAutoSpeak: () => void
 }) {
  const { t } = useI18n()
  const c = t.composer
@@ -109,7 +105,6 @@ export function ComposerControls({
      ) : (
        <DictationButton disabled={disabled} onToggle={onDictate} state={state.voice} status={voiceStatus} />
      )}
-      <AutoSpeakButton active={autoSpeak} disabled={disabled} onToggle={onToggleAutoSpeak} />
      {showVoicePrimary ? (
        <Tip label={c.startVoice}>
          <Button
@@ -259,47 +254,6 @@ function ConversationIndicator({
  )
 }

-// Pure-TTS toggle: type normally, but have every assistant reply read aloud —
-// no dictation, no full conversation loop. Filled/accent when on, mirroring the
-// muted-mic pressed state above. Driven by (and persisted to) `voice.auto_tts`.
-function AutoSpeakButton({
-  active,
-  disabled,
-  onToggle
-}: {
-  active: boolean
-  disabled: boolean
-  onToggle: () => void
-}) {
-  const { t } = useI18n()
-  const c = t.composer
-  const label = active ? c.stopSpeakingReplies : c.speakReplies
-
-  return (
-    <Tip label={label}>
-      <Button
-        aria-label={label}
-        aria-pressed={active}
-        className={cn(
-          GHOST_ICON_BTN,
-          'p-0',
-          active && 'bg-primary/10 text-primary hover:bg-primary/15 hover:text-primary'
-        )}
-        disabled={disabled}
-        onClick={() => {
-          triggerHaptic(active ? 'close' : 'open')
-          onToggle()
-        }}
-        size="icon"
-        type="button"
-        variant="ghost"
-      >
-        {active ? <Volume2 size={14} /> : <VolumeX size={14} />}
-      </Button>
-    </Tip>
-  )
-}
-
 function DictationButton({
  disabled,
  state,
--- a/apps/desktop/src/app/chat/composer/enter-submit-dom-race.test.tsx
+++ b/apps/desktop/src/app/chat/composer/enter-submit-dom-race.test.tsx
@@ -59,10 +59,8 @@ function Harness({
    }

    const editor = editorRef.current
-
    if (editor) {
      const domText = composerPlainText(editor)
-
      if (domText !== draftRef.current) {
        draftRef.current = domText
        setDraft(domText)
@@ -129,11 +127,9 @@ function Harness({
 describe('composer Enter submit — live DOM vs stale composer state (#39630)', () => {
  it('sends the just-typed text on Enter even when composer state has not synced', async () => {
    const onSubmit = vi.fn()
-
    const { getByTestId } = render(
      <Harness onCancel={vi.fn()} onDrain={vi.fn()} onQueue={vi.fn()} onSubmit={onSubmit} />
    )
-
    const editor = getByTestId('editor')

    // Fast typing: the DOM has the text but NO input event fired, so `draft`
@@ -150,11 +146,9 @@ describe('composer Enter submit — live DOM vs stale composer state (#39630)',
    const onQueue = vi.fn()
    const onDrain = vi.fn()
    const onCancel = vi.fn()
-
    const { getByTestId } = render(
      <Harness busy onCancel={onCancel} onDrain={onDrain} onQueue={onQueue} onSubmit={vi.fn()} queued={['queued-1']} />
    )
-
    const editor = getByTestId('editor')

    await act(async () => {
@@ -171,11 +165,9 @@ describe('composer Enter submit — live DOM vs stale composer state (#39630)',
    const onCancel = vi.fn()
    const onSubmit = vi.fn()
    const onQueue = vi.fn()
-
    const { getByTestId } = render(
      <Harness busy onCancel={onCancel} onDrain={vi.fn()} onQueue={onQueue} onSubmit={onSubmit} />
    )
-
    const editor = getByTestId('editor')

    await act(async () => {
@@ -191,11 +183,9 @@ describe('composer Enter submit — live DOM vs stale composer state (#39630)',
  it('drains the next queued prompt on Enter when idle with a truly empty editor', async () => {
    const onDrain = vi.fn()
    const onSubmit = vi.fn()
-
    const { getByTestId } = render(
      <Harness onCancel={vi.fn()} onDrain={onDrain} onQueue={vi.fn()} onSubmit={onSubmit} queued={['queued-1']} />
    )
-
    const editor = getByTestId('editor')

    await act(async () => {
@@ -210,18 +200,9 @@ describe('composer Enter submit — live DOM vs stale composer state (#39630)',
  it('keeps reconnect drafts editable but blocks Enter submit until the gateway returns', async () => {
    const onSubmit = vi.fn()
    const onDrain = vi.fn()
-
    const { getByTestId } = render(
-      <Harness
-        disabled
-        onCancel={vi.fn()}
-        onDrain={onDrain}
-        onQueue={vi.fn()}
-        onSubmit={onSubmit}
-        queued={['queued-1']}
-      />
+      <Harness disabled onCancel={vi.fn()} onDrain={onDrain} onQueue={vi.fn()} onSubmit={onSubmit} queued={['queued-1']} />
    )
-
    const editor = getByTestId('editor')

    await act(async () => {
--- a/apps/desktop/src/app/chat/composer/help-hint.tsx
+++ b/apps/desktop/src/app/chat/composer/help-hint.tsx
@@ -33,7 +33,7 @@ export function HelpHint() {

      <Section title={c.hotkeys}>
        {COMPOSER_HOTKEY_ROWS.map(row => (
-          <HotkeyRow combos={[...row.combos]} description={c.hotkeyDescs[row.id] ?? ''} key={row.id} />
+          <HotkeyRow description={c.hotkeyDescs[row.id] ?? ''} combos={[...row.combos]} key={row.id} />
        ))}
      </Section>

--- a/Show More
+++ b/Show More